Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
nlp-preprocessing-pipeline
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-03-07 17:19:23 -0600
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
f742cde91e3643293dfdd703b61159b5bca75a82
f742cde9
1 parent
81a1f15b
New terminological tagging for CRFs
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
166 additions
and
5 deletions
biologicalTermTagging_CRF.py → biologicalTermTagging-CRF.py
nlp-preprocessing-pipeline.sh
transforming-CRF.py
biologicalTermTagging
_
CRF.py
→
biologicalTermTagging
-
CRF.py
View file @
f742cde
File moved
nlp-preprocessing-pipeline.sh
View file @
f742cde
...
...
@@ -6,11 +6,11 @@ TERM_PATH=/export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictiona
POST_PATH
=
/export/space1/users/compu2/bionlp/stanford-postagger-2018-02-27
LEMMA_PATH
=
/export/space1/users/compu2/bionlp/biolemmatizer
PRE
=
TRU
E
PRE
=
FALS
E
echo
" Preprocessing:
$PRE
"
POS
=
TRU
E
POS
=
FALS
E
echo
" POS Tagging:
$POS
"
LEMMA
=
TRU
E
LEMMA
=
FALS
E
echo
" Lemmatization:
$LEMMA
"
TERM
=
TRUE
echo
" Terminological tagging:
$TERM
"
...
...
@@ -42,12 +42,12 @@ if [ "$TERM" = "TRUE" ]; then
echo
"Terminological tagging..."
INPUT_PATH
=
$CORPUS_PATH
/lemma
OUTPUT_PATH
=
$CORPUS_PATH
/term
python3.4 biologicalTermTagging
_
CRF.py --inputPath
$INPUT_PATH
--outputPath
$OUTPUT_PATH
--termPath
$TERM_PATH
--termFiles termFilesTag.json > outputTerm.txt
python3.4 biologicalTermTagging
-
CRF.py --inputPath
$INPUT_PATH
--outputPath
$OUTPUT_PATH
--termPath
$TERM_PATH
--termFiles termFilesTag.json > outputTerm.txt
fi
if
[
"
$TRANS
"
=
"TRUE"
]
;
then
echo
"Transformation..."
INPUT_PATH
=
$CORPUS_PATH
/term
OUTPUT_PATH
=
$CORPUS_PATH
/transformed
python3.4 transforming.py --inputPath
$INPUT_PATH
--outputPath
$OUTPUT_PATH
--minWordsInLine 5 > outputTransformation.txt
python3.4 transforming
-CRF
.py --inputPath
$INPUT_PATH
--outputPath
$OUTPUT_PATH
--minWordsInLine 5 > outputTransformation.txt
fi
...
...
transforming-CRF.py
0 → 100644
View file @
f742cde
# -*- coding: UTF-8 -*-
import
re
from
optparse
import
OptionParser
import
os
import
sys
from
time
import
time
__author__
=
'CMendezC'
# Objective: Transforming BIOLemmatized files:
# 1) Transformed files
# 2) Text files to extract aspects
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to place output files.
# 3) --textPath Path to place output files.
# 4) --minWordsInLine Minimum length sentence in number of words
# 5) --classes Classes to indicate final of sentence when line contains: PMID\tNUMSENT\tSENT\tCLASS
# Output:
# 1) transformed files
# 2) text files
# Execution:
# GntR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012096_GntR\transformed --minWordsInLine 5
# FhlA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011394_FhlA\transformed --minWordsInLine 5
# MarA
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011412_MarA\transformed --minWordsInLine 5
# ArgR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011670_ArgR\transformed --minWordsInLine 5
# CytR
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120012407_CytR\transformed --minWordsInLine 5
# Rob
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\term --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\TF_PMIDs_TXT_ECK120011190_Rob\transformed --minWordsInLine 5
# EXTRACTING REGULATORY INTERACTIONS
# python transforming.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\lemma --outputPath C:\Users\cmendezc\Documents\GENOMICAS\EXTRACTING_REGULATORY_INTERACTIONS\corpus_ecoli\transformed --minWordsInLine 5
def
length
(
listWords
):
regexWord
=
re
.
compile
(
'[a-zA-Z]'
)
words
=
0
chars
=
0
for
word
in
listWords
:
listTemp
=
word
.
split
(
'|'
)
if
regexWord
.
search
(
listTemp
[
1
])
is
not
None
:
words
+=
1
chars
+=
len
(
listTemp
[
0
])
return
words
,
chars
###########################################################
# MAIN PROGRAM #
###########################################################
if
__name__
==
"__main__"
:
# Parameter definition
parser
=
OptionParser
()
parser
.
add_option
(
"-i"
,
"--inputPath"
,
dest
=
"inputPath"
,
help
=
"Path to read input files"
,
metavar
=
"PATH"
)
parser
.
add_option
(
"-o"
,
"--outputPath"
,
dest
=
"outputPath"
,
help
=
"Path to place transformed files"
,
metavar
=
"PATH"
)
parser
.
add_option
(
"--minWordsInLine"
,
type
=
"int"
,
dest
=
"minWordsInLine"
,
default
=
3
,
help
=
"Minimum length sentence in number of words"
,
metavar
=
"NUM"
)
parser
.
add_option
(
"--classes"
,
dest
=
"classes"
,
help
=
"Classes to indicate final of sentence when line contains: PMID-NUMSENT-SENT-CLASS"
,
metavar
=
"CLASS,CLASS"
)
(
options
,
args
)
=
parser
.
parse_args
()
if
len
(
args
)
>
0
:
parser
.
error
(
"None parameters indicated."
)
sys
.
exit
(
1
)
# Printing parameter values
print
(
'-------------------------------- PARAMETERS --------------------------------'
)
print
(
"Path to read input files: "
+
str
(
options
.
inputPath
))
print
(
"Path to place transformed files: "
+
str
(
options
.
outputPath
))
print
(
"Minimum length sentence in number of words: "
+
str
(
options
.
minWordsInLine
))
print
(
"Classes to indicate final of sentence: "
+
str
(
options
.
classes
))
# We realized that POS tags from Biolemmatizer are very specific, therefore we decided to use Standford tags
bioPOST
=
False
filesProcessed
=
0
# minWordsInLine = 3
if
not
options
.
classes
is
None
:
listClasses
=
options
.
classes
.
split
(
','
)
t0
=
time
()
print
(
"Transforming files..."
)
# Walk directory to read files
for
path
,
dirs
,
files
in
os
.
walk
(
options
.
inputPath
):
# For each file in dir
for
file
in
files
:
print
(
" Transforming file..."
+
str
(
file
))
#TrpR NN TrpR NN PennPOS
# , , , , NUPOS
# tryptophan NN tryptophan NN PennPOS
listLine1
=
[]
listLine2
=
[]
text
=
''
lemma
=
''
pos
=
''
textTransformed
=
''
textText
=
''
with
open
(
os
.
path
.
join
(
path
,
file
),
"r"
,
encoding
=
"utf-8"
,
errors
=
"replace"
)
as
iFile
:
# Create output file to write
with
open
(
os
.
path
.
join
(
options
.
outputPath
,
file
.
replace
(
'term.txt'
,
'tra.txt'
)),
"w"
,
encoding
=
"utf-8"
)
as
transformedFile
:
for
line
in
iFile
:
if
line
==
'
\n
'
:
if
options
.
classes
is
None
:
if
length
(
textTransformed
.
split
())[
0
]
>
options
.
minWordsInLine
and
length
(
textTransformed
.
split
())[
1
]
<=
1000
:
transformedFile
.
write
(
textTransformed
+
'
\n
'
)
textTransformed
=
''
textText
=
''
else
:
continue
else
:
line
=
line
.
strip
(
'
\n
'
)
#print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
listLine1
=
line
.
split
(
'
\t
'
)
if
len
(
listLine1
)
!=
3
:
continue
text
=
listLine1
[
0
]
# Replacing an estrange space character
text
=
text
.
replace
(
' '
,
'-'
)
listLine2
=
listLine1
[
2
]
.
split
(
' '
)
lemma
=
listLine2
[
0
]
# Replacing an estrange space character
lemma
=
lemma
.
replace
(
' '
,
'-'
)
if
bioPOST
:
pos
=
listLine2
[
1
]
#print('Line ' + str(line.encode(encoding='UTF-8', errors='replace')))
else
:
pos
=
listLine1
[
1
]
if
listLine2
[
2
]
==
"TermTag"
:
tag
=
listLine2
[
1
]
else
:
tag
=
"O"
textText
=
textText
+
text
+
' '
textTransformed
=
textTransformed
+
text
+
'|'
+
lemma
+
'|'
+
pos
+
'|'
+
tag
+
' '
# RI+GC NN RI+GC NN PennPOS
if
not
options
.
classes
is
None
:
if
text
in
listClasses
:
# if length(textTransformed.split()) > options.minWordsInLine:
if
length
(
textTransformed
.
split
())[
0
]
>
options
.
minWordsInLine
and
length
(
textTransformed
.
split
())[
1
]
<=
1000
:
transformedFile
.
write
(
textTransformed
+
'
\n
'
)
# print(textTransformed)
textTransformed
=
''
textText
=
''
filesProcessed
+=
1
# Imprime archivos procesados
print
()
print
(
"Files processed: "
+
str
(
filesProcessed
))
print
(
"In:
%
fs"
%
(
time
()
-
t0
))
Please
register
or
login
to post a comment