Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
nlp-preprocessing-pipeline
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-03-07 19:58:25 -0600
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
c0dad336fef2cce9fba0dd77e927b33a5735fd4d
c0dad336
1 parent
f972fd01
New terminological tagging for CRFs
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
9 deletions
biologicalTermTagging-CRF.py
biologicalTermTagging-CRF.py
View file @
c0dad33
...
...
@@ -5,6 +5,7 @@ import os
import
sys
from
time
import
time
from
nltk.corpus
import
words
import
re
__author__
=
'CMendezC'
...
...
@@ -114,6 +115,7 @@ if __name__ == "__main__":
filesPreprocessed
=
0
t0
=
time
()
reHyphen
=
re
.
compile
(
'-'
)
print
(
"Biological term tagging files..."
)
# Walk directory to read files
for
path
,
dirs
,
files
in
os
.
walk
(
options
.
inputPath
):
...
...
@@ -139,15 +141,18 @@ if __name__ == "__main__":
for
termTag
in
hashTerms
:
if
word
in
hashTerms
[
termTag
]:
if
word
.
find
(
'-'
)
>
-
1
:
wordOrig
=
word
.
replace
(
'-'
,
' '
)
#print("Word: {}".format(word))
if
wordOrig
in
hashTermsOrig
[
termTag
]:
print
(
"WordOrig: {}"
.
format
(
wordOrig
))
line
=
''
for
w
,
l
in
zip
(
word
.
split
(
'-'
),
lemma
.
split
(
'-'
)):
line
+=
w
+
'
\t
'
+
listLine1
[
1
]
+
'
\t
'
+
l
+
' '
+
termTag
+
' TermTag'
+
'
\n
'
line
.
rstrip
(
'
\n
'
)
else
:
found
=
False
for
i
in
range
(
word
.
count
(
'-'
)):
wordOrig
=
word
.
replace
(
'-'
,
' '
,
1
)
#print("Word: {}".format(word))
if
wordOrig
in
hashTermsOrig
[
termTag
]:
print
(
"WordOrig: {}"
.
format
(
wordOrig
))
found
=
True
line
=
''
for
w
,
l
in
zip
(
word
.
split
(
'-'
),
lemma
.
split
(
'-'
)):
line
+=
w
+
'
\t
'
+
listLine1
[
1
]
+
'
\t
'
+
l
+
' '
+
termTag
+
' TermTag'
+
'
\n
'
line
.
rstrip
(
'
\n
'
)
if
not
found
:
line
=
listLine1
[
0
]
+
'
\t
'
+
listLine1
[
1
]
+
'
\t
'
+
listLine2
[
0
]
+
' '
+
termTag
+
' TermTag'
else
:
line
=
listLine1
[
0
]
+
'
\t
'
+
listLine1
[
1
]
+
'
\t
'
+
listLine2
[
0
]
+
' '
+
termTag
+
' TermTag'
...
...
Please
register
or
login
to post a comment