Showing 1 changed file with 14 additions and 9 deletions
... | @@ -5,6 +5,7 @@ import os | ... | @@ -5,6 +5,7 @@ import os |
5 | import sys | 5 | import sys |
6 | from time import time | 6 | from time import time |
7 | from nltk.corpus import words | 7 | from nltk.corpus import words |
8 | +import re | ||
8 | 9 | ||
9 | __author__ = 'CMendezC' | 10 | __author__ = 'CMendezC' |
10 | 11 | ||
... | @@ -114,6 +115,7 @@ if __name__ == "__main__": | ... | @@ -114,6 +115,7 @@ if __name__ == "__main__": |
114 | 115 | ||
115 | filesPreprocessed = 0 | 116 | filesPreprocessed = 0 |
116 | t0 = time() | 117 | t0 = time() |
118 | + reHyphen = re.compile('-') | ||
117 | print("Biological term tagging files...") | 119 | print("Biological term tagging files...") |
118 | # Walk directory to read files | 120 | # Walk directory to read files |
119 | for path, dirs, files in os.walk(options.inputPath): | 121 | for path, dirs, files in os.walk(options.inputPath): |
... | @@ -139,15 +141,18 @@ if __name__ == "__main__": | ... | @@ -139,15 +141,18 @@ if __name__ == "__main__": |
139 | for termTag in hashTerms: | 141 | for termTag in hashTerms: |
140 | if word in hashTerms[termTag]: | 142 | if word in hashTerms[termTag]: |
141 | if word.find('-') > -1: | 143 | if word.find('-') > -1: |
142 | - wordOrig = word.replace('-', ' ') | 144 | + found = False |
143 | - #print("Word: {}".format(word)) | 145 | + for i in range(word.count('-')): |
144 | - if wordOrig in hashTermsOrig[termTag]: | 146 | + wordOrig = word.replace('-', ' ', 1) |
145 | - print("WordOrig: {}".format(wordOrig)) | 147 | + #print("Word: {}".format(word)) |
146 | - line = '' | 148 | + if wordOrig in hashTermsOrig[termTag]: |
147 | - for w, l in zip(word.split('-'), lemma.split('-')): | 149 | + print("WordOrig: {}".format(wordOrig)) |
148 | - line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' | 150 | + found = True |
149 | - line.rstrip('\n') | 151 | + line = '' |
150 | - else: | 152 | + for w, l in zip(word.split('-'), lemma.split('-')): |
153 | + line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' | ||
154 | + line.rstrip('\n') | ||
155 | + if not found: | ||
151 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | 156 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' |
152 | else: | 157 | else: |
153 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | 158 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | ... | ... |
-
Please register or login to post a comment