Showing 1 changed file with 7 additions and 2 deletions
... | @@ -5,6 +5,7 @@ import os | ... | @@ -5,6 +5,7 @@ import os |
5 | import sys | 5 | import sys |
6 | from time import time | 6 | from time import time |
7 | from nltk.corpus import words | 7 | from nltk.corpus import words |
8 | +import re | ||
8 | 9 | ||
9 | __author__ = 'CMendezC' | 10 | __author__ = 'CMendezC' |
10 | 11 | ||
... | @@ -114,6 +115,7 @@ if __name__ == "__main__": | ... | @@ -114,6 +115,7 @@ if __name__ == "__main__": |
114 | 115 | ||
115 | filesPreprocessed = 0 | 116 | filesPreprocessed = 0 |
116 | t0 = time() | 117 | t0 = time() |
118 | + reHyphen = re.compile('-') | ||
117 | print("Biological term tagging files...") | 119 | print("Biological term tagging files...") |
118 | # Walk directory to read files | 120 | # Walk directory to read files |
119 | for path, dirs, files in os.walk(options.inputPath): | 121 | for path, dirs, files in os.walk(options.inputPath): |
... | @@ -139,15 +141,18 @@ if __name__ == "__main__": | ... | @@ -139,15 +141,18 @@ if __name__ == "__main__": |
139 | for termTag in hashTerms: | 141 | for termTag in hashTerms: |
140 | if word in hashTerms[termTag]: | 142 | if word in hashTerms[termTag]: |
141 | if word.find('-') > -1: | 143 | if word.find('-') > -1: |
142 | - wordOrig = word.replace('-', ' ') | 144 | + found = False |
145 | + for i in range(word.count('-')): | ||
146 | + wordOrig = word.replace('-', ' ', 1) | ||
143 | #print("Word: {}".format(word)) | 147 | #print("Word: {}".format(word)) |
144 | if wordOrig in hashTermsOrig[termTag]: | 148 | if wordOrig in hashTermsOrig[termTag]: |
145 | print("WordOrig: {}".format(wordOrig)) | 149 | print("WordOrig: {}".format(wordOrig)) |
150 | + found = True | ||
146 | line = '' | 151 | line = '' |
147 | for w, l in zip(word.split('-'), lemma.split('-')): | 152 | for w, l in zip(word.split('-'), lemma.split('-')): |
148 | line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' | 153 | line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' |
149 | line.rstrip('\n') | 154 | line.rstrip('\n') |
150 | - else: | 155 | + if not found: |
151 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | 156 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' |
152 | else: | 157 | else: |
153 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | 158 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | ... | ... |
-
Please register or login to post a comment