Showing 1 changed file with 14 additions and 9 deletions
| ... | @@ -5,6 +5,7 @@ import os | ... | @@ -5,6 +5,7 @@ import os |
| 5 | import sys | 5 | import sys |
| 6 | from time import time | 6 | from time import time |
| 7 | from nltk.corpus import words | 7 | from nltk.corpus import words |
| 8 | +import re | ||
| 8 | 9 | ||
| 9 | __author__ = 'CMendezC' | 10 | __author__ = 'CMendezC' |
| 10 | 11 | ||
| ... | @@ -114,6 +115,7 @@ if __name__ == "__main__": | ... | @@ -114,6 +115,7 @@ if __name__ == "__main__": |
| 114 | 115 | ||
| 115 | filesPreprocessed = 0 | 116 | filesPreprocessed = 0 |
| 116 | t0 = time() | 117 | t0 = time() |
| 118 | + reHyphen = re.compile('-') | ||
| 117 | print("Biological term tagging files...") | 119 | print("Biological term tagging files...") |
| 118 | # Walk directory to read files | 120 | # Walk directory to read files |
| 119 | for path, dirs, files in os.walk(options.inputPath): | 121 | for path, dirs, files in os.walk(options.inputPath): |
| ... | @@ -139,15 +141,18 @@ if __name__ == "__main__": | ... | @@ -139,15 +141,18 @@ if __name__ == "__main__": |
| 139 | for termTag in hashTerms: | 141 | for termTag in hashTerms: |
| 140 | if word in hashTerms[termTag]: | 142 | if word in hashTerms[termTag]: |
| 141 | if word.find('-') > -1: | 143 | if word.find('-') > -1: |
| 142 | - wordOrig = word.replace('-', ' ') | 144 | + found = False |
| 143 | - #print("Word: {}".format(word)) | 145 | + for i in range(word.count('-')): |
| 144 | - if wordOrig in hashTermsOrig[termTag]: | 146 | + wordOrig = word.replace('-', ' ', 1) |
| 145 | - print("WordOrig: {}".format(wordOrig)) | 147 | + #print("Word: {}".format(word)) |
| 146 | - line = '' | 148 | + if wordOrig in hashTermsOrig[termTag]: |
| 147 | - for w, l in zip(word.split('-'), lemma.split('-')): | 149 | + print("WordOrig: {}".format(wordOrig)) |
| 148 | - line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' | 150 | + found = True |
| 149 | - line.rstrip('\n') | 151 | + line = '' |
| 150 | - else: | 152 | + for w, l in zip(word.split('-'), lemma.split('-')): |
| 153 | + line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' | ||
| 154 | + line.rstrip('\n') | ||
| 155 | + if not found: | ||
| 151 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | 156 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' |
| 152 | else: | 157 | else: |
| 153 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | 158 | line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' | ... | ... |
-
Please register or login to post a comment