Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

...@@ -5,6 +5,7 @@ import os ...@@ -5,6 +5,7 @@ import os
5 import sys 5 import sys
6 from time import time 6 from time import time
7 from nltk.corpus import words 7 from nltk.corpus import words
8 +import re
8 9
9 __author__ = 'CMendezC' 10 __author__ = 'CMendezC'
10 11
...@@ -114,6 +115,7 @@ if __name__ == "__main__": ...@@ -114,6 +115,7 @@ if __name__ == "__main__":
114 115
115 filesPreprocessed = 0 116 filesPreprocessed = 0
116 t0 = time() 117 t0 = time()
118 + reHyphen = re.compile('-')
117 print("Biological term tagging files...") 119 print("Biological term tagging files...")
118 # Walk directory to read files 120 # Walk directory to read files
119 for path, dirs, files in os.walk(options.inputPath): 121 for path, dirs, files in os.walk(options.inputPath):
...@@ -139,15 +141,18 @@ if __name__ == "__main__": ...@@ -139,15 +141,18 @@ if __name__ == "__main__":
139 for termTag in hashTerms: 141 for termTag in hashTerms:
140 if word in hashTerms[termTag]: 142 if word in hashTerms[termTag]:
141 if word.find('-') > -1: 143 if word.find('-') > -1:
142 - wordOrig = word.replace('-', ' ') 144 + found = False
143 - #print("Word: {}".format(word)) 145 + for i in range(word.count('-')):
144 - if wordOrig in hashTermsOrig[termTag]: 146 + wordOrig = word.replace('-', ' ', 1)
145 - print("WordOrig: {}".format(wordOrig)) 147 + #print("Word: {}".format(word))
146 - line = '' 148 + if wordOrig in hashTermsOrig[termTag]:
147 - for w, l in zip(word.split('-'), lemma.split('-')): 149 + print("WordOrig: {}".format(wordOrig))
148 - line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' 150 + found = True
149 - line.rstrip('\n') 151 + line = ''
150 - else: 152 + for w, l in zip(word.split('-'), lemma.split('-')):
153 + line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
154 + line.rstrip('\n')
155 + if not found:
151 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' 156 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
152 else: 157 else:
153 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' 158 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
......