Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

...@@ -5,6 +5,7 @@ import os ...@@ -5,6 +5,7 @@ import os
5 import sys 5 import sys
6 from time import time 6 from time import time
7 from nltk.corpus import words 7 from nltk.corpus import words
8 +import re
8 9
9 __author__ = 'CMendezC' 10 __author__ = 'CMendezC'
10 11
...@@ -114,6 +115,7 @@ if __name__ == "__main__": ...@@ -114,6 +115,7 @@ if __name__ == "__main__":
114 115
115 filesPreprocessed = 0 116 filesPreprocessed = 0
116 t0 = time() 117 t0 = time()
118 + reHyphen = re.compile('-')
117 print("Biological term tagging files...") 119 print("Biological term tagging files...")
118 # Walk directory to read files 120 # Walk directory to read files
119 for path, dirs, files in os.walk(options.inputPath): 121 for path, dirs, files in os.walk(options.inputPath):
...@@ -139,15 +141,18 @@ if __name__ == "__main__": ...@@ -139,15 +141,18 @@ if __name__ == "__main__":
139 for termTag in hashTerms: 141 for termTag in hashTerms:
140 if word in hashTerms[termTag]: 142 if word in hashTerms[termTag]:
141 if word.find('-') > -1: 143 if word.find('-') > -1:
142 - wordOrig = word.replace('-', ' ') 144 + found = False
145 + for i in range(word.count('-')):
146 + wordOrig = word.replace('-', ' ', 1)
143 #print("Word: {}".format(word)) 147 #print("Word: {}".format(word))
144 if wordOrig in hashTermsOrig[termTag]: 148 if wordOrig in hashTermsOrig[termTag]:
145 print("WordOrig: {}".format(wordOrig)) 149 print("WordOrig: {}".format(wordOrig))
150 + found = True
146 line = '' 151 line = ''
147 for w, l in zip(word.split('-'), lemma.split('-')): 152 for w, l in zip(word.split('-'), lemma.split('-')):
148 line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' 153 line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
149 line.rstrip('\n') 154 line.rstrip('\n')
150 - else: 155 + if not found:
151 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' 156 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
152 else: 157 else:
153 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' 158 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
......