New terminological tagging for CRFs

Carlos-Francisco Méndez-Cruz
Commit c0dad336fef2cce9fba0dd77e927b33a5735fd4d (c0dad336), 1 parent: f972fd01
Showing 1 changed file with 14 additions and 9 deletions
biologicalTermTagging-CRF.py
--- a/biologicalTermTagging-CRF.py
+++ b/biologicalTermTagging-CRF.py
@@ -5,6 +5,7 @@ import os
 import sys
 from time import time
 from nltk.corpus import words
+ import re
 
 __author__ = 'CMendezC'
 
@@ -114,6 +115,7 @@ if __name__ == "__main__":
 
     filesPreprocessed = 0
     t0 = time()
+     reHyphen = re.compile('-')
     print("Biological term tagging files...")
     # Walk directory to read files
     for path, dirs, files in os.walk(options.inputPath):
@@ -139,15 +141,18 @@ if __name__ == "__main__":
                                 for termTag in hashTerms:
                                     if word in hashTerms[termTag]:
                                         if word.find('-') > -1:
-                                             wordOrig = word.replace('-', ' ')
-                                             #print("Word: {}".format(word))
-                                             if wordOrig in hashTermsOrig[termTag]:
-                                                 print("WordOrig: {}".format(wordOrig))
-                                                 line = ''
-                                                 for w, l in zip(word.split('-'), lemma.split('-')):
-                                                     line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
-                                                 line.rstrip('\n')
-                                             else:
+                                             found = False
+                                             for i in range(word.count('-')):
+                                                 wordOrig = word.replace('-', ' ', 1)
+                                                 #print("Word: {}".format(word))
+                                                 if wordOrig in hashTermsOrig[termTag]:
+                                                     print("WordOrig: {}".format(wordOrig))
+                                                     found = True
+                                                     line = ''
+                                                     for w, l in zip(word.split('-'), lemma.split('-')):
+                                                         line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
+                                                     line.rstrip('\n')
+                                             if not found:
                                                 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                         else:
                                             line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'