New terminological tagging for CRFs

Carlos-Francisco Méndez-Cruz
Commit c0dad336fef2cce9fba0dd77e927b33a5735fd4d (short: c0dad336) · 1 parent: f972fd01
Showing 1 changed file with 7 additions and 2 deletions
biologicalTermTagging-CRF.py
--- a/biologicalTermTagging-CRF.py
View file @c0dad33
+++ b/biologicalTermTagging-CRF.py
View file @c0dad33
@@ -5,6 +5,7 @@ import os
 import sys
 from time import time
 from nltk.corpus import words
+import re
 __author__ = 'CMendezC'
@@ -114,6 +115,7 @@ if __name__ == "__main__":
     filesPreprocessed = 0
     t0 = time()
+    reHyphen = re.compile('-')
     print("Biological term tagging files...")
     # Walk directory to read files
     for path, dirs, files in os.walk(options.inputPath):
@@ -139,15 +141,18 @@ if __name__ == "__main__":
                                 for termTag in hashTerms:
                                     if word in hashTerms[termTag]:
                                         if word.find('-') > -1:
-                                            wordOrig = word.replace('-', ' ')
+                                            found = False
+                                            for i in range(word.count('-')):
+                                                wordOrig = word.replace('-', ' ', 1)
                                                 #print("Word: {}".format(word))
                                                 if wordOrig in hashTermsOrig[termTag]:
                                                     print("WordOrig: {}".format(wordOrig))
+                                                    found = True
                                                     line = ''
                                                     for w, l in zip(word.split('-'), lemma.split('-')):
                                                         line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
                                                     line.rstrip('\n')
-                                            else:
+                                            if not found:
                                                 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
                                         else:
                                             line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'