Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

......@@ -5,6 +5,7 @@ import os
import sys
from time import time
from nltk.corpus import words
import re
__author__ = 'CMendezC'
......@@ -114,6 +115,7 @@ if __name__ == "__main__":
filesPreprocessed = 0
t0 = time()
reHyphen = re.compile('-')
print("Biological term tagging files...")
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
......@@ -139,15 +141,18 @@ if __name__ == "__main__":
for termTag in hashTerms:
if word in hashTerms[termTag]:
if word.find('-') > -1:
wordOrig = word.replace('-', ' ')
#print("Word: {}".format(word))
if wordOrig in hashTermsOrig[termTag]:
print("WordOrig: {}".format(wordOrig))
line = ''
for w, l in zip(word.split('-'), lemma.split('-')):
line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
line.rstrip('\n')
else:
found = False
for i in range(word.count('-')):
wordOrig = word.replace('-', ' ', 1)
#print("Word: {}".format(word))
if wordOrig in hashTermsOrig[termTag]:
print("WordOrig: {}".format(wordOrig))
found = True
line = ''
for w, l in zip(word.split('-'), lemma.split('-')):
line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
line.rstrip('\n')
if not found:
line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
else:
line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
......