Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

......@@ -104,6 +104,7 @@ if __name__ == "__main__":
hashTerms[key].append(lineHyp.capitalize())
hashTermsOrig[key].append(line.capitalize())
print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
print(' Terms read {} size: {}'.format(key, len(hashTermsOrig[key])))
#regularWords = words.words('en')
print()
......@@ -128,7 +129,6 @@ if __name__ == "__main__":
if len(listLine1) < 3:
continue
word = listLine1[0]
print("Word: {}".format(word))
pos = listLine1[1]
listLine2 = listLine1[2].split(' ')
lemma = listLine2[0]
......@@ -136,9 +136,9 @@ if __name__ == "__main__":
for termTag in hashTerms:
if word in hashTerms[termTag]:
wordOrig = word.replace('-', ' ')
print("Word: {}".format(word))
print("WordOrig: {}".format(wordOrig))
#print("Word: {}".format(word))
if wordOrig in hashTermsOrig[termTag]:
print("WordOrig: {}".format(wordOrig))
line = ''
for w, l in zip(word.split('-'), lemma.split('-')):
line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
......