Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

......@@ -129,7 +129,7 @@ if __name__ == "__main__":
if line == '\n':
oFile.write(line)
else:
line = line.strip('\n')
line = line.strip('\r\n')
listLine1 = line.split('\t')
if len(listLine1) < 3:
continue
......@@ -143,18 +143,19 @@ if __name__ == "__main__":
if word.find('-') > -1:
found = False
repetitions = word.count('-')
print("repetitions: {}".format(repetitions))
#print("repetitions: {}".format(repetitions))
wordOrig = word
for i in range(0, repetitions):
wordOrig = wordOrig.replace('-', ' ', 1)
print("Word: {}".format(wordOrig))
#print("Word: {}".format(wordOrig))
if wordOrig in hashTermsOrig[termTag]:
print("WordOrig: {}".format(wordOrig))
#print("WordOrig: {}".format(wordOrig))
found = True
line = ''
# Emit one output row per hyphen-separated piece of the term, pairing each
# word piece with its lemma piece. Rows are joined with '\n' so the result
# carries no trailing newline, like the single-row `line` built in the
# not-found branch. The previous `line.rstrip('\r\n')` discarded its return
# value (str methods never modify in place), leaving a stray '\n' at the end.
# NOTE(review): assumes the code that later writes `line` appends the
# newline itself, as the not-found branch implies - confirm.
line = '\n'.join(
    w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag'
    for w, l in zip(word.split('-'), lemma.split('-'))
)
print("Line: {}".format(line))
if not found:
line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
else:
......