Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

...@@ -129,7 +129,7 @@ if __name__ == "__main__": ...@@ -129,7 +129,7 @@ if __name__ == "__main__":
129 if line == '\n': 129 if line == '\n':
130 oFile.write(line) 130 oFile.write(line)
131 else: 131 else:
132 - line = line.strip('\n') 132 + line = line.strip('\r\n')
133 listLine1 = line.split('\t') 133 listLine1 = line.split('\t')
134 if len(listLine1) < 3: 134 if len(listLine1) < 3:
135 continue 135 continue
...@@ -143,18 +143,19 @@ if __name__ == "__main__": ...@@ -143,18 +143,19 @@ if __name__ == "__main__":
143 if word.find('-') > -1: 143 if word.find('-') > -1:
144 found = False 144 found = False
145 repetitions = word.count('-') 145 repetitions = word.count('-')
146 - print("repetitions: {}".format(repetitions)) 146 + #print("repetitions: {}".format(repetitions))
147 wordOrig = word 147 wordOrig = word
148 for i in range(0, repetitions): 148 for i in range(0, repetitions):
149 wordOrig = wordOrig.replace('-', ' ', 1) 149 wordOrig = wordOrig.replace('-', ' ', 1)
150 - print("Word: {}".format(wordOrig)) 150 + #print("Word: {}".format(wordOrig))
151 if wordOrig in hashTermsOrig[termTag]: 151 if wordOrig in hashTermsOrig[termTag]:
152 - print("WordOrig: {}".format(wordOrig)) 152 + #print("WordOrig: {}".format(wordOrig))
153 found = True 153 found = True
154 line = '' 154 line = ''
155 for w, l in zip(word.split('-'), lemma.split('-')): 155 for w, l in zip(word.split('-'), lemma.split('-')):
156 line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n' 156 line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
157 line.rstrip('\r\n') 157 line.rstrip('\r\n')
158 + print("Line: {}".format(line))
158 if not found: 159 if not found:
159 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag' 160 line = listLine1[0] + '\t' + listLine1[1] + '\t' + listLine2[0] + ' ' + termTag + ' TermTag'
160 else: 161 else:
......