Carlos-Francisco Méndez-Cruz

New terminological tagging for CRFs

@@ -104,6 +104,7 @@ if __name__ == "__main__":
 hashTerms[key].append(lineHyp.capitalize())
 hashTermsOrig[key].append(line.capitalize())
 print(' Terms read {} size: {}'.format(key, len(hashTerms[key])))
+print(' Terms read {} size: {}'.format(key, len(hashTermsOrig[key])))
 
 #regularWords = words.words('en')
 print()
@@ -128,7 +129,6 @@ if __name__ == "__main__":
 if len(listLine1) < 3:
 continue
 word = listLine1[0]
-print("Word: {}".format(word))
 pos = listLine1[1]
 listLine2 = listLine1[2].split(' ')
 lemma = listLine2[0]
@@ -136,9 +136,9 @@ if __name__ == "__main__":
 for termTag in hashTerms:
 if word in hashTerms[termTag]:
 wordOrig = word.replace('-', ' ')
-print("Word: {}".format(word))
-print("WordOrig: {}".format(wordOrig))
+#print("Word: {}".format(word))
 if wordOrig in hashTermsOrig[termTag]:
+print("WordOrig: {}".format(wordOrig))
 line = ''
 for w, l in zip(word.split('-'), lemma.split('-')):
 line += w + '\t' + listLine1[1] + '\t' + l + ' ' + termTag + ' TermTag' + '\n'
......