Estefani Gaytan Nunez

upload

...@@ -138,7 +138,8 @@ if __name__ == "__main__": ...@@ -138,7 +138,8 @@ if __name__ == "__main__":
138 sentencesOutputDataI = [] 138 sentencesOutputDataI = []
139 # Preprocessing input sentences 139 # Preprocessing input sentences
140 with open(os.path.join(options.inputPath, file), "r") as iFile: 140 with open(os.path.join(options.inputPath, file), "r") as iFile:
141 - sentencesInputData = [ line.strip('\n').split() for line in iFile] 141 + lines = iFile.readlines()
142 + sentencesInputData = [ line.strip('\n').split() for line in lines]
142 # Save input sentences 143 # Save input sentences
143 X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData] 144 X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
144 print("Sentences input data: " + str(len(sentencesInputData))) 145 print("Sentences input data: " + str(len(sentencesInputData)))
...@@ -148,12 +149,13 @@ if __name__ == "__main__": ...@@ -148,12 +149,13 @@ if __name__ == "__main__":
148 print("Predicting tags with model...") 149 print("Predicting tags with model...")
149 y_pred = crf.predict(X_input) 150 y_pred = crf.predict(X_input)
150 151
152 + #print(y_pred)
151 print("Prediction done in: %fs" % (time() - t1)) 153 print("Prediction done in: %fs" % (time() - t1))
152 154
153 ########################################### Tagging with CRF model ########################################### 155 ########################################### Tagging with CRF model ###########################################
154 print("Tagging file...") 156 print("Tagging file...")
155 lidx = 0 157 lidx = 0
156 - for line, tagLine in zip(iFile.readlines(), y_pred): 158 + for line, tagLine in zip(lines, y_pred):
157 # unique tags 159 # unique tags
158 Ltags = set(labels).intersection(set(tagLine)) 160 Ltags = set(labels).intersection(set(tagLine))
159 # Skip untagged sentence 161 # Skip untagged sentence
...@@ -178,6 +180,7 @@ if __name__ == "__main__": ...@@ -178,6 +180,7 @@ if __name__ == "__main__":
178 else: 180 else:
179 outputLine = line.split(' ')[0] 181 outputLine = line.split(' ')[0]
180 # Saving Sentence Ouput I 182 # Saving Sentence Ouput I
183 + print(outputLine)
181 sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags)) 184 sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
182 # Increase sentence counter 185 # Increase sentence counter
183 lidx += 1 186 lidx += 1
...@@ -212,41 +215,43 @@ if __name__ == "__main__": ...@@ -212,41 +215,43 @@ if __name__ == "__main__":
212 outputLine += word.split('|')[0] + ' ' 215 outputLine += word.split('|')[0] + ' '
213 i += 1 216 i += 1
214 # Saving Sentence Ouput I 217 # Saving Sentence Ouput I
218 + print(outputLine)
215 sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ '\t' +', '.join(Ltags)) 219 sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ '\t' +', '.join(Ltags))
216 lidx += 1 220 lidx += 1
221 +
217 print("\n".join(sentencesOutputDataI[1:3])) 222 print("\n".join(sentencesOutputDataI[1:3]))
218 - ########################################### Save Output I ########################################## 223 + ########################################### Save Output I ##########################################
219 - print("Saving Ouput I...") 224 + print("Saving Ouput I...")
220 - with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI: 225 + with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
221 - for line in sentencesOutputDataI: 226 + for line in sentencesOutputDataI:
222 - if re.findall('</', line): 227 + if re.findall('</', line):
223 - #print(line) 228 + #print(line)
224 - oline = line.replace('LDR','(')
225 - oline = oline.replace('RDR',')')
226 - oFileI.write(oline + '\n')
227 -
228 - ########################################### Save Output II ##########################################
229 - print("Saving Ouput II...")
230 - with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
231 - for line in sentencesOutputDataI:
232 oline = line.replace('LDR','(') 229 oline = line.replace('LDR','(')
233 oline = oline.replace('RDR',')') 230 oline = oline.replace('RDR',')')
234 - for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline): 231 + oFileI.write(oline + '\n')
235 - lline = oline.split('\t')[0:-2] + [ttex, tag]
236 - nline = '\t'.join(lline)
237 - oFileII.write(nline + '\n')
238 -
239 - ########################################### Save Output III ##########################################
240 - print("Saving Ouput III...")
241 - with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
242 - for line, tagLine in zip(iFile.readlines(), y_pred):
243 - oline = [ w.split('|')[0].replace('LDR','(').replace('LDR','(')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
244 232
245 - oFileIII.write(' '.join(oline) + '\n') 233 + ########################################### Save Output II ##########################################
246 - 234 + print("Saving Ouput II...")
247 - ########################################### Save Probs ########################################## 235 + with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
248 - y_probs = crf.predict_marginals(X_input) 236 + for line in sentencesOutputDataI:
249 - # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries 237 + oline = line.replace('LDR','(')
250 - with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp: 238 + oline = oline.replace('RDR',')')
251 - json.dump(y_probs, fp) 239 + for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
252 - print("Processing corpus done in: %fs" % (time() - t0)) 240 + lline = oline.split('\t')[0:-2] + [ttex, tag]
241 + nline = '\t'.join(lline)
242 + oFileII.write(nline + '\n')
243 +
244 + ########################################### Save Output III ##########################################
245 + print("Saving Ouput III...")
246 + with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
247 + for line, tagLine in zip(lines, y_pred):
248 + oline = [ w.split('|')[0].replace('LDR','(').replace('LDR','(')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
249 +
250 + oFileIII.write(' '.join(oline) + '\n')
251 +
252 + ########################################### Save Probs ##########################################
253 + y_probs = crf.predict_marginals(X_input)
254 + # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
255 + with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
256 + json.dump(y_probs, fp)
257 + print("Pssing corpus done in: %fs" % (time() - t0))
......
This diff could not be displayed because it is too large.