Estefani Gaytan Nunez

upload

......@@ -138,7 +138,8 @@ if __name__ == "__main__":
sentencesOutputDataI = []
# Preprocessing input sentences
with open(os.path.join(options.inputPath, file), "r") as iFile:
sentencesInputData = [ line.strip('\n').split() for line in iFile]
lines = iFile.readlines()
sentencesInputData = [ line.strip('\n').split() for line in lines]
# Save input sentences
X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
print("Sentences input data: " + str(len(sentencesInputData)))
......@@ -148,12 +149,13 @@ if __name__ == "__main__":
print("Predicting tags with model...")
y_pred = crf.predict(X_input)
#print(y_pred)
print("Prediction done in: %fs" % (time() - t1))
########################################### Tagging with CRF model ###########################################
print("Tagging file...")
lidx = 0
for line, tagLine in zip(iFile.readlines(), y_pred):
for line, tagLine in zip(lines, y_pred):
# unique tags
Ltags = set(labels).intersection(set(tagLine))
# Skip untagged sentence
......@@ -178,6 +180,7 @@ if __name__ == "__main__":
else:
outputLine = line.split(' ')[0]
# Saving Sentence Output I
print(outputLine)
sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
# Increase sentence counter
lidx += 1
......@@ -212,41 +215,43 @@ if __name__ == "__main__":
outputLine += word.split('|')[0] + ' '
i += 1
# Saving Sentence Output I
print(outputLine)
sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ '\t' +', '.join(Ltags))
lidx += 1
print("\n".join(sentencesOutputDataI[1:3]))
########################################### Save Output I ##########################################
print("Saving Ouput I...")
with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
for line in sentencesOutputDataI:
if re.findall('</', line):
#print(line)
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
oFileI.write(oline + '\n')
########################################### Save Output II ##########################################
print("Saving Ouput II...")
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
for line in sentencesOutputDataI:
########################################### Save Output I ##########################################
print("Saving Ouput I...")
with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
for line in sentencesOutputDataI:
if re.findall('</', line):
#print(line)
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
lline = oline.split('\t')[0:-2] + [ttex, tag]
nline = '\t'.join(lline)
oFileII.write(nline + '\n')
########################################### Save Output III ##########################################
print("Saving Ouput III...")
with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
for line, tagLine in zip(iFile.readlines(), y_pred):
oline = [ w.split('|')[0].replace('LDR','(').replace('LDR','(')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
oFileI.write(oline + '\n')
oFileIII.write(' '.join(oline) + '\n')
########################################### Save Probs ##########################################
y_probs = crf.predict_marginals(X_input)
# from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
json.dump(y_probs, fp)
print("Processing corpus done in: %fs" % (time() - t0))
########################################### Save Output II ##########################################
print("Saving Ouput II...")
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
for line in sentencesOutputDataI:
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
lline = oline.split('\t')[0:-2] + [ttex, tag]
nline = '\t'.join(lline)
oFileII.write(nline + '\n')
########################################### Save Output III ##########################################
print("Saving Ouput III...")
with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
for line, tagLine in zip(lines, y_pred):
oline = [ w.split('|')[0].replace('LDR','(').replace('LDR','(')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
oFileIII.write(' '.join(oline) + '\n')
########################################### Save Probs ##########################################
y_probs = crf.predict_marginals(X_input)
# from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
json.dump(y_probs, fp)
print("Pssing corpus done in: %fs" % (time() - t0))
......
This diff could not be displayed because it is too large.