Estefani Gaytan Nunez

upload

{"key1": "keyinfo", "key2": "keyinfo2"}
\ No newline at end of file
@@ -25,6 +25,8 @@ from nltk.corpus import stopwords
import training_validation_v14 as training
import json
#-------------------------------------------------------------------------------
# Objective
# Tag the transformed input file with a CRF model, using sklearn-crfsuite.
@@ -61,10 +63,7 @@ import training_validation_v14 as training
# --variant 13
# Examples
#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --outputFileIII annot-input_bg_outputIII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/annot-input_bg_report_v4.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/output_tagging_report_v5.txt
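# A minimal sketch of the required arguments (placeholder paths and names, not from an actual run):
#python3 predict-annot/bin/tagging/tagging.py --inputPath <input-dir> --outputPath <output-dir> --outputFileI outI.txt --outputFileII outII --outputFileIII outIII --modelPath <models-dir> --modelName <model-name> --infoPath <mapping-dir> --infoFile <GSE-GSM-index>.txt --variant 13 --S1 --S4 > tagging_report.txt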
__author__ = 'egaytan'
##########################################
@@ -84,12 +83,10 @@ if __name__ == "__main__":
parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)
(options, args) = parser.parse_args()
if len(args) > 0:
@@ -109,39 +106,29 @@ if __name__ == "__main__":
print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
print("--infoFile GSE-GSM index file : " + str(options.idx ))
print("--variant Run variant : " + str(options.variant ))
print("--S1 General features : " + str(options.S1 ))
print("--S2 Inner/Complete word features : " + str(options.S2 ))
print("--S1 Inner word features set : " + str(options.S1 ))
print("--S2 Complete word features : " + str(options.S2 ))
print("--S3 Extended context features : " + str(options.S3 ))
print("--S4 Semantic features : " + str(options.S4 ))
print("--filteringStopWords Filtering stop words : " + str(options.filterStopWords ))
print("--filterSymbols Filtering punctuation marks : " + str(options.filterSymbols ))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
#print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
########################################### PROCESSING ##########################################
print('-------------------------------- PROCESSING --------------------------------')
# Read index mapping GSE file information
idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
########################################### Read CRF model ##########################################
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
########################################### Reading sentences ##########################################
print('Processing corpus...')
t0 = time()
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
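# For reference, Output I wraps every predicted mention in XML-like tags built from
# these labels; an illustrative (made-up) tagged sentence would look like:
#   <Med> LB medium </Med> supplemented with glucose at <Temp> 37 C </Temp>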
#####################################################################################
########################################### Preprocessing ###########################################
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
@@ -149,74 +136,60 @@ if __name__ == "__main__":
print("Preprocessing file..." + str(file))
sentencesInputData = []
sentencesOutputDataI = []
# Preprocessing input sentences
with open(os.path.join(options.inputPath, file), "r") as iFile:
lines = iFile.readlines()
sentencesInputData = [ line.strip('\n').split() for line in lines ]
# Save input sentences
X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
print("Sentences input data: " + str(len(sentencesInputData)))
########################################### Predicting tags ###########################################
t1 = time()
print("Predicting tags with model...")
y_pred = crf.predict(X_input)
print("Prediction done in: %fs" % (time() - t1))
print("Prediction done in: %fs" % (time() - t1))
########################################### Tagging with CRF model ###########################################
print("Tagging file...")
lidx = 0
for line, tagLine in zip(lines, y_pred):
# unique tags
Ltags = set(labels).intersection(set(tagLine))
# Skip untagged sentence
if Ltags == {'O'}: continue
line = line.strip('\n')
# starting empty sentence
outputLine = ''
# tag behind
tb = 'O'
# per sentence word count
i = 0
# Exception for one word sentences
if len(tagLine) == 1:
if tagLine[0] in labels:
# add start tagging signature
start = '<' + tagLine[0] + '> '
# add end tagging signature
end = '</' + tagLine[0] + '>'
word = line.split('|')[0] + ' '
# save output tagged sentence
outputLine = start + word + end
else:
outputLine = line.split(' ')[0]
# Saving Sentence Output I
sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
# Increase sentence counter
lidx += 1
# Continue with the next sentence
continue
# Tagging sentences
for word, tag in zip(line.split(' '), tagLine):
# start tagging
if tag in labels and tb != tag:
outputLine += '<' + tag + '> '
outputLine += word.split('|')[0] + ' '
tb = tag
i += 1
continue
# end tagging
@@ -224,32 +197,24 @@ if __name__ == "__main__":
if i+1 == len(tagLine):
# end sentence
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '> '
tb = 'O'
i += 1
continue
elif tag != tagLine[i+1]:
# end tag
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '> '
tb = 'O'
i += 1
continue
# word tagged
outputLine += word.split('|')[0] + ' '
i += 1
# Saving Sentence Output I
sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
lidx += 1
print("\n".join(sentencesOutputDataI[1:3]))
########################################### Save Output I ##########################################
print("Saving Ouput I...")
with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
@@ -259,19 +224,29 @@ if __name__ == "__main__":
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
oFileI.write(oline + '\n')
########################################### Save Output II ##########################################
print("Saving Ouput II...")
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
for line in sentencesOutputDataI:
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
lline = oline.split('\t')[0:-2] + [ttex, tag]
nline = '\t'.join(lline)
oFileII.write(nline + '\n')
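# Illustrative (made-up) example of what the regex above extracts from a tagged line:
#   oline = 'GSE123\tGSM456\t<Med> LB medium </Med> grown to <OD> OD600 of 0.3 </OD>\tMed, OD'
#   re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline)
#   -> [(' LB medium ', 'Med'), (' OD600 of 0.3 ', 'OD')]
# so Output II gets one row per extracted mention: index fields + mention text + label.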
########################################### Save Output III ##########################################
print("Saving Ouput III...")
with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
for line, tagLine in zip(lines, y_pred):
oline = [ w.split('|')[0].replace('LDR','(').replace('RDR',')') + '|' + tag for w, tag in zip(line.split(' '), tagLine)]
oFileIII.write(' '.join(oline) + '\n')
########################################### Save Probs ##########################################
y_probs = crf.predict_marginals(X_input)
# from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
json.dump(y_probs, fp)
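# Reading the marginals back (sketch): predict_marginals returns one dict of
# label -> probability per token, so y_probs is a list (sentences) of lists (tokens)
# of dicts and round-trips through JSON:
#   with open(os.path.join(options.outputPath, 'crf_probs.json')) as fp:
#       y_probs = json.load(fp)
#   y_probs[0][0]  # e.g. {'O': 0.98, 'Med': 0.01, ...} (illustrative values)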
print("Processing corpus done in: %fs" % (time() - t0))
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v5
--outputFileIII Output tagged file III : annot-input_bg_outputIII_v5
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 10
--S1 Inner word features set : False
--S2 Complete word features : True
--S3 Extended context features : False
--S4 Semantic features : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009485s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 27.326342s
Tagging file...
Saving Output I...
Saving Output II...
Saving Output III...
Processing corpus done in: 247.353067s