Estefani Gaytan Nunez

upload

......@@ -34,7 +34,7 @@ echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sed 's/\\null\\/null/g' | sort | uniq)
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq)
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
wc $output
echo "$cext" | cut -f1-3,5 > $mapping
......
This diff could not be displayed because it is too large.
......@@ -60,8 +60,10 @@ import training_validation_v14 as training
# --infoFile bg_sentences_midx.txt
# --variant 13
#Examples
#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --outputFileII annot-input_bg_outputIII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/annot-input_bg_report_v4.txt
__author__ = 'egaytan'
......@@ -70,12 +72,13 @@ __author__ = 'egaytan'
##########################################
if __name__ == "__main__":
# Defining parameters
########################################### Defining parameters ##########################################
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
......@@ -93,13 +96,14 @@ if __name__ == "__main__":
parser.error("Any parameter given.")
sys.exit(1)
########################################### DISP PARAMETERS ##########################################
print('-------------------------------- PARAMETERS --------------------------------')
print("--inputPath Path of training data set : " + str(options.inputPath ))
print("--outputPath Output path to place output files: " + str(options.outputPath ))
print("--outputFileI Output tagged file I : " + str(options.outFileI ))
print("--outputFileII Output tagged file II : " + str(options.outFileII ))
print("--outputFileII Output tagged file III : " + str(options.outFileIII ))
print("--modelPath Path to read CRF model : " + str(options.modelPath ))
print("--modelName Model name : " + str(options.modelName ))
print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
......@@ -115,25 +119,29 @@ if __name__ == "__main__":
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
#print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
########################################### PROCESSING ##########################################
print('-------------------------------- PROCESSING --------------------------------')
stopwords = [word for word in stopwords.words('english')]
# Read index
# Read index mapping GSE file information
idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
# Read CRF model
########################################### Read CRF model ##########################################
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
# Reading sentences
########################################### Reading sentences ##########################################
print('Processing corpus...')
t0 = time()
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
#####################################################################################
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
......@@ -165,25 +173,24 @@ if __name__ == "__main__":
print("Sentences input data: " + str(len(sentencesInputData)))
# Predicting tags
########################################### Predicting tags ###########################################
t1 = time()
print("Predicting tags with model")
print("Predicting tags with model...")
y_pred = crf.predict(X_input)
print("Prediction done in: %fs" % (time() - t1))
# Tagging with CRF model
print("Tagging file")
########################################### Tagging with CRF model ###########################################
print("Tagging file...")
lidx = 0
for line, tagLine in zip(lines, y_pred):
Ltags = set(labels).intersection(set(tagLine))
outputLine = ''
line = line.strip('\n')
#print("\nLine: " + str(line))
#print ("CRF tagged line: " + str(tagLine))
tb = 'O'
i = 0
########################## one word sentences ##########################
if len(tagLine)==1:
if tagLine[0] in labels:
start = '<' + tagLine[0] + '> '
......@@ -192,9 +199,11 @@ if __name__ == "__main__":
outputLine = start + word + end
else:
outputLine = line.split(' ')[0]
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + ', '.join(Ltags))
########################## Saving Sentence Ouput I ##########################
sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
########################## Saving Sentence Ouput II ##########################
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
lidx += 1
continue
sentence = ''
......@@ -216,6 +225,7 @@ if __name__ == "__main__":
# end sentence
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '/> '
########################## Saving Sentence Ouput II ##########################
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
sb = False
tb = 'O'
......@@ -225,6 +235,7 @@ if __name__ == "__main__":
# start new tag
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '/> '
########################## Saving Sentence Ouput II ##########################
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
sb = False
tb = 'O'
......@@ -235,21 +246,32 @@ if __name__ == "__main__":
i += 1
if sb:
sentence+= word.split('|')[0] + ' '
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ ', '.join(Ltags))
########################## Saving Sentence Ouput I ##########################
sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ '\t' +', '.join(Ltags))
lidx += 1
#print( DF(sentencesOutputDataI) )
#print( '\n'.join(sentencesOutputDataII) )
# Save tags
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFile:
for line in sentencesOutputDataII:
#print(line)
oFile.write(line + '\n')
########################################### Save Output I ##########################################
print("Saving Ouput I...")
with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
for line in sentencesOutputDataI:
if re.findall('</', line):
print(line)
#oFileI.write(line + '\n')
#print(line)
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
oFileI.write(oline + '\n')
########################################### Save Output II ##########################################
print("Saving Ouput II...")
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
for line in sentencesOutputDataII:
#print(line)
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
oFileII.write(oline + '\n')
########################################### Save Output III ##########################################
print("Saving Ouput III...")
with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
for line, tagLine in zip(lines, y_pred):
oline = [ w.split('|')[0].replace('LDR','(').replace('LDR','(')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
oFileIII.write(' '.join(oline) + '\n')
print("Processing corpus done in: %fs" % (time() - t0))
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
......@@ -9328,7 +9328,7 @@ GSE12006 GSM303526 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303527 GPL3154-PMID:18940002
GSE12006 GSM303527 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303527 GPL3154-PMID:18940002 title.1
GSE12006 GSM303527 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 organism_ch1.1
......@@ -9340,7 +9340,7 @@ GSE12006 GSM303527 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303528 GPL3154-PMID:18940002
GSE12006 GSM303528 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303528 GPL3154-PMID:18940002 title.1
GSE12006 GSM303528 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 organism_ch1.1
......@@ -9352,7 +9352,7 @@ GSE12006 GSM303528 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303529 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303529 GPL3154-PMID:18940002
GSE12006 GSM303529 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303529 GPL3154-PMID:18940002 title.1
GSE12006 GSM303529 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303529 GPL3154-PMID:18940002 organism_ch1.1
......
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save reconstructed bg-sentences: annot-input_bg_v3.txt
-------------------------------- PROCESSING --------------------------------
Number of sentences: 14716
==================================END===================================
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save reconstructed bg-sentences: annot-input_bg_v4.txt
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v4
--outputFileII Output tagged file II : annot-input_bg_outputII_v4
--outputFileII Output tagged file III : annot-input_bg_outputIII_v4
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
-------------------------------- PROCESSING --------------------------------
Number of sentences: 90904
==================================END===================================
Reading CRF model...
Reading CRF model done in: 0.009463s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 26.367272s
Tagging file...
Saving Output I...
Saving Output II...
Saving Output III...
Processing corpus done in: 56.584394s
......
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI.txt
--outputFileII Output tagged file II : annot-input_bg_outputII.txt
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.008336s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.688127s
Tagging file
Processing corpus done in: 3.948320s
......@@ -17,10 +17,16 @@
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009804s
Reading CRF model done in: 0.009363s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.811103s
Prediction done in: 1.737334s
Tagging file
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model
Prediction done in: 26.434549s
Tagging file
Processing corpus done in: 58.304885s
......