Estefani Gaytan Nunez

upload

# Build a de-duplicated two-column map from every file listed in the index.
# index:   text file with one input file path per line.
# mapfile: destination of the resulting tab-separated map.
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
# Fixed: this variable was spelled "mapfiel", so the "$mapfile" redirect below
# expanded to nothing and the output was never written to this path.
mapfile="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt"
# Pipeline steps:
#   grep -E ".*"  prints every line of every listed file (prefixed with the
#                 file name when more than one file is given)
#   sed           drops double quotes, then turns the ".tsv:" prefix separator into a tab
#   tr            splits on "/" by turning each slash into a tab
#   cut -f7,9     keeps fields 7 and 9 -- NOTE(review): which path component and
#                 which content column these are depends on the paths in the index; confirm
#   sort | uniq   removes duplicate rows
#   awk           keeps only rows whose second column is longer than 3 characters
grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $0 }' > "$mapfile"
......@@ -9,18 +9,22 @@ from optparse import OptionParser
# --inputPath=PATH Path of inputfile
# --outputPath=PATH Path to place output files
# --outputFile=File Output data set
# --minWordLen Minimum word length
# --minSenLen Minimum sentence length
#
# Output
# Tagged sentences reconstruction
#
# Examples
# python built_bg_sentences.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile annot-input_bg.txt
# --outputFile input_bg_sentences.txt
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
# --inputFile bg_sentences_v2.txt.ner
# --outputFile annot-input_bg.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
# --minWordLen 2
# --minSenLen 1
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output --outputFile annot-input_bg.txt
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
##########################################
# MAIN PROGRAM #
......@@ -32,7 +36,10 @@ if __name__ == "__main__":
parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
parser.add_option("--minSenLen", dest="sL", help="Minimum word length", type="int")
(options, args) = parser.parse_args()
if len(args) > 0:
......@@ -56,11 +63,15 @@ if __name__ == "__main__":
if len(line.split('\t')) > 1:
w = line.split('\t')[1]
if w == "PGCGROWTHCONDITIONS":
#End of sentence
lista.append(sentence)
if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
print( "EXCLUDE: " + sentence.lstrip() )
else:
#End of sentence
lista.append(sentence.lstrip())
#New sentence
n = n+1
#New sentence
sentence = ''
n=n+1
sentence = ''
else:
#Building and save tagging sentence
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save recontrsucted bg-sentences: annot-input_bg_v1.txt
-------------------------------- PROCESSING --------------------------------
Number of sentences: 13903
==================================END===================================
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save recontrsucted bg-sentences: annot-input_bg_v2.txt
-------------------------------- PROCESSING --------------------------------
EXCLUDE:
EXCLUDE:
EXCLUDE: .|.|.
EXCLUDE: \|\|SYM
EXCLUDE: C1|c1|NN
EXCLUDE: C2|c2|NN
EXCLUDE: F1|f1|NN
EXCLUDE: F2|f2|NN
EXCLUDE: LB|lb|NN
EXCLUDE: NA|NA|NNP
EXCLUDE: NC|nc|NN
EXCLUDE: V1|v1|NN
EXCLUDE: wt|wt|JJ
EXCLUDE: WT|WT|NNP
Number of sentences: 13889
==================================END===================================