Estefani Gaytan Nunez

upload

1 +index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
2 +mapfile="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt"
3 +grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' > $mapfile
4 +
5 +
...@@ -9,18 +9,22 @@ from optparse import OptionParser ...@@ -9,18 +9,22 @@ from optparse import OptionParser
9 # --inputPath=PATH Path of inputfile 9 # --inputPath=PATH Path of inputfile
10 # --outputPath=PATH Path to place output files 10 # --outputPath=PATH Path to place output files
11 # --outputFile=File Output data set 11 # --outputFile=File Output data set
12 +# --minWordLen Minimum word length
13 +# --minSenLen Minimum sentence length
12 # 14 #
13 # Output 15 # Output
14 # Tagged sentences reconstruction 16 # Tagged sentences reconstruction
15 # 17 #
16 # Examples 18 # Examples
17 # python built_bg_sentences.py 19 # python built_bg_sentences.py
18 -# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ 20 +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
19 -# --inputFile annot-input_bg.txt 21 +# --inputFile bg_sentences_v2.txt.ner
20 -# --outputFile input_bg_sentences.txt 22 +# --outputFile annot-input_bg.txt
21 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input 23 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
24 +# --minWordLen 2
25 +# --minSenLen 1
22 # 26 #
23 -#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output --outputFile annot-input_bg.txt 27 +#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
24 28
25 ########################################## 29 ##########################################
26 # MAIN PROGRAM # 30 # MAIN PROGRAM #
...@@ -32,7 +36,10 @@ if __name__ == "__main__": ...@@ -32,7 +36,10 @@ if __name__ == "__main__":
32 parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH") 36 parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
33 parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") 37 parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
34 parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE") 38 parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
35 - parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE") 39 + parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
40 + parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
40 + parser.add_option("--minSenLen", dest="sL", help="Minimum sentence length", type="int")
42 +
36 43
37 (options, args) = parser.parse_args() 44 (options, args) = parser.parse_args()
38 if len(args) > 0: 45 if len(args) > 0:
...@@ -56,11 +63,15 @@ if __name__ == "__main__": ...@@ -56,11 +63,15 @@ if __name__ == "__main__":
56 if len(line.split('\t')) > 1: 63 if len(line.split('\t')) > 1:
57 w = line.split('\t')[1] 64 w = line.split('\t')[1]
58 if w == "PGCGROWTHCONDITIONS": 65 if w == "PGCGROWTHCONDITIONS":
59 - #End of sentence 66 + if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
60 - lista.append(sentence) 67 + print( "EXCLUDE: " + sentence.lstrip() )
68 + else:
69 + #End of sentence
70 + lista.append(sentence.lstrip())
71 + #New sentence
72 + n = n+1
61 #New sentence 73 #New sentence
62 - sentence = '' 74 + sentence = ''
63 - n=n+1
64 else: 75 else:
65 #Building and save tagging sentence 76 #Building and save tagging sentence
66 sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])) 77 sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +-------------------------------- PARAMETERS --------------------------------
2 +Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
3 +File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
4 +Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
5 +File to save recontrsucted bg-sentences: annot-input_bg_v1.txt
6 +-------------------------------- PROCESSING --------------------------------
7 +Number of sentences: 13903
8 +==================================END===================================
1 +-------------------------------- PARAMETERS --------------------------------
2 +Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
3 +File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
4 +Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
5 +File to save recontrsucted bg-sentences: annot-input_bg_v2.txt
6 +-------------------------------- PROCESSING --------------------------------
7 +EXCLUDE:
8 +EXCLUDE:
9 +EXCLUDE: .|.|.
10 +EXCLUDE: \|\|SYM
11 +EXCLUDE: C1|c1|NN
12 +EXCLUDE: C2|c2|NN
13 +EXCLUDE: F1|f1|NN
14 +EXCLUDE: F2|f2|NN
15 +EXCLUDE: LB|lb|NN
16 +EXCLUDE: NA|NA|NNP
17 +EXCLUDE: NC|nc|NN
18 +EXCLUDE: V1|v1|NN
19 +EXCLUDE: wt|wt|JJ
20 +EXCLUDE: WT|WT|NNP
21 +Number of sentences: 13889
22 +==================================END===================================