Showing
8 changed files
with
55 additions
and
9 deletions
predict-annot/bin/mapping/bg_map_index.sh
0 → 100644
1 | +index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt" | ||
2 | +mapfile="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt" | ||
3 | +grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' > $mapfile | ||
4 | + | ||
5 | + |
... | @@ -9,18 +9,22 @@ from optparse import OptionParser | ... | @@ -9,18 +9,22 @@ from optparse import OptionParser |
9 | # --inputPath=PATH Path of inputfile | 9 | # --inputPath=PATH Path of inputfile |
10 | # --outputPath=PATH Path to place output files | 10 | # --outputPath=PATH Path to place output files |
11 | # --outputFile=File Output data set | 11 | # --outputFile=File Output data set |
12 | +# --minWordLen Minimum word length | ||
13 | +# --minSenLen Minimum sentence length | ||
12 | # | 14 | # |
13 | # Output | 15 | # Output |
14 | # Tagged sentences reconstruction | 16 | # Tagged sentences reconstruction |
15 | # | 17 | # |
16 | # Examples | 18 | # Examples |
17 | # python built_bg_sentences.py | 19 | # python built_bg_sentences.py |
18 | -# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ | 20 | +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation |
19 | -# --inputFile annot-input_bg.txt | 21 | +# --inputFile bg_sentences_v2.txt.ner |
20 | -# --outputFile input_bg_sentences.txt | 22 | +# --outputFile annot-input_bg.txt |
21 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input | 23 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input |
24 | +# --minWordLen 2 | ||
25 | +# --minSenLen 1 | ||
22 | # | 26 | # |
23 | -#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output --outputFile annot-input_bg.txt | 27 | +#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 |
24 | 28 | ||
25 | ########################################## | 29 | ########################################## |
26 | # MAIN PROGRAM # | 30 | # MAIN PROGRAM # |
... | @@ -32,7 +36,10 @@ if __name__ == "__main__": | ... | @@ -32,7 +36,10 @@ if __name__ == "__main__": |
32 | parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH") | 36 | parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH") |
33 | parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") | 37 | parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") |
34 | parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE") | 38 | parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE") |
35 | - parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE") | 39 | + parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE") |
40 | + parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int") | ||
41 | + parser.add_option("--minSenLen", dest="sL", help="Minimum sentence length", type="int") | ||
42 | + | ||
36 | 43 | ||
37 | (options, args) = parser.parse_args() | 44 | (options, args) = parser.parse_args() |
38 | if len(args) > 0: | 45 | if len(args) > 0: |
... | @@ -56,11 +63,15 @@ if __name__ == "__main__": | ... | @@ -56,11 +63,15 @@ if __name__ == "__main__": |
56 | if len(line.split('\t')) > 1: | 63 | if len(line.split('\t')) > 1: |
57 | w = line.split('\t')[1] | 64 | w = line.split('\t')[1] |
58 | if w == "PGCGROWTHCONDITIONS": | 65 | if w == "PGCGROWTHCONDITIONS": |
59 | - #End of sentence | 66 | + if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL: |
60 | - lista.append(sentence) | 67 | + print( "EXCLUDE: " + sentence.lstrip() ) |
68 | + else: | ||
69 | + #End of sentence | ||
70 | + lista.append(sentence.lstrip()) | ||
71 | + #New sentence | ||
72 | + n = n+1 | ||
61 | #New sentence | 73 | #New sentence |
62 | - sentence = '' | 74 | + sentence = '' |
63 | - n=n+1 | ||
64 | else: | 75 | else: |
65 | #Building and save tagging sentence | 76 | #Building and save tagging sentence |
66 | sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])) | 77 | sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])) | ... | ... |
This diff could not be displayed because it is too large.
predict-annot/input/annot-input_bg_v2.txt
0 → 100644
This diff could not be displayed because it is too large.
predict-annot/mapping/bg_GSE_map_index.txt
0 → 100644
This diff could not be displayed because it is too large.
predict-annot/mapping/bg_GSM_map_index.txt
0 → 100644
This diff could not be displayed because it is too large.
1 | +-------------------------------- PARAMETERS -------------------------------- | ||
2 | +Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation | ||
3 | +File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner | ||
4 | +Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input | ||
5 | +File to save recontrsucted bg-sentences: annot-input_bg_v1.txt | ||
6 | +-------------------------------- PROCESSING -------------------------------- | ||
7 | +Number of sentences: 13903 | ||
8 | +==================================END=================================== |
1 | +-------------------------------- PARAMETERS -------------------------------- | ||
2 | +Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation | ||
3 | +File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner | ||
4 | +Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input | ||
5 | +File to save recontrsucted bg-sentences: annot-input_bg_v2.txt | ||
6 | +-------------------------------- PROCESSING -------------------------------- | ||
7 | +EXCLUDE: | ||
8 | +EXCLUDE: | ||
9 | +EXCLUDE: .|.|. | ||
10 | +EXCLUDE: \|\|SYM | ||
11 | +EXCLUDE: C1|c1|NN | ||
12 | +EXCLUDE: C2|c2|NN | ||
13 | +EXCLUDE: F1|f1|NN | ||
14 | +EXCLUDE: F2|f2|NN | ||
15 | +EXCLUDE: LB|lb|NN | ||
16 | +EXCLUDE: NA|NA|NNP | ||
17 | +EXCLUDE: NC|nc|NN | ||
18 | +EXCLUDE: V1|v1|NN | ||
19 | +EXCLUDE: wt|wt|JJ | ||
20 | +EXCLUDE: WT|WT|NNP | ||
21 | +Number of sentences: 13889 | ||
22 | +==================================END=================================== |
-
Please register or login to post a comment