Showing 16 changed files with 22 additions and 40 deletions
... | @@ -7,10 +7,14 @@ cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/ | ... | @@ -7,10 +7,14 @@ cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/ |
7 | 7 | ||
8 | echo "Access to output extracted baglines" | 8 | echo "Access to output extracted baglines" |
9 | echo "directory: "$(pwd); | 9 | echo "directory: "$(pwd); |
10 | - | 10 | +#all output-extraction files |
11 | index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt" | 11 | index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt" |
12 | -output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v2.txt" | 12 | +#input sentences to run CoreNLP |
13 | -report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v2.txt" | 13 | +output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt" |
14 | +#GSE index by bg_sentence row | ||
15 | +mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx.txt" | ||
16 | +#Number of fields by bagline | ||
17 | +report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v3.txt" | ||
14 | echo | 18 | echo |
15 | echo | 19 | echo |
16 | echo | 20 | echo |
... | @@ -29,10 +33,15 @@ echo "==============================Baglines extraction========================= | ... | @@ -29,10 +33,15 @@ echo "==============================Baglines extraction========================= |
29 | echo | 33 | echo |
30 | echo | 34 | echo |
31 | echo "Add sentence-end-tag PGCGROWTHCONDITIONS" | 35 | echo "Add sentence-end-tag PGCGROWTHCONDITIONS" |
32 | -for gsef in $( cat $index ) | 36 | +#for gsef in $( cat $index ) |
33 | -do | 37 | +#do |
34 | - cat $gsef | sort | uniq; | 38 | +# cat $gsef | sort | uniq; |
35 | -done | cut -f1 | cut -f2 -d'"' | sort | uniq | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output | 39 | +#done | cut -f1 | cut -f2 -d'"' | sort | uniq | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output |
40 | +cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | awk '{ print $_ " PGCGROWTHCONDITIONS" }' | sed 's/\\null\\/null/g' ) | ||
41 | +echo "$cext" | cut -f2 > $output | ||
42 | +echo "$cext" | cut -f2 |wc | ||
43 | +echo "$cext" | cut -f1 > $mapping | ||
44 | +echo "$cext" | cut -f1 |wc | ||
36 | echo | 45 | echo |
37 | echo | 46 | echo |
38 | echo "Number of total baglines: "$(wc -l $output ); | 47 | echo "Number of total baglines: "$(wc -l $output ); |
... | @@ -40,7 +49,7 @@ echo | ... | @@ -40,7 +49,7 @@ echo |
40 | echo "Baglines report" | 49 | echo "Baglines report" |
41 | 50 | ||
42 | 51 | ||
43 | -for gsef in $( cat ../reports/all-output-index.txt) | 52 | +for gsef in $( cat $index) |
44 | do | 53 | do |
45 | cat $gsef | sort | uniq ; | 54 | cat $gsef | sort | uniq ; |
46 | done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report | 55 | done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report |
... | @@ -48,4 +57,4 @@ done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq - | ... | @@ -48,4 +57,4 @@ done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq - |
48 | cat $report | 57 | cat $report |
49 | echo | 58 | echo |
50 | echo | 59 | echo |
51 | -echo "Saving file: /home/egaytan/automatic-extraction-growth-conditions/extraction/bg_sentences_v2.txt"; | 60 | +echo "Saving file: "$output; | ... | ... |
... | @@ -4,7 +4,7 @@ echo "==============================Run CoreNLP================================= | ... | @@ -4,7 +4,7 @@ echo "==============================Run CoreNLP================================= |
4 | echo | 4 | echo |
5 | echo | 5 | echo |
6 | 6 | ||
7 | -input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v2.txt" | 7 | +input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt" |
8 | output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation" | 8 | output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation" |
9 | regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt" | 9 | regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt" |
10 | 10 | ... | ... |
This diff could not be displayed because it is too large.
CoreNLP/input/annotation/bg_sentences_v3.txt
0 → 100644
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 | -index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt" | ||
2 | -mapfiel="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt" | ||
3 | -grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' > $mapfile | ||
4 | - | ||
5 | - |
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
predict-annot/input/annot-input_bg_v3.txt
0 → 100644
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
predict-annot/mapping/bg_sentences_midx.txt
0 → 100644
This diff could not be displayed because it is too large.
1 | --------------------------------- PARAMETERS -------------------------------- | ||
2 | -Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation | ||
3 | -File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner | ||
4 | -Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input | ||
5 | -File to save recontrsucted bg-sentences: annot-input_bg_v2.txt | ||
6 | --------------------------------- PROCESSING -------------------------------- | ||
7 | -EXCLUDE: | ||
8 | -EXCLUDE: | ||
9 | -EXCLUDE: .|.|. | ||
10 | -EXCLUDE: \|\|SYM | ||
11 | -EXCLUDE: C1|c1|NN | ||
12 | -EXCLUDE: C2|c2|NN | ||
13 | -EXCLUDE: F1|f1|NN | ||
14 | -EXCLUDE: F2|f2|NN | ||
15 | -EXCLUDE: LB|lb|NN | ||
16 | -EXCLUDE: NA|NA|NNP | ||
17 | -EXCLUDE: NC|nc|NN | ||
18 | -EXCLUDE: V1|v1|NN | ||
19 | -EXCLUDE: wt|wt|JJ | ||
20 | -EXCLUDE: WT|WT|NNP | ||
21 | -Number of sentences: 13889 | ||
22 | -==================================END=================================== |
1 | -------------------------------- PARAMETERS -------------------------------- | 1 | -------------------------------- PARAMETERS -------------------------------- |
2 | Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation | 2 | Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation |
3 | -File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner | 3 | +File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner |
4 | Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input | 4 | Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input |
5 | -File to save recontrsucted bg-sentences: annot-input_bg_v1.txt | 5 | +File to save recontrsucted bg-sentences: annot-input_bg_v3.txt |
6 | -------------------------------- PROCESSING -------------------------------- | 6 | -------------------------------- PROCESSING -------------------------------- |
7 | -Number of sentences: 13903 | 7 | +Number of sentences: 14716 |
8 | ==================================END=================================== | 8 | ==================================END=================================== | ... | ... |
-
Please register or login to post a comment