Estefani Gaytan Nunez

upload

......@@ -7,10 +7,14 @@ cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/
echo "Access to output extracted baglines"
echo "directory: "$(pwd);
#all output-extraction files
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v2.txt"
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v2.txt"
#input sentences to run CoreNLP
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt"
#GSE index by bg_sentence row
mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx.txt"
#Number of fields by bagline
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v3.txt"
echo
echo
echo
......@@ -29,10 +33,15 @@ echo "==============================Baglines extraction=========================
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
for gsef in $( cat $index )
do
cat $gsef | sort | uniq;
done | cut -f1 | cut -f2 -d'"' | sort | uniq | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
#for gsef in $( cat $index )
#do
# cat $gsef | sort | uniq;
#done | cut -f1 | cut -f2 -d'"' | sort | uniq | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
# Collect every unique (field 7, field 9) pair found in the files listed in
# $index and keep it in $cext as tab-separated rows. Pipeline stages, in order:
#   grep -E ".*"   print every line of every listed file; with several files
#                  grep prefixes each line with "<file>:"
#   sed 's/"//g'   drop all double quotes
#   sed .tsv:      turn the "tsv:" file/content separator into "tsv<TAB>"
#   tr '/' '\t'    split path components (and any '/' in the content) on tabs
#   cut -f7,9      keep fields 7 and 9 only
#                  NOTE(review): presumably the GSE directory and the sentence,
#                  given the path depth of the entries in $index - confirm
#   sort | uniq    remove duplicate rows
#   awk (1st)      keep rows whose second field is longer than 3 characters
#   awk (2nd)      append the sentence-end tag " PGCGROWTHCONDITIONS"
#   sed (last)     rewrite a literal \null\ as null
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | awk '{ print $_ " PGCGROWTHCONDITIONS" }' | sed 's/\\null\\/null/g' )
# Second column (tagged sentence) goes to $output; wc prints its line/word/byte counts.
echo "$cext" | cut -f2 > $output
echo "$cext" | cut -f2 |wc
# First column goes to $mapping, row-aligned with $output; wc prints its counts.
echo "$cext" | cut -f1 > $mapping
echo "$cext" | cut -f1 |wc
echo
echo
echo "Number of total baglines: "$(wc -l $output );
......@@ -40,7 +49,7 @@ echo
echo "Baglines report"
for gsef in $( cat ../reports/all-output-index.txt)
for gsef in $( cat $index)
do
cat $gsef | sort | uniq ;
done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report
......@@ -48,4 +57,4 @@ done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -
cat $report
echo
echo
echo "Saving file: /home/egaytan/automatic-extraction-growth-conditions/extraction/bg_sentences_v2.txt";
echo "Saving file: "$output;
......
......@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP=================================
echo
echo
input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v2.txt"
input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt"
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
# Build the bagline map index: one "<field 7><TAB><field 9>" row for every
# unique line found in the files listed in $index, written to $mapfile.
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
# This variable was previously spelled "mapfiel", so the "> $mapfile" redirect
# expanded to nothing and the script could not write its output.
mapfile="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt"

# build_map_index INDEX_FILE MAP_FILE
#   INDEX_FILE  text file listing one input file path per line
#   MAP_FILE    destination for the tab-separated map rows
# Returns non-zero (with a message on stderr) when INDEX_FILE is unreadable
# or lists no files; otherwise the status of the final pipeline stage.
# The body runs in a subshell so its variables and positional parameters
# do not leak into the caller.
build_map_index() (
  index_file=$1
  map_file=$2

  if [ ! -r "$index_file" ]; then
    printf 'error: cannot read index file: %s\n' "$index_file" >&2
    return 1
  fi

  # Gather the listed paths as positional parameters so that each path is
  # passed to grep as exactly one argument; blank lines are skipped.
  set --
  while read -r tsv_path || [ -n "$tsv_path" ]; do
    if [ -n "$tsv_path" ]; then
      set -- "$@" "$tsv_path"
    fi
  done < "$index_file"

  # Without any file argument grep would block reading standard input.
  if [ "$#" -eq 0 ]; then
    printf 'error: index file lists no files: %s\n' "$index_file" >&2
    return 1
  fi

  # -H forces the "<file>:" prefix even when only one file is listed; the
  # field positions used by cut below depend on that prefix being present.
  # After quotes are stripped and the "tsv:" separator becomes a tab, every
  # '/' is turned into a tab, so cut -f7,9 selects the 7th path component
  # and the first content column (for paths with 8 '/'-separated parts).
  # Rows whose second field has 3 characters or fewer are dropped.
  grep -H -E ".*" -- "$@" \
    | sed 's/"//g' \
    | sed 's/.tsv:/.tsv\t/' \
    | tr '/' '\t' \
    | cut -f7,9 \
    | sort \
    | uniq \
    | awk 'BEGIN {FS="\t"} length($2) > 3 { print $0 }' > "$map_file"
)

build_map_index "$index" "$mapfile"
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save recontrsucted bg-sentences: annot-input_bg_v2.txt
-------------------------------- PROCESSING --------------------------------
EXCLUDE:
EXCLUDE:
EXCLUDE: .|.|.
EXCLUDE: \|\|SYM
EXCLUDE: C1|c1|NN
EXCLUDE: C2|c2|NN
EXCLUDE: F1|f1|NN
EXCLUDE: F2|f2|NN
EXCLUDE: LB|lb|NN
EXCLUDE: NA|NA|NNP
EXCLUDE: NC|nc|NN
EXCLUDE: V1|v1|NN
EXCLUDE: wt|wt|JJ
EXCLUDE: WT|WT|NNP
Number of sentences: 13889
==================================END===================================
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save recontrsucted bg-sentences: annot-input_bg_v1.txt
File to save recontrsucted bg-sentences: annot-input_bg_v3.txt
-------------------------------- PROCESSING --------------------------------
Number of sentences: 13903
Number of sentences: 14716
==================================END===================================
......