#!/usr/bin/env bash
# get-raw-sentences_v3.sh
#
# Collects the extracted bagline files found under the extraction-geo
# outputs directory and derives the sentence, mapping and report files
# from them.
echo
echo
echo
echo "===================================Extraction============================================ "

# The file listing that follows is taken relative to this directory, so
# stop here instead of silently working on whatever directory the script
# happened to be started from.
cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/ || {
  echo "ERROR: cannot access the extraction-geo outputs directory" >&2
  exit 1
}

echo "Access to output extracted baglines"
echo "directory: $(pwd)"
# Common root shared by every path used by this script
gc_root="/home/egaytan/automatic-extraction-growth-conditions"

# Index of all output-extraction files
index="${gc_root}/extraction-geo/reports/all-output-index.txt"
# Sentences used as input to run CoreNLP
output="${gc_root}/CoreNLP/input/annotation/bg_sentences_v3.txt"
# GSE index, one entry per bg_sentence row
mapping="${gc_root}/predict-annot/mapping/bg_sentences_midx.txt"
# Number of fields by bagline
report="${gc_root}/extraction-geo/reports/bg_report_v3.txt"
echo
echo
echo
echo "==============================Baglines index files======================================= "

# list_bagline_files ROOT
#   Prints, one per line, the path ROOT/<subdir>/<entry> of every entry
#   located directly inside each sub-directory of ROOT.
#   Arguments: $1 - directory holding one sub-directory per extraction
#   Outputs:   the paths on stdout, in glob (sorted) order
#   Entries of ROOT that are not directories are skipped, and the working
#   directory is never changed, so one odd entry cannot shift the listing
#   to a different directory.
list_bagline_files() {
  local root="${1%/}" gse_dir entry
  for gse_dir in "$root"/*/; do
    # An unmatched glob stays literal; skip it.
    [ -d "$gse_dir" ] || continue
    for entry in "${gse_dir%/}"/*; do
      # Skip the literal pattern left behind by an empty directory, but
      # keep dangling symlinks, which a plain listing would also show.
      [ -e "$entry" ] || [ -L "$entry" ] || continue
      printf '%s\n' "$entry"
    done
  done
}

# absolute file output path
list_bagline_files "$PWD" > "$index"
echo "Number if extracted files"
wc -l "$index"
echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
# Previous implementation, kept for reference:
#for gsef in $( cat $index )
#do
#  cat $gsef | sort | uniq;
#done | cut -f1 | cut -f2 -d'"' | sort | uniq | awk '{ print $_ " PGCGROWTHCONDITIONS" }'  > $output

# extract_baglines INDEX_FILE
#   Reads the file paths listed (one per line) in INDEX_FILE and prints the
#   unique "<7th path field>TAB<first column> PGCGROWTHCONDITIONS" rows
#   built from every line of those files.
#   Arguments: $1 - index file with one path per line
#   Outputs:   tagged rows on stdout; diagnostics on stderr
#   Returns:   non-zero when INDEX_FILE cannot be read
#   The field numbers passed to cut are tied to the depth of the listed
#   paths: after '/' becomes a tab, field 7 is the directory that holds
#   the file and field 9 is the first column of the line.
#   NOTE(review): tr also converts any '/' inside the sentence text, so the
#   sentence is kept only up to its first '/' - confirm this is intended.
extract_baglines() {
  local index_file="$1"
  if [ ! -r "$index_file" ]; then
    printf 'extract_baglines: cannot read index file: %s\n' "$index_file" >&2
    return 1
  fi
  # grep -H forces the "file:" prefix even when only one file is listed;
  # xargs -0 keeps each path intact and copes with very long file lists.
  sed '/^$/d' "$index_file" \
    | tr '\n' '\0' \
    | xargs -0 grep -H -E '.*' -- \
    | sed 's/"//g' \
    | sed 's/.tsv:/.tsv\t/' \
    | tr '/' '\t' \
    | cut -f7,9 \
    | sort \
    | uniq \
    | awk -F '\t' 'length($2) > 3 { print $0 " PGCGROWTHCONDITIONS" }' \
    | sed 's/\\null\\/null/g'
}

cext=$(extract_baglines "$index")
# Column 2 holds the tagged sentences, column 1 the matching directory name.
printf '%s\n' "$cext" | cut -f2 > "$output"
printf '%s\n' "$cext" | cut -f2 | wc
printf '%s\n' "$cext" | cut -f1 > "$mapping"
printf '%s\n' "$cext" | cut -f1 | wc
echo
echo
echo "Number of total baglines: $(wc -l "$output")"
echo
echo "Baglines report"


# bagline_report INDEX_FILE
#   Prints "<count>TAB<field name>" for the second column of the files
#   listed (one path per line) in INDEX_FILE.
#   Arguments: $1 - index file with one path per line
#   Outputs:   the report on stdout; diagnostics on stderr
#   Returns:   non-zero when INDEX_FILE cannot be read
#   Duplicate lines are removed per file before counting. The field name
#   is the quoted second column with "_ch" turned into "." and everything
#   from the first "." onwards dropped.
bagline_report() {
  local index_file="$1" gsef
  if [ ! -r "$index_file" ]; then
    printf 'bagline_report: cannot read index file: %s\n' "$index_file" >&2
    return 1
  fi
  # Read the index line by line so paths are never word-split or globbed.
  while IFS= read -r gsef || [ -n "$gsef" ]; do
    [ -n "$gsef" ] || continue
    sort -- "$gsef" | uniq
  done < "$index_file" \
    | cut -f2 \
    | cut -f2 -d'"' \
    | sed 's/_ch/./g' \
    | cut -f1 -d'.' \
    | sort \
    | uniq -c \
    | awk '{print $1"\t"$2}'
}

bagline_report "$index" > "$report"

cat "$report"
echo
echo
echo "Saving file: $output"