# get-raw-sentences_v4.sh (2.11 KB)
# Print the "Extraction" banner and move into the directory that holds the
# extracted output; the rest of the script lists the current directory.
echo
echo
echo
echo "===================================Extraction============================================ "

# Stop here if the directory is unreachable: carrying on would list and
# index whatever directory the script happened to be started from.
cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/ || {
  echo "ERROR: cannot cd to /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/" >&2
  exit 1
}

echo "Access to output extracted baglines"
echo "directory: $(pwd)"
# --- Files kept under extraction-geo/reports -------------------------------
# index : list of all output-extraction files
# report: number of fields by bagline
index='/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt'
report='/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v4.txt'

# --- Files handed to later stages -------------------------------------------
# output : input sentences used to run CoreNLP
# mapping: GSE index, one row per bg_sentence row
output='/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt'
mapping='/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_v4.txt'
echo
echo
echo
echo "==============================Baglines index files======================================= "

#######################################
# Print the absolute path of every entry located directly inside each
# sub-directory of the current directory, one path per line.
#
# Uses globs instead of `cd` + `ls`, so the working directory never changes
# and plain files sitting next to the sub-directories are skipped. (With the
# former `cd $gse; ...; cd ..` form, a failed `cd` into a non-directory was
# still followed by `cd ..`, which walked out of the directory being indexed.)
# Empty sub-directories contribute nothing.
# Globals:   PWD (read)
# Arguments: none
# Outputs:   absolute paths on stdout
#######################################
list_extracted_files() {
  local gse_dir entry
  for gse_dir in "$PWD"/*/; do
    # No sub-directory at all: the pattern is left unexpanded.
    [ -d "$gse_dir" ] || continue
    for entry in "$gse_dir"*; do
      # Empty sub-directory: the pattern is left unexpanded.
      # -L keeps dangling symlinks, which `ls -d` would also have listed.
      [ -e "$entry" ] || [ -L "$entry" ] || continue
      printf '%s\n' "$entry"
    done
  done
}

# absolute file output path
list_extracted_files > "$index"
echo "Number if extracted files"
wc -l "$index"
echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"

#######################################
# Dump every line of every file listed in an index file as a sorted,
# de-duplicated, tab-separated record.
#
# Each line first gets its file path prepended by grep ("path:line"), then:
#   1. the 7th "/" becomes a tab and only fields 2-3 are kept, which drops
#      the leading part of the path (for paths of the form
#      <outputs dir>/<sub-directory>/<file> under the directory this script
#      cd's into, that slash is the one right before the file name);
#   2. the first two "-" become tabs and ".tsv:" becomes a tab
#      (presumably file names look like A-B-C.tsv -- TODO confirm);
#   3. double quotes are stripped, the literal "\null\" becomes "null", and
#      one specific "1.<TAB>Neubauer" string is removed (its purpose is not
#      evident from this script -- TODO confirm).
# The record layout consumed below is: fields 1-3 from the file name,
# field 4 the sentence, field 5 the next column of the source line.
#
# grep -H forces the "path:" prefix even when the index lists a single file
# (without it grep omits the prefix and every later field shifts).
# xargs -r runs nothing for an empty index (a bare grep would wait on stdin)
# and batches very long file lists; -d '\n' takes one path per line.
# Relies on GNU tools: "\t" in sed expressions, xargs -r/-d, grep -H.
# Arguments: $1 - index file, one path per line
# Outputs:   records on stdout
#######################################
extract_baglines() {
  xargs -r -d '\n' grep -H -E ".*" -- < "$1" \
    | sed 's/\//\t/7' \
    | cut -f2-3 \
    | sed 's/-/\t/' \
    | sed 's/-/\t/' \
    | sed 's/.tsv:/\t/' \
    | sed 's/\"//g' \
    | sed 's/1.\tNeubauer//' \
    | sed 's/\\null\\/null/g' \
    | sort \
    | uniq
}

# Earlier variant of the extraction pipeline, kept for reference:
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"}  length($2) > 3 { print  $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g'  | sed 's/-/\t/' | sed 's/-/\t/' )
cext=$(extract_baglines "$index")

# Field 4 (the sentence) plus the end-of-sentence tag goes to the sentences
# file. printf is used instead of echo so backslashes in the data are never
# interpreted, whichever shell runs the script.
printf '%s\n' "$cext" | cut -f4 | awk '{ print $0 " PGCGROWTHCONDITIONS" }' > "$output"
wc "$output"
# Fields 1-3 and 5 go to the mapping file, row-aligned with the sentences file.
printf '%s\n' "$cext" | cut -f1-3,5 > "$mapping"
wc "$mapping"
echo
echo
echo "Number of total baglines: $(wc -l "$output")"
echo
echo "Baglines report"

#######################################
# Count how often each field name occurs across the files listed in an
# index file and print "count<TAB>name" lines.
#
# Per file, duplicate lines are collapsed first. Column 2 is then reduced to
# the text between its first pair of double quotes, every "_ch" is turned
# into ".", and everything from the first "." on is dropped (so e.g.
# "characteristics_ch1.2" is counted as "characteristics").
# Paths are read line by line, so a path containing spaces or glob
# characters is used as-is rather than being word-split.
# Arguments: $1 - index file, one path per line
# Outputs:   "count<TAB>name" lines on stdout, sorted by name
#######################################
bagline_field_report() {
  local gsef
  # `|| [ -n "$gsef" ]` keeps a last line that has no trailing newline.
  while IFS= read -r gsef || [ -n "$gsef" ]; do
    [ -n "$gsef" ] || continue
    sort -- "$gsef" | uniq
  done < "$1" \
    | cut -f2 \
    | cut -f2 -d'"' \
    | sed 's/_ch/./g' \
    | cut -f1 -d'.' \
    | sort \
    | uniq -c \
    | awk '{print $1"\t"$2}'
}

bagline_field_report "$index" > "$report"

cat "$report"
echo
echo
echo "Saving file: $output"