# get-raw-sentences.sh
# 892 Bytes
# Collect the tagged metadata sentences from the GEO family SOFT files and
# write them, de-duplicated and one per line, to the CoreNLP input file.
# Every output line is suffixed with the marker " PGCGROWTHCONDITIONS".
#
# Progress messages are printed to stdout. Errors are printed to stderr and
# abort the script with exit status 1.

# Directory that holds the tagged input files.
input_dir=/home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data
# Destination file. The spelling "senteneces" is kept unchanged on purpose:
# renaming the file could break whatever reads this path afterwards.
output_file=/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/input/raw-metadata-senteneces.txt

# Stop here if the input directory is unavailable; otherwise the grep below
# would scan whatever the current directory is and overwrite the output file.
cd "$input_dir" || {
  echo "ERROR: cannot change directory to $input_dir" >&2
  exit 1
}

echo
echo
echo
echo "==============================Family SOFT files======================================= "
echo
echo "Access to GEO family soft files.."
echo "directory: $(pwd)"
echo
echo
ls -1
echo
# Arithmetic expansion strips the padding some wc implementations emit.
file_count=$(ls -1 | wc -l)
echo "Number of files: $((file_count))"
echo
echo
echo "Filter all paragraphs with tags..."
echo "Add sentence-end-tag PGCGROWTHCONDITIONS..."

# Pipeline stages:
#   1. keep lines that contain something shaped like a <tag>;
#   2. of those, keep lines that contain a '!';
#   3. keep only the text between the first and the second '='
#      (lines without any '=' pass through unchanged);
#      NOTE(review): text after a second '=' is discarded -- confirm that
#      "-f2" rather than "-f2-" is really intended;
#   4. sort and drop duplicate lines;
#   5. append the sentence-end marker to every line.
# "--" keeps a file name that starts with '-' from being read as an option.
grep -E -e '<[^<]*>' -- * \
  | grep -F '!' \
  | cut -f2 -d'=' \
  | sort -u \
  | awk '{ print $0 " PGCGROWTHCONDITIONS" }' \
  > "$output_file" || {
  echo "ERROR: cannot write $output_file" >&2
  exit 1
}

echo
# Printed as "<line count> <path>", the same layout "wc -l <path>" produces.
sentence_count=$(wc -l < "$output_file")
echo "Number of total tag sentences: $((sentence_count)) $output_file"
echo
echo
echo "Saving file: $output_file"