#!/usr/bin/env bash
# get-raw-sentences.sh
#
# Collects the tagged metadata lines from the GEO family SOFT files in the
# input directory, appends the sentence-end tag PGCGROWTHCONDITIONS to every
# unique line and writes the result to the CoreNLP input file.
#
# Usage: get-raw-sentences.sh [INPUT_DIR [OUTPUT_FILE]]
#   INPUT_DIR    directory with the tagged files (default: DEFAULT_INPUT_DIR)
#   OUTPUT_FILE  file to overwrite with the sentences
#                (default: DEFAULT_OUTPUT_FILE); a relative path is resolved
#                against the directory the script was started from.
#
# Exit status: 0 on success, non-zero when INPUT_DIR cannot be entered or
# OUTPUT_FILE cannot be written.
#
# The code sticks to constructs that dash accepts as well, so running it as
# `sh get-raw-sentences.sh` keeps working.

set -u

# Original files:
#   /home/egaytan/automatic-extraction-growth-conditions/data-sets/report-manually-tagged-gcs/
# Re-tagged files (used by default):
readonly DEFAULT_INPUT_DIR='/home/egaytan/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/'
# NOTE: "senteneces" is misspelled on purpose here; the file name is kept
# unchanged because other tooling may read this exact path.
readonly DEFAULT_OUTPUT_FILE='/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/raw-metadata-senteneces.txt'
readonly SENTENCE_END_TAG='PGCGROWTHCONDITIONS'

# Print a diagnostic message on stderr.
err() {
  printf '%s\n' "$*" >&2
}

#######################################
# Extract the tagged sentences from the regular files of the current
# directory.
# Globals:   SENTENCE_END_TAG (read)
# Arguments: none
# Outputs:   one line per unique sentence on stdout, each followed by a
#            space and SENTENCE_END_TAG; nothing when there are no files.
# Returns:   status of the last pipeline stage (awk).
#######################################
extract_tagged_sentences() {
  local entry

  # Build the file list in the positional parameters: directories are left
  # out (grep would only warn about them) and an empty directory yields an
  # empty list instead of a literal "*".
  set --
  for entry in *; do
    if [ -f "$entry" ]; then
      set -- "$@" "$entry"
    fi
  done
  [ "$#" -gt 0 ] || return 0

  # 1. keep lines that contain an XML-like tag; -h drops the "file:" prefix
  #    so file names can never leak into (or confuse) the later stages
  # 2. keep only lines containing '!'
  # 3. keep '='-separated fields 2-5; cut passes lines without '=' through
  #    unchanged
  # 4. de-duplicate
  # 5. append the sentence-end tag
  grep -h -E '<[^<]*>' -- "$@" \
    | grep -F '!' \
    | cut -d '=' -f 2-5 \
    | sort \
    | uniq \
    | awk -v tag="$SENTENCE_END_TAG" '{ print $0 " " tag }'
}

#######################################
# Report the contents of the input directory and (re)generate the sentence
# file.
# Globals:   DEFAULT_INPUT_DIR, DEFAULT_OUTPUT_FILE (read)
# Arguments: $1 - input directory (optional)
#            $2 - output file (optional)
# Outputs:   progress report on stdout, diagnostics on stderr
# Returns:   0 on success, 1 on error
#######################################
main() {
  local input_dir=${1:-$DEFAULT_INPUT_DIR}
  local output_file=${2:-$DEFAULT_OUTPUT_FILE}
  local entry
  local entry_count=0
  local sentence_count

  # Anchor a relative output path before the working directory changes.
  case "$output_file" in
    /*) ;;
    *) output_file="$PWD/$output_file" ;;
  esac

  # Without this check a failed cd would make grep scan an unrelated
  # directory and overwrite the output file with its content.
  if ! cd -- "$input_dir"; then
    err "error: cannot change to input directory: $input_dir"
    return 1
  fi

  printf '\n\n\n'
  printf '%s\n' "==============================Family SOFT files======================================= "
  printf '\n'
  printf '%s\n' "Access to GEO family soft files.."
  printf 'directory: %s\n' "$(pwd)"
  printf '\n\n'
  ls -1
  printf '\n'

  # Count the listed entries with a glob rather than by parsing ls output.
  for entry in *; do
    if [ -e "$entry" ] || [ -L "$entry" ]; then
      entry_count=$((entry_count + 1))
    fi
  done
  printf 'Number of files: %s\n' "$entry_count"
  printf '\n\n'

  printf '%s\n' "Filter all paragraphs with tags..."
  printf '%s\n' "Add sentence-end-tag PGCGROWTHCONDITIONS..."
  if ! extract_tagged_sentences > "$output_file"; then
    err "error: cannot write sentences to: $output_file"
    return 1
  fi
  printf '\n'

  # Reading from stdin makes wc print the bare count (no file name); the
  # arithmetic expansion strips the padding some wc implementations add.
  sentence_count=$(wc -l < "$output_file") || return 1
  sentence_count=$((sentence_count))
  printf 'Number of total tag sentences: %s\n' "$sentence_count"
  printf '\n\n'
  printf 'Saving file: %s\n' "$output_file"
}

main "$@"