#!/usr/bin/env bash
#
# get-raw-sentences.sh
#
# Collects the tagged metadata lines from the GEO family SOFT files found in
# one directory and writes them, de-duplicated and each terminated with the
# sentence-end tag PGCGROWTHCONDITIONS, to a single text file.
#
# Usage: get-raw-sentences.sh [INPUT_DIR [OUTPUT_FILE]]
#   INPUT_DIR    directory holding the tagged files
#                (default: the re-tagged data set, see DEFAULT_INPUT_DIR)
#   OUTPUT_FILE  file that receives the sentences
#                (default: see DEFAULT_OUTPUT_FILE; a relative path is
#                resolved against the directory the script was started in)
#
# Exit status: 0 on success, 1 if INPUT_DIR cannot be entered or OUTPUT_FILE
# cannot be written.

set -u

# Re-tagged files (default input).
# The original files live in:
#   /home/egaytan/automatic-extraction-growth-conditions/data-sets/report-manually-tagged-gcs/
readonly DEFAULT_INPUT_DIR='/home/egaytan/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/'

# Single source of truth for the output location: the same path is written,
# counted and reported. The directory is spelled "test-training"; the file
# name spelling ("senteneces") is kept as-is because other steps may
# reference it -- TODO confirm before renaming.
readonly DEFAULT_OUTPUT_FILE='/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/test-training/raw-metadata-senteneces.txt'

# Marker appended to every extracted line.
readonly SENTENCE_END_TAG='PGCGROWTHCONDITIONS'

#######################################
# Extracts the tagged sentences from every file in the current directory.
# Arguments: $1 - file to write the result to
# Returns:   non-zero if the output file cannot be written
#######################################
extract_tagged_sentences() {
  local output_file=$1

  # Stage by stage:
  #   1. keep lines containing at least one <...> tag (grep prefixes each
  #      line with "file:" when the glob expands to several files)
  #   2. keep only the lines that contain a '!'
  #   3. drop everything up to the first '=' (keeps '='-separated fields 2-5)
  #   4. sort and remove duplicates
  #   5. append the sentence-end tag
  grep -E -- '<[^<]*>' * \
    | grep -E '!' \
    | cut -f2-5 -d'=' \
    | sort \
    | uniq \
    | awk -v tag="${SENTENCE_END_TAG}" '{ print $0 " " tag }' \
    > "${output_file}"
}

#######################################
# Entry point: prints a short report and produces the sentence file.
# Arguments: $1 - input directory (optional)
#            $2 - output file (optional)
# Outputs:   progress report on stdout, errors on stderr
# Returns:   0 on success, 1 on error
#######################################
main() {
  local input_dir=${1:-${DEFAULT_INPUT_DIR}}
  local output_file=${2:-${DEFAULT_OUTPUT_FILE}}
  local file_count sentence_count

  # Anchor a relative output path before the working directory changes.
  case "${output_file}" in
    /*) ;;
    *) output_file="${PWD}/${output_file}" ;;
  esac

  # Without this check a failed cd would let grep scan whatever directory
  # the script happened to be started in.
  if ! cd -- "${input_dir}"; then
    printf 'error: cannot change into input directory: %s\n' "${input_dir}" >&2
    return 1
  fi

  printf '\n\n\n'
  echo "==============================Family SOFT files======================================= "
  echo
  echo "Access to GEO family soft files.."
  echo "directory: $(pwd)"
  printf '\n\n'

  ls -1
  echo

  # tr strips the padding some wc implementations put around the number.
  file_count=$(ls -1 | wc -l | tr -d '[:space:]')
  echo "Number of files: ${file_count}"
  printf '\n\n'

  echo "Filter all paragraphs with tags..."
  echo "Add sentence-end-tag ${SENTENCE_END_TAG}..."
  if ! extract_tagged_sentences "${output_file}"; then
    printf 'error: cannot write output file: %s\n' "${output_file}" >&2
    return 1
  fi
  echo

  sentence_count=$(wc -l < "${output_file}" | tr -d '[:space:]')
  echo "Number of total tag sentences: ${sentence_count} ${output_file}"
  printf '\n\n'
  echo "Saving file: ${output_file}"
}

main "$@"