get-raw-sentences.sh
1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# --- Setup: announce the run, enter the outputs directory, define paths ---
echo
echo
echo
echo "===================================Extraction============================================ "
# Every later step walks the sub-directories of the current directory, so a
# failed cd must stop the run; otherwise the index would be rebuilt from
# whatever directory the script happened to be started in.
outputs_dir="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/"
cd "$outputs_dir" || {
  echo "ERROR: cannot change to directory: $outputs_dir" >&2
  exit 1
}
echo "Access to output extracted baglines"
echo "directory: $(pwd)"
# index  : list of absolute paths of the files found under the outputs directory
# output : deduplicated sentences, each suffixed with the end-of-sentence tag
# report : per-identifier line counts (tab-separated: count, identifier)
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v2.txt"
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v2.txt"
# --- Index: record the absolute path of every file in each sub-directory ---
echo
echo
echo
echo "==============================Baglines index files======================================= "
# absolute file output path
# Iterate with a glob instead of parsing `ls`, so names containing
# whitespace or glob characters are handled as single entries.
for gse in *
do
  # Skip anything that is not a directory (including the literal "*" that
  # the glob leaves behind when the current directory is empty).
  [ -d "$gse" ] || continue
  # The subshell keeps the cd local: the script's working directory never
  # changes, and a failed cd can no longer make `ls` list the wrong place.
  ( cd "$gse" && ls -d "$PWD"/* )
done > "$index"
echo "Number if extracted files"
wc -l "$index"
# --- Extraction: build the deduplicated, tagged sentence file ---
echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
# Read the index line by line so each path is used verbatim (no word
# splitting, no glob expansion). The `|| [ -n ... ]` clause still processes
# a final line that lacks a trailing newline.
while IFS= read -r gsef || [ -n "$gsef" ]
do
  # Blank index lines carry no path.
  [ -n "$gsef" ] || continue
  # Emit each distinct line of the file once.
  sort -u -- "$gsef"
done < "$index" |
  # Keep the first tab-separated column ...
  cut -f1 |
  # ... then the text between the first pair of double quotes
  # (lines without a double quote pass through unchanged).
  cut -f2 -d'"' |
  # Deduplicate across all files.
  sort -u |
  # Append the end-of-sentence tag to every line ($0 is the whole line).
  awk '{ print $0 " PGCGROWTHCONDITIONS" }' > "$output"
echo
echo
echo "Number of total baglines: $(wc -l "$output")"
# --- Report: count lines per identifier taken from the second column ---
echo
echo "Baglines report"
# Use the same index file the previous steps wrote ($index) rather than a
# path relative to the current directory, and read it line by line so each
# path is used verbatim.
while IFS= read -r gsef || [ -n "$gsef" ]
do
  # Blank index lines carry no path.
  [ -n "$gsef" ] || continue
  # Deduplicate within each file first, so the counts below are taken over
  # distinct lines per file.
  sort -u -- "$gsef"
done < "$index" |
  # Second tab-separated column ...
  cut -f2 |
  # ... text between the first pair of double quotes ...
  cut -f2 -d'"' |
  # ... turn every "_ch" into "." so the next cut drops it and what follows ...
  sed 's/_ch/./g' |
  # ... and keep only the part before the first ".".
  cut -f1 -d'.' |
  sort |
  uniq -c |
  # Reformat `uniq -c` output as "<count><TAB><identifier>".
  awk '{print $1"\t"$2}' > "$report"
cat "$report"
echo
echo
# Report the path that was actually written by the extraction step.
echo "Saving file: $output"