get-raw-sentences_srr-galagan.sh
2.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# ---------------------------------------------------------------------------
# Setup: enter the directory that holds the extracted baglines and define the
# absolute paths of every file this script reads or writes.
# ---------------------------------------------------------------------------
printf '\n\n\n'
echo "===================================Extraction============================================ "
# Every later step lists files relative to this directory, so abort here
# instead of silently indexing whatever directory the script was started from
# and overwriting the report / CoreNLP input files with unrelated content.
cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/srr_galagan/ || {
  echo "ERROR: cannot enter the srr_galagan extraction output directory" >&2
  exit 1
}
echo "Access to output extracted baglines"
echo "directory: $(pwd)"
# Index file: receives one absolute path per extracted output file.
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/srr_galagan/all-output-index.txt"
# Sentences file: input for CoreNLP, one tagged sentence per line.
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_srr_galagan.txt"
# Mapping file: identifier columns, row-aligned with the sentences file.
mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_srr_galagan.txt"
# Report file: number of occurrences per bagline field name.
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/srr_galagan/bg_report.txt"
printf '\n\n\n'
echo "==============================Baglines index files======================================= "
# Write to $index the absolute path of every entry located directly inside
# each sub-directory of the current directory, one path per line.
#
# A glob plus an explicit directory test replaces the former
# `for gse in $(ls -1)` / `cd $gse; ls -d $PWD/*; cd ..` sequence: when an
# entry was not a directory the `cd` failed, `ls` listed the current directory
# instead, and the following `cd ..` walked out of the outputs directory for
# every remaining iteration.
for gse in *
do
  # Skip plain files, and the literal "*" left behind when nothing matches.
  [ -d "$gse" ] || continue
  ls -d "$PWD/$gse"/*
done > "$index"
echo "Number if extracted files"
wc -l "$index"
printf '\n\n\n'
echo "==============================Baglines extraction======================================="
printf '\n\n'
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
# Previous version of the extraction pipeline, kept for reference:
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )

# With an empty index grep would get no file operands and read stdin instead.
if [ ! -s "$index" ]; then
  echo "ERROR: index file is missing or empty: $index" >&2
  exit 1
fi

# Build the tab-separated table of baglines from every indexed file:
#   grep -H ".*"         emit every line as "<path>:<line>"; -H keeps the path
#                        prefix even when the index lists a single file
#   s/\//\t/7            split the line at the 7th "/" of the path
#   s/1.\tNeubauer//     remove one specific unwanted fragment
#   cut -f2-3            drop the path prefix, keep the next two columns
#   s/-/\t/ (twice)      turn the first two "-" into column separators
#   s/.tsv:/\t/          split at the "<x>tsv:" boundary left by grep
#   s/\"//g, \null\      strip double quotes and literal \null\ markers
#   sort | uniq          de-duplicate rows
#   awk/grep/cut         prefix each row with "<col4>PGC", drop the rows whose
#                        prefix word is V1PGC or WTPGC, remove the prefix again
#   s/GSE[0-9]*\/GSE/    collapse a leading "GSE<digits>/GSE" into "GSE"
# NOTE(review): the column layout presumably relies on file names shaped like
# <GSE>-<GSM>-<name>.tsv located exactly one directory below the outputs
# directory - confirm against the extraction step.
# shellcheck disable=SC2046  # the index is deliberately split into one operand per path
cext=$(
  grep -H -E ".*" $(cat "$index") |
    sed 's/\//\t/7' |
    sed 's/1.\tNeubauer//' |
    cut -f2-3 |
    sed 's/-/\t/' |
    sed 's/-/\t/' |
    sed 's/.tsv:/\t/' |
    sed 's/\"//g' |
    sed 's/\\null\\//g' |
    sort |
    uniq |
    awk 'BEGIN{FS="\t"; OFS="\t"}{ print $4"PGC",$0 }' |
    grep -vw "^V1PGC" |
    grep -vw "^WTPGC" |
    cut -f2- |
    sed 's/GSE[0-9]*\/GSE/GSE/1'
)
# NOTE(review): looks like a leftover debugging dump of the full table -
# confirm that something still reads this file before removing it.
printf '%s\n' "$cext" > "/home/egaytan/ot"
# Column 4 becomes the sentence text; append the sentence-end tag to each row.
printf '%s\n' "$cext" | cut -f4 | awk '{ print $0 " PGCGROWTHCONDITIONS" }' > "$output"
wc "$output"
# Columns 1-3 and 5 form the mapping, row-aligned with the sentences file.
# (The pipe into cut was missing, so the whole table plus the literal text
# " cut -f1-3,5" used to be written to the mapping file.)
printf '%s\n' "$cext" | cut -f1-3,5 > "$mapping"
wc "$mapping"
printf '\n\n'
echo "Number of total baglines: $(wc -l "$output")"
echo
echo "Baglines report"
# Count how often each field name occurs across all indexed files:
#   sort | uniq          de-duplicate the rows of each file separately
#   cut -f2              keep the second tab-separated column
#   cut -f2 -d'"'        keep the second double-quote-delimited piece
#   s/_ch/./g + cut -d.  truncate the name at its first "_ch" or "."
#   sort | uniq -c       count identical names
#   awk                  reformat to "<count><TAB><name>"
# The index is read line by line so that a path is never word-split or
# glob-expanded, unlike the former `for gsef in $(cat $index)` loop.
while IFS= read -r gsef
do
  [ -n "$gsef" ] || continue
  sort "$gsef" | uniq
done < "$index" |
  cut -f2 |
  cut -f2 -d'"' |
  sed 's/_ch/./g' |
  cut -f1 -d'.' |
  sort |
  uniq -c |
  awk '{print $1"\t"$2}' > "$report"
cat "$report"
printf '\n\n'
echo "Saving file: $output"