upload

Estefani Gaytan Nunez
Commit 5596beb29b229121e799cf3ab14b3ed64a563086 5596beb2 1 parent 5c170cf2
Showing 16 changed files with 22 additions and 40 deletions
CoreNLP/bin/annotation/get-raw-sentences.sh
CoreNLP/bin/annotation/single_run_regexNER.sh
CoreNLP/input/annotation/bg_sentences_v2.txt
CoreNLP/input/annotation/bg_sentences_v3.txt
CoreNLP/output/annotation/bg_sentences_v2.txt.ner
CoreNLP/output/annotation/bg_sentences_v3.txt.ner
extraction-geo/reports/bg_report_v2.txt → extraction-geo/reports/bg_report_v3.txt
predict-annot/bin/mapping/bg_map_index.sh
predict-annot/input/annot-input_bg_v1.txt
predict-annot/input/annot-input_bg_v2.txt
predict-annot/input/annot-input_bg_v3.txt
predict-annot/mapping/bg_GSE_map_index.txt
predict-annot/mapping/bg_GSM_map_index.txt
predict-annot/mapping/bg_sentences_midx.txt
predict-annot/reports/annot-input_bg_report_v2.txt
predict-annot/reports/annot-input_bg_report_v1.txt → predict-annot/reports/annot-input_bg_report_v3.txt
--- a/CoreNLP/bin/annotation/get-raw-sentences.sh
View file @5596beb
+++ b/CoreNLP/bin/annotation/get-raw-sentences.sh
View file @5596beb
@@ -7,10 +7,14 @@ cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/
 
 echo "Access to output extracted baglines"
 echo "directory: "$(pwd);
- 
+ # index file listing the paths of all output-extraction files
 index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
- output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v2.txt"
- report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v2.txt"
+ #input sentences to run CoreNLP
+ output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt"
+ # GSE index for each bg_sentence row (written in the same line order as $output)
+ mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx.txt"
+ # report file: number of baglines counted for each field
+ report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v3.txt"
 echo
 echo
 echo
@@ -29,10 +33,15 @@ echo "==============================Baglines extraction=========================
 echo
 echo
 echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
- for gsef in $( cat $index )
- do
-   cat $gsef | sort | uniq;
- done | cut -f1 | cut -f2 -d'"' | sort | uniq | awk '{ print $_ " PGCGROWTHCONDITIONS" }'  > $output
+ #for gsef in $( cat $index )
+ #do
+ #  cat $gsef | sort | uniq;
+ #done | cut -f1 | cut -f2 -d'"' | sort | uniq | awk '{ print $_ " PGCGROWTHCONDITIONS" }'  > $output
+ cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"}  length($2) > 3 { print  $_}' | awk '{ print $_ " PGCGROWTHCONDITIONS" }' | sed 's/\\null\\/null/g' )
+ echo "$cext" | cut -f2 > $output
+ echo "$cext" | cut -f2 |wc
+ echo "$cext" | cut -f1 > $mapping
+ echo "$cext" | cut -f1 |wc
 echo
 echo
 echo "Number of total baglines: "$(wc -l $output );
@@ -40,7 +49,7 @@ echo
 echo "Baglines report"
 
 
- for gsef in $( cat ../reports/all-output-index.txt)
+ for gsef in $( cat $index)
 do
   cat $gsef | sort | uniq ;
 done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c |  awk '{print $1"\t"$2}' > $report
@@ -48,4 +57,4 @@ done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -
 cat $report
 echo
 echo
- echo "Saving file: /home/egaytan/automatic-extraction-growth-conditions/extraction/bg_sentences_v2.txt";
+ echo "Saving file: "$output;
--- a/CoreNLP/bin/annotation/single_run_regexNER.sh
View file @5596beb
+++ b/CoreNLP/bin/annotation/single_run_regexNER.sh
View file @5596beb
@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP=================================
 echo
 echo
 
- input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v2.txt"
+ input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt"
 output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
 regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
 
--- a/CoreNLP/input/annotation/bg_sentences_v2.txt deleted 100644 → 0
View file @5c170cf
+++ b/CoreNLP/input/annotation/bg_sentences_v2.txt deleted 100644 → 0
View file @5c170cf
--- a/CoreNLP/input/annotation/bg_sentences_v3.txt 0 → 100644
View file @5596beb
+++ b/CoreNLP/input/annotation/bg_sentences_v3.txt 0 → 100644
View file @5596beb
--- a/CoreNLP/output/annotation/bg_sentences_v2.txt.ner deleted 100644 → 0
View file @5c170cf
+++ b/CoreNLP/output/annotation/bg_sentences_v2.txt.ner deleted 100644 → 0
View file @5c170cf
--- a/CoreNLP/output/annotation/bg_sentences_v3.txt.ner 0 → 100644
View file @5596beb
+++ b/CoreNLP/output/annotation/bg_sentences_v3.txt.ner 0 → 100644
View file @5596beb
--- a/extraction-geo/reports/bg_report_v2.txt → extraction-geo/reports/bg_report_v3.txt
View file @5596beb
+++ b/extraction-geo/reports/bg_report_v2.txt → extraction-geo/reports/bg_report_v3.txt
View file @5596beb
--- a/predict-annot/bin/mapping/bg_map_index.sh deleted 100644 → 0
View file @5c170cf
+++ b/predict-annot/bin/mapping/bg_map_index.sh deleted 100644 → 0
View file @5c170cf
- index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
- mapfiel="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt"
- grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"}  length($2) > 3 { print  $_}' > $mapfile
- 
- 
--- a/predict-annot/input/annot-input_bg_v1.txt deleted 100644 → 0
View file @5c170cf
+++ b/predict-annot/input/annot-input_bg_v1.txt deleted 100644 → 0
View file @5c170cf
--- a/predict-annot/input/annot-input_bg_v2.txt deleted 100644 → 0
View file @5c170cf
+++ b/predict-annot/input/annot-input_bg_v2.txt deleted 100644 → 0
View file @5c170cf
--- a/predict-annot/input/annot-input_bg_v3.txt 0 → 100644
View file @5596beb
+++ b/predict-annot/input/annot-input_bg_v3.txt 0 → 100644
View file @5596beb
--- a/predict-annot/mapping/bg_GSE_map_index.txt deleted 100644 → 0
View file @5c170cf
+++ b/predict-annot/mapping/bg_GSE_map_index.txt deleted 100644 → 0
View file @5c170cf
--- a/predict-annot/mapping/bg_GSM_map_index.txt deleted 100644 → 0
View file @5c170cf
+++ b/predict-annot/mapping/bg_GSM_map_index.txt deleted 100644 → 0
View file @5c170cf
--- a/predict-annot/mapping/bg_sentences_midx.txt 0 → 100644
View file @5596beb
+++ b/predict-annot/mapping/bg_sentences_midx.txt 0 → 100644
View file @5596beb
--- a/predict-annot/reports/annot-input_bg_report_v2.txt deleted 100644 → 0
View file @5c170cf
+++ b/predict-annot/reports/annot-input_bg_report_v2.txt deleted 100644 → 0
View file @5c170cf
- -------------------------------- PARAMETERS --------------------------------
- Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
- File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
- Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
- File to save recontrsucted bg-sentences: annot-input_bg_v2.txt
- -------------------------------- PROCESSING --------------------------------
- EXCLUDE: 
- EXCLUDE: 
- EXCLUDE: .|.|.
- EXCLUDE: \|\|SYM
- EXCLUDE: C1|c1|NN
- EXCLUDE: C2|c2|NN
- EXCLUDE: F1|f1|NN
- EXCLUDE: F2|f2|NN
- EXCLUDE: LB|lb|NN
- EXCLUDE: NA|NA|NNP
- EXCLUDE: NC|nc|NN
- EXCLUDE: V1|v1|NN
- EXCLUDE: wt|wt|JJ
- EXCLUDE: WT|WT|NNP
- Number of sentences: 13889
- ==================================END===================================
--- a/predict-annot/reports/annot-input_bg_report_v1.txt → predict-annot/reports/annot-input_bg_report_v3.txt
View file @5596beb
+++ b/predict-annot/reports/annot-input_bg_report_v1.txt → predict-annot/reports/annot-input_bg_report_v3.txt
View file @5596beb
 -------------------------------- PARAMETERS --------------------------------
 Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
- File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
+ File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner
 Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
- File to save recontrsucted bg-sentences: annot-input_bg_v1.txt
+ File to save recontrsucted bg-sentences: annot-input_bg_v3.txt
 -------------------------------- PROCESSING --------------------------------
- Number of sentences: 13903
+ Number of sentences: 14716
 ==================================END===================================