Estefani Gaytan Nunez

upload

#!/usr/bin/env bash
# Extract GEO "bagline" sentences from per-GSE .tsv outputs, then build:
#   - an index of all extracted files        -> $index
#   - the CoreNLP input sentence file        -> $output
#   - a GSE/GSM row-to-sentence mapping      -> $mapping
#   - a per-field bagline count report       -> $report
set -u

echo
echo
echo
echo "===================================Extraction============================================ "

# Base project directory; every input/output path hangs off it.
base="/home/egaytan/automatic-extraction-growth-conditions"

cd "$base/extraction-geo/outputs/" || { echo "cannot cd to extraction-geo/outputs" >&2; exit 1; }

echo "Access to output extracted baglines"
echo "directory: $(pwd)"

# All output-extraction files (absolute paths, one per line).
index="$base/extraction-geo/reports/all-output-index.txt"
# Input sentences to run CoreNLP on.
output="$base/CoreNLP/input/annotation/bg_sentences_v4.txt"
# GSE index by bg_sentence row.
mapping="$base/predict-annot/mapping/bg_sentences_midx_v4.txt"
# Number of fields by bagline.
report="$base/extraction-geo/reports/bg_report_v4.txt"

echo
echo
echo
echo "==============================Baglines index files======================================= "
# Index every extracted file with its absolute path, one GSE directory at a
# time.  The subshell keeps each cd local, so a failed cd cannot leave the
# loop listing the wrong directory (the original cd ..; pattern could).
for gse in */; do
  ( cd "$gse" && ls -d "$PWD"/* )
done > "$index"
echo "Number of extracted files"
wc -l "$index"

echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
# Flatten every indexed .tsv into tab-separated records:
#   - split on the 7th "/" so the path prefix becomes its own field,
#   - keep the GSE-dir and file fields, split "GSE-GSM-field" on "-",
#   - turn the ".tsv:" grep separator into a tab, strip double quotes,
#   - drop one known bad record ("1.<TAB>Neubauer"), then sort -u.
# NOTE: $(tr ... < "$index") is deliberately left unquoted so the file list
# word-splits into separate arguments for grep.
cext=$(grep -E ".*" $(tr '\n' ' ' < "$index") | sed 's/\//\t/7' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//' | sort | uniq)

# Column 4 holds the sentence text; append the end-of-sentence tag.
# ($0 replaces the original awk "$_", which only worked because an empty
# variable coerces to field 0.)
echo "$cext" | cut -f4 | awk '{ print $0 " PGCGROWTHCONDITIONS" }' > "$output"
wc "$output"
# Columns 1-3 and 5 map each sentence row back to its GSE/GSM/field.
echo "$cext" | cut -f1-3,5 > "$mapping"
wc "$mapping"

echo
echo
echo "Number of total baglines: $(wc -l "$output")"
echo
echo "Baglines report"

# Count unique baglines per field name: take column 2, strip quotes,
# normalize "_ch" suffixes, then tally with uniq -c.
while IFS= read -r gsef; do
  sort -u "$gsef"
done < "$index" | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > "$report"

cat "$report"
echo
echo
echo "Saving file: $output"
1 +
2 +
3 +==============================Run CoreNLP=======================================
4 +
5 +
6 +
7 +input file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt
8 +
9 +output directory: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
10 +
11 +regex file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt
12 +
13 +java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt -outputDirectory /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation -regexner.mapping /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt -outputExtension .ner
14 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
15 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
16 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
17 +[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [0.6 sec].
18 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
19 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator regexner
20 +[main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns.
21 +
22 +Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_v4.txt.ner
23 +Untokenizable:  (U+F06D, decimal: 61549)
24 +Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... done [236.1 sec].
25 +
26 +Annotation pipeline timing information:
27 +TokenizerAnnotator: 2.2 sec.
28 +WordsToSentencesAnnotator: 4.3 sec.
29 +POSTaggerAnnotator: 214.7 sec.
30 +MorphaAnnotator: 5.6 sec.
31 +TokensRegexNERAnnotator: 9.3 sec.
32 +TOTAL: 236.1 sec. for 2373062 tokens at 10051.1 tokens/sec.
33 +Pipeline setup: 0.9 sec.
34 +Total time for StanfordCoreNLP pipeline: 239.0 sec.
...@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP================================= ...@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP=================================
4 echo 4 echo
5 echo 5 echo
6 6
7 -input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt" 7 +input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt"
8 output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation" 8 output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
9 regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt" 9 regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
10 10
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +3
2 +19269 characteristics
3 +17680 data_processing
4 +11024 extract_protocol
5 +6043 growth_protocol
6 +2344 library_strategy
7 +13733 organism
8 +9263 source_name
9 +6799 title
10 +4530 treatment_protocol
...@@ -26,7 +26,8 @@ from optparse import OptionParser ...@@ -26,7 +26,8 @@ from optparse import OptionParser
26 # --minSenLen 1 26 # --minSenLen 1
27 # --index 5 27 # --index 5
28 # 28 #
29 -#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 29 +#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 --index 5
30 +#egaytan@pakal:~/automatic-extraction-growth-conditions$ python predict-annot/bin/preprocessing/built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v4.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg_v4.txt --minWordLen 0 --minSenLen 0 --index 5 > predict-annot/reports/annot-input_bg_report_v4.txt
30 31
31 ########################################## 32 ##########################################
32 # MAIN PROGRAM # 33 # MAIN PROGRAM #
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +-------------------------------- PARAMETERS --------------------------------
2 +Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
3 +File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
4 +Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
5 +File to save reconstructed bg-sentences: annot-input_bg_v4.txt
6 +-------------------------------- PROCESSING --------------------------------
7 +Number of sentences: 90904
8 +==================================END===================================
1 +-------------------------------- PARAMETERS --------------------------------
2 +--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
4 +--outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
5 +--outputFileII Output tagged file II : annot-input_bg_outputII_v4.txt
6 +--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
7 +--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
8 +--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
9 +--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
10 +--variant Run variant : 13
11 +--S1 General features : True
12 +--S2 Inner/Complete word features : False
13 +--S3 Extended context features : False
14 +--S4 Semantic features : True
15 +--filteringStopWords Filtering stop words : False
16 +--filterSymbols Filtering punctuation marks : False
17 +Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
18 +-------------------------------- PROCESSING --------------------------------
19 +Reading CRF model...
20 +Reading CRF model done in: 0.009697s
21 +Processing corpus...
22 +Preprocessing file...annot-input_bg_v3.txt
23 +Sentences input data: 14716
24 +Predicting tags with model
25 +Prediction done in: 1.732606s
26 +Tagging file
27 +Preprocessing file...annot-input_bg_v4.txt
28 +Sentences input data: 90904
29 +Predicting tags with model
30 +Prediction done in: 26.221746s
31 +Tagging file
32 +Processing corpus done in: 58.477312s