Estefani Gaytan Nunez

upload

echo
echo
echo
echo "===================================Extraction============================================ "
cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/
echo "Access to output extracted baglines"
echo "directory: "$(pwd);
#all output-extraction files
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
#input sentences to run CoreNLP
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt"
#GSE index by bg_sentence row
mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_v4.txt"
#Number of fields by bagline
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v4.txt"
echo
echo
echo
echo "==============================Baglines index files======================================= "
# absolute file output path
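# (one absolute file path per line; assumes every entry under outputs/ is a GSE directory)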
for gse in $(ls -1)
do
cd $gse; ls -d $PWD/*; cd ..;
done > $index
echo "Number if extracted files"
wc -l $index
echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
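# Build one tab-separated record per extracted line:
#   grep -E ".*" <files>    : dump every line of every indexed file as "path:line"
#   sed 's/\//\t/7'         : turn the 7th "/" into a tab, isolating the file name
#   cut -f2-3               : drop the directory prefix
#   sed 's/-/\t/' (twice)   : split the file name into GSE and GSM fields
#                             (assumes names of the form GSExxx-GSMxxx-bagline.tsv)
#   sed 's/.tsv:/\t/'       : separate the bagline name from the extracted text
#   sed 's/\"//g'           : strip quotation marks
#   sed 's/1.\tNeubauer//'  : drop a known noisy fragment
#   sort | uniq             : deduplicate records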
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sort | uniq)
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
wc $output
echo "$cext" | cut -f1-3,5 > $mapping
wc $mapping
echo
echo
echo "Number of total baglines: "$(wc -l $output );
echo
echo "Baglines report"
for gsef in $( cat $index)
do
cat $gsef | sort | uniq ;
done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report
cat $report
echo
echo
echo "Saving file: "$output;
==============================Run CoreNLP=======================================
input file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt
output directory: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
regex file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt
java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt -outputDirectory /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation -regexner.mapping /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt -outputExtension .ner
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator regexner
[main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns.
Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_v4.txt.ner
Untokenizable:  (U+F06D, decimal: 61549)
Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... done [236.1 sec].
Annotation pipeline timing information:
TokenizerAnnotator: 2.2 sec.
WordsToSentencesAnnotator: 4.3 sec.
POSTaggerAnnotator: 214.7 sec.
MorphaAnnotator: 5.6 sec.
TokensRegexNERAnnotator: 9.3 sec.
TOTAL: 236.1 sec. for 2373062 tokens at 10051.1 tokens/sec.
Pipeline setup: 0.9 sec.
Total time for StanfordCoreNLP pipeline: 239.0 sec.
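For reference, the -regexner.mapping file loaded above (inputEntities.txt, 9253 unique entries) uses the TokensRegexNERAnnotator tab-separated format: one entry per line with a token pattern, its NER class, and optionally the classes it may overwrite and a priority. The entries below are hypothetical illustrations only; the real patterns and classes come from the project's growth-condition tag set:

LB	MED
sodium chloride	SUPP
37 C	TEMP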
@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP=================================
 echo
 echo
-input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt"
+input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt"
 output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
 regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
3
19269 characteristics
17680 data_processing
11024 extract_protocol
6043 growth_protocol
2344 library_strategy
13733 organism
9263 source_name
6799 title
4530 treatment_protocol
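As a consistency check, these per-field counts sum to 90,688 extracted baglines, in line with the 90,904 sentences reported after CoreNLP sentence splitting below.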
@@ -26,7 +26,8 @@ from optparse import OptionParser
 # --minSenLen 1
 # --index 5
 #
-#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
+#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 --index 5
+#egaytan@pakal:~/automatic-extraction-growth-conditions$ python predict-annot/bin/preprocessing/built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v4.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg_v4.txt --minWordLen 0 --minSenLen 0 --index 5 > predict-annot/reports/annot-input_bg_report_v4.txt
 ##########################################
 #              MAIN PROGRAM              #
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save reconstructed bg-sentences: annot-input_bg_v4.txt
-------------------------------- PROCESSING --------------------------------
Number of sentences: 90904
==================================END===================================
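For orientation, a minimal sketch of this reconstruction step, assuming CoreNLP's CoNLL output carries five tab-separated columns (index, token, lemma, POS, NER) with a blank line between sentences, and that each token is re-emitted as word|lemma|NER. This is an illustration, not the project's built_bg_sentences.py:

import argparse

def reconstruct(conll_path, out_path):
    """Rebuild one sentence per line from CoreNLP CoNLL output."""
    with open(conll_path) as fin, open(out_path, "w") as fout:
        tokens = []
        for line in fin:
            line = line.rstrip("\n")
            if not line.strip():              # blank line marks a sentence boundary
                if tokens:
                    fout.write(" ".join(tokens) + "\n")
                    tokens = []
                continue
            cols = line.split("\t")           # index, token, lemma, POS, NER (assumed layout)
            tokens.append("|".join((cols[1], cols[2], cols[4])))
        if tokens:                            # flush the last sentence
            fout.write(" ".join(tokens) + "\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputFile")
    parser.add_argument("--outputFile")
    args = parser.parse_args()
    reconstruct(args.inputFile, args.outputFile)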
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v4.txt
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009697s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.732606s
Tagging file
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90904
Predicting tags with model
Prediction done in: 26.221746s
Tagging file
Processing corpus done in: 58.477312s
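For reference, a minimal sketch of the prediction step above, assuming the model was trained with python-crfsuite and that each token is represented by a list of feature strings; the file extension and the features shown are assumptions (the real features depend on the S1-S4 switches logged above):

import pycrfsuite

# Load the trained CRF model (extension assumed; the log only gives the base name).
tagger = pycrfsuite.Tagger()
tagger.open("/home/egaytan/automatic-extraction-growth-conditions/CRF/models/"
            "model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod")

# One feature list per token: hypothetical features for the two-token sentence "37 C".
sentence_features = [
    ["word=37", "lemma=37"],
    ["word=C", "lemma=c"],
]
print(tagger.tag(sentence_features))  # predicted labels from the trained tag set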