Estefani Gaytan Nunez

upload

#!/usr/bin/env bash
# Extract GEO "bagline" sentences from per-GSE .tsv outputs, then build:
#   - an index of all extracted files        -> $index
#   - the CoreNLP input sentence file        -> $output
#   - a GSE/GSM row-to-sentence mapping      -> $mapping
#   - a per-field bagline count report       -> $report
set -u

echo
echo
echo
echo "===================================Extraction============================================ "

# Base project directory; every input/output path hangs off it.
base="/home/egaytan/automatic-extraction-growth-conditions"

cd "$base/extraction-geo/outputs/" || { echo "cannot cd to extraction-geo/outputs" >&2; exit 1; }

echo "Access to output extracted baglines"
echo "directory: $(pwd)"

# All output-extraction files (absolute paths, one per line).
index="$base/extraction-geo/reports/all-output-index.txt"
# Input sentences to run CoreNLP on.
output="$base/CoreNLP/input/annotation/bg_sentences_v4.txt"
# GSE index by bg_sentence row.
mapping="$base/predict-annot/mapping/bg_sentences_midx_v4.txt"
# Number of fields by bagline.
report="$base/extraction-geo/reports/bg_report_v4.txt"

echo
echo
echo
echo "==============================Baglines index files======================================= "
# Index every extracted file with its absolute path, one GSE directory at a
# time.  The subshell keeps each cd local, so a failed cd cannot leave the
# loop listing the wrong directory (the original cd ..; pattern could).
for gse in */; do
  ( cd "$gse" && ls -d "$PWD"/* )
done > "$index"
echo "Number of extracted files"
wc -l "$index"

echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
# Flatten every indexed .tsv into tab-separated records:
#   - split on the 7th "/" so the path prefix becomes its own field,
#   - keep the GSE-dir and file fields, split "GSE-GSM-field" on "-",
#   - turn the ".tsv:" grep separator into a tab, strip double quotes,
#   - drop one known bad record ("1.<TAB>Neubauer"), then sort -u.
# NOTE: $(tr ... < "$index") is deliberately left unquoted so the file list
# word-splits into separate arguments for grep.
cext=$(grep -E ".*" $(tr '\n' ' ' < "$index") | sed 's/\//\t/7' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//' | sort | uniq)

# Column 4 holds the sentence text; append the end-of-sentence tag.
# ($0 replaces the original awk "$_", which only worked because an empty
# variable coerces to field 0.)
echo "$cext" | cut -f4 | awk '{ print $0 " PGCGROWTHCONDITIONS" }' > "$output"
wc "$output"
# Columns 1-3 and 5 map each sentence row back to its GSE/GSM/field.
echo "$cext" | cut -f1-3,5 > "$mapping"
wc "$mapping"

echo
echo
echo "Number of total baglines: $(wc -l "$output")"
echo
echo "Baglines report"

# Count unique baglines per field name: take column 2, strip quotes,
# normalize "_ch" suffixes, then tally with uniq -c.
while IFS= read -r gsef; do
  sort -u "$gsef"
done < "$index" | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > "$report"

cat "$report"
echo
echo
echo "Saving file: $output"
1 +
2 +
3 +==============================Run CoreNLP=======================================
4 +
5 +
6 +
7 +input file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt
8 +
9 +output directory: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
10 +
11 +regex file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt
12 +
13 +java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt -outputDirectory /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation -regexner.mapping /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt -outputExtension .ner
14 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
15 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
16 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
17 +[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [0.6 sec].
18 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
19 +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator regexner
20 +[main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns.
21 +
22 +Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_v4.txt.ner
23 +Untokenizable:  (U+F06D, decimal: 61549)
24 +Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... done [236.1 sec].
25 +
26 +Annotation pipeline timing information:
27 +TokenizerAnnotator: 2.2 sec.
28 +WordsToSentencesAnnotator: 4.3 sec.
29 +POSTaggerAnnotator: 214.7 sec.
30 +MorphaAnnotator: 5.6 sec.
31 +TokensRegexNERAnnotator: 9.3 sec.
32 +TOTAL: 236.1 sec. for 2373062 tokens at 10051.1 tokens/sec.
33 +Pipeline setup: 0.9 sec.
34 +Total time for StanfordCoreNLP pipeline: 239.0 sec.
...@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP================================= ...@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP=================================
4 echo 4 echo
5 echo 5 echo
6 6
7 -input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt" 7 +input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt"
8 output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation" 8 output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
9 regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt" 9 regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
10 10
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +3
2 +19269 characteristics
3 +17680 data_processing
4 +11024 extract_protocol
5 +6043 growth_protocol
6 +2344 library_strategy
7 +13733 organism
8 +9263 source_name
9 +6799 title
10 +4530 treatment_protocol
...@@ -26,7 +26,8 @@ from optparse import OptionParser ...@@ -26,7 +26,8 @@ from optparse import OptionParser
26 # --minSenLen 1 26 # --minSenLen 1
27 # --index 5 27 # --index 5
28 # 28 #
29 -#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 29 +#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 --index 5
30 +#egaytan@pakal:~/automatic-extraction-growth-conditions$ python predict-annot/bin/preprocessing/built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v4.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg_v4.txt --minWordLen 0 --minSenLen 0 --index 5 > predict-annot/reports/annot-input_bg_report_v4.txt
30 31
31 ########################################## 32 ##########################################
32 # MAIN PROGRAM # 33 # MAIN PROGRAM #
......
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +-------------------------------- PARAMETERS --------------------------------
2 +Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
3 +File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
4 +Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
5 +File to save reconstructed bg-sentences: annot-input_bg_v4.txt
6 +-------------------------------- PROCESSING --------------------------------
7 +Number of sentences: 90904
8 +==================================END===================================
1 +-------------------------------- PARAMETERS --------------------------------
2 +--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
4 +--outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
5 +--outputFileII Output tagged file II : annot-input_bg_outputII_v4.txt
6 +--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
7 +--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
8 +--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
9 +--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
10 +--variant Run variant : 13
11 +--S1 General features : True
12 +--S2 Inner/Complete word features : False
13 +--S3 Extended context features : False
14 +--S4 Semantic features : True
15 +--filteringStopWords Filtering stop words : False
16 +--filterSymbols Filtering punctuation marks : False
17 +Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
18 +-------------------------------- PROCESSING --------------------------------
19 +Reading CRF model...
20 +Reading CRF model done in: 0.009697s
21 +Processing corpus...
22 +Preprocessing file...annot-input_bg_v3.txt
23 +Sentences input data: 14716
24 +Predicting tags with model
25 +Prediction done in: 1.732606s
26 +Tagging file
27 +Preprocessing file...annot-input_bg_v4.txt
28 +Sentences input data: 90904
29 +Predicting tags with model
30 +Prediction done in: 26.221746s
31 +Tagging file
32 +Processing corpus done in: 58.477312s