Estefani Gaytan Nunez

upload

echo
echo
echo
echo "===================================Extraction============================================ "
cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/
echo "Access to output extracted baglines"
echo "directory: "$(pwd);
#all output-extraction files
index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
#input sentences to run CoreNLP
output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt"
#GSE index by bg_sentence row
mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_v4.txt"
#Number of fields by bagline
report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v4.txt"
echo
echo
echo
echo "==============================Baglines index files======================================= "
# absolute file output path
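# (one absolute file path per line; assumes every entry under outputs/ is a GSE directory)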
for gse in $(ls -1)
do
cd $gse; ls -d $PWD/*; cd ..;
done > $index
echo "Number if extracted files"
wc -l $index
echo
echo
echo
echo "==============================Baglines extraction======================================="
echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
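# Build one tab-separated record per extracted line:
#   grep -E ".*" <files>    : dump every line of every indexed file as "path:line"
#   sed 's/\//\t/7'         : turn the 7th "/" into a tab, isolating the file name
#   cut -f2-3               : drop the directory prefix
#   sed 's/-/\t/' (twice)   : split the file name into GSE and GSM fields
#                             (assumes names of the form GSExxx-GSMxxx-bagline.tsv)
#   sed 's/.tsv:/\t/'       : separate the bagline name from the extracted text
#   sed 's/\"//g'           : strip quotation marks
#   sed 's/1.\tNeubauer//'  : drop a known noisy fragment
#   sort | uniq             : deduplicate records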
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sort | uniq)
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
wc $output
echo "$cext" | cut -f1-3,5 > $mapping
wc $mapping
echo
echo
echo "Number of total baglines: "$(wc -l $output );
echo
echo "Baglines report"
for gsef in $( cat $index)
do
cat $gsef | sort | uniq ;
done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report
cat $report
echo
echo
echo "Saving file: "$output;
==============================Run CoreNLP=======================================
input file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt
output directory: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
regex file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt
java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt -outputDirectory /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation -regexner.mapping /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt -outputExtension .ner
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos
[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [0.6 sec].
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma
[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator regexner
[main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns.
Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_v4.txt.ner
Untokenizable:  (U+F06D, decimal: 61549)
Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... done [236.1 sec].
Annotation pipeline timing information:
TokenizerAnnotator: 2.2 sec.
WordsToSentencesAnnotator: 4.3 sec.
POSTaggerAnnotator: 214.7 sec.
MorphaAnnotator: 5.6 sec.
TokensRegexNERAnnotator: 9.3 sec.
TOTAL: 236.1 sec. for 2373062 tokens at 10051.1 tokens/sec.
Pipeline setup: 0.9 sec.
Total time for StanfordCoreNLP pipeline: 239.0 sec.
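For reference, the -regexner.mapping file loaded above (inputEntities.txt, 9253 unique entries) uses the TokensRegexNERAnnotator tab-separated format: one entry per line with a token pattern, its NER class, and optionally the classes it may overwrite and a priority. The entries below are hypothetical illustrations only; the real patterns and classes come from the project's growth-condition tag set:

LB	MED
sodium chloride	SUPP
37 C	TEMP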
@@ -4,7 +4,7 @@ echo "==============================Run CoreNLP=================================
 echo
 echo
-input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt"
+input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt"
 output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation"
 regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt"
3
19269 characteristics
17680 data_processing
11024 extract_protocol
6043 growth_protocol
2344 library_strategy
13733 organism
9263 source_name
6799 title
4530 treatment_protocol
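As a consistency check, these per-field counts sum to 90,688 extracted baglines, in line with the 90,904 sentences reported after CoreNLP sentence splitting below.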
@@ -26,7 +26,8 @@ from optparse import OptionParser
 # --minSenLen 1
 # --index 5
 #
-#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
+#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 --index 5
+#egaytan@pakal:~/automatic-extraction-growth-conditions$ python predict-annot/bin/preprocessing/built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v4.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg_v4.txt --minWordLen 0 --minSenLen 0 --index 5 > predict-annot/reports/annot-input_bg_report_v4.txt
 ##########################################
 #              MAIN PROGRAM              #
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save reconstructed bg-sentences: annot-input_bg_v4.txt
-------------------------------- PROCESSING --------------------------------
Number of sentences: 90904
==================================END===================================
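For orientation, a minimal sketch of this reconstruction step, assuming CoreNLP's CoNLL output carries five tab-separated columns (index, token, lemma, POS, NER) with a blank line between sentences, and that each token is re-emitted as word|lemma|NER. This is an illustration, not the project's built_bg_sentences.py:

import argparse

def reconstruct(conll_path, out_path):
    """Rebuild one sentence per line from CoreNLP CoNLL output."""
    with open(conll_path) as fin, open(out_path, "w") as fout:
        tokens = []
        for line in fin:
            line = line.rstrip("\n")
            if not line.strip():              # blank line marks a sentence boundary
                if tokens:
                    fout.write(" ".join(tokens) + "\n")
                    tokens = []
                continue
            cols = line.split("\t")           # index, token, lemma, POS, NER (assumed layout)
            tokens.append("|".join((cols[1], cols[2], cols[4])))
        if tokens:                            # flush the last sentence
            fout.write(" ".join(tokens) + "\n")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputFile")
    parser.add_argument("--outputFile")
    args = parser.parse_args()
    reconstruct(args.inputFile, args.outputFile)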
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v4.txt
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009697s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.732606s
Tagging file
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90904
Predicting tags with model
Prediction done in: 26.221746s
Tagging file
Processing corpus done in: 58.477312s
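For reference, a minimal sketch of the prediction step above, assuming the model was trained with python-crfsuite and that each token is represented by a list of feature strings; the file extension and the features shown are assumptions (the real features depend on the S1-S4 switches logged above):

import pycrfsuite

# Load the trained CRF model (extension assumed; the log only gives the base name).
tagger = pycrfsuite.Tagger()
tagger.open("/home/egaytan/automatic-extraction-growth-conditions/CRF/models/"
            "model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod")

# One feature list per token: hypothetical features for the two-token sentence "37 C".
sentence_features = [
    ["word=37", "lemma=37"],
    ["word=C", "lemma=c"],
]
print(tagger.tag(sentence_features))  # predicted labels from the trained tag set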