Showing 13 changed files with 144 additions and 2 deletions
File moved
1 | +echo | ||
2 | +echo | ||
3 | +echo | ||
4 | +echo "===================================Extraction============================================ " | ||
5 | + | ||
6 | +cd /home/egaytan/automatic-extraction-growth-conditions/extraction-geo/outputs/ | ||
7 | + | ||
8 | +echo "Access to output extracted baglines" | ||
9 | +echo "directory: "$(pwd); | ||
10 | +#all output-extraction files | ||
11 | +index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt" | ||
12 | +#input sentences to run CoreNLP | ||
13 | +output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt" | ||
14 | +#GSE index by bg_sentence row | ||
15 | +mapping="/home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping/bg_sentences_midx_v4.txt" | ||
16 | +#Number of fields by bagline | ||
17 | +report="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_report_v4.txt" | ||
18 | +echo | ||
19 | +echo | ||
20 | +echo | ||
21 | +echo "==============================Baglines index files======================================= " | ||
22 | +# absolute file output path | ||
23 | +for gse in $(ls -1) | ||
24 | +do | ||
25 | + cd $gse; ls -d $PWD/*; cd ..; | ||
26 | +done > $index | ||
27 | +echo "Number if extracted files" | ||
28 | +wc -l $index | ||
29 | +echo | ||
30 | +echo | ||
31 | +echo | ||
32 | +echo "==============================Baglines extraction=======================================" | ||
33 | +echo | ||
34 | +echo | ||
35 | +echo "Add sentence-end-tag PGCGROWTHCONDITIONS" | ||
36 | +#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' ) | ||
37 | +cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sort | uniq) | ||
38 | +echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output | ||
39 | +wc $output | ||
40 | +echo "$cext" | cut -f1-3,5 > $mapping | ||
41 | +wc $mapping | ||
42 | +echo | ||
43 | +echo | ||
44 | +echo "Number of total baglines: "$(wc -l $output ); | ||
45 | +echo | ||
46 | +echo "Baglines report" | ||
47 | + | ||
48 | + | ||
49 | +for gsef in $( cat $index) | ||
50 | +do | ||
51 | + cat $gsef | sort | uniq ; | ||
52 | +done | cut -f2 | cut -f2 -d'"' | sed 's/_ch/./g' | cut -f1 -d'.' | sort | uniq -c | awk '{print $1"\t"$2}' > $report | ||
53 | + | ||
54 | +cat $report | ||
55 | +echo | ||
56 | +echo | ||
57 | +echo "Saving file: "$output; |
CoreNLP/bin/annotation/nohup.out
0 → 100644
1 | + | ||
2 | + | ||
3 | +==============================Run CoreNLP======================================= | ||
4 | + | ||
5 | + | ||
6 | + | ||
7 | +input file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt | ||
8 | + | ||
9 | +output directory: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation | ||
10 | + | ||
11 | +regex file: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt | ||
12 | + | ||
13 | +java -mx5g -cp "/usr/local/stanford-corenlp-full-2018-10-05/*" edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,regexner -outputFormat conll -file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt -outputDirectory /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation -regexner.mapping /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt -outputExtension .ner | ||
14 | +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator tokenize | ||
15 | +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator ssplit | ||
16 | +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator pos | ||
17 | +[main] INFO edu.stanford.nlp.tagger.maxent.MaxentTagger - Loading POS tagger from edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger ... done [0.6 sec]. | ||
18 | +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator lemma | ||
19 | +[main] INFO edu.stanford.nlp.pipeline.StanfordCoreNLP - Adding annotator regexner | ||
20 | +[main] INFO edu.stanford.nlp.pipeline.TokensRegexNERAnnotator - regexner: Read 9253 unique entries out of 13838 from /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt, 0 TokensRegex patterns. | ||
21 | + | ||
22 | +Processing file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... writing to /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation/bg_sentences_v4.txt.ner | ||
23 | +Untokenizable: (U+F06D, decimal: 61549) | ||
24 | +Annotating file /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt ... done [236.1 sec]. | ||
25 | + | ||
26 | +Annotation pipeline timing information: | ||
27 | +TokenizerAnnotator: 2.2 sec. | ||
28 | +WordsToSentencesAnnotator: 4.3 sec. | ||
29 | +POSTaggerAnnotator: 214.7 sec. | ||
30 | +MorphaAnnotator: 5.6 sec. | ||
31 | +TokensRegexNERAnnotator: 9.3 sec. | ||
32 | +TOTAL: 236.1 sec. for 2373062 tokens at 10051.1 tokens/sec. | ||
33 | +Pipeline setup: 0.9 sec. | ||
34 | +Total time for StanfordCoreNLP pipeline: 239.0 sec. |
... | @@ -4,7 +4,7 @@ echo "==============================Run CoreNLP================================= | ... | @@ -4,7 +4,7 @@ echo "==============================Run CoreNLP================================= |
4 | echo | 4 | echo |
5 | echo | 5 | echo |
6 | 6 | ||
7 | -input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v3.txt" | 7 | +input="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/annotation/bg_sentences_v4.txt" |
8 | output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation" | 8 | output="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation" |
9 | regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt" | 9 | regexfile="/home/egaytan/automatic-extraction-growth-conditions/CoreNLP/input/NER/inputEntities.txt" |
10 | 10 | ... | ... |
CoreNLP/input/annotation/bg_sentences_v4.txt
0 → 100644
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
extraction-geo/reports/bg_report_v4.txt
0 → 100644
... | @@ -26,7 +26,8 @@ from optparse import OptionParser | ... | @@ -26,7 +26,8 @@ from optparse import OptionParser |
26 | # --minSenLen 1 | 26 | # --minSenLen 1 |
27 | # --index 5 | 27 | # --index 5 |
28 | # | 28 | # |
29 | -#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 | 29 | +#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 --index 5 |
30 | +#egaytan@pakal:~/automatic-extraction-growth-conditions$ python predict-annot/bin/preprocessing/built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v4.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg_v4.txt --minWordLen 0 --minSenLen 0 --index 5 > predict-annot/reports/annot-input_bg_report_v4.txt | ||
30 | 31 | ||
31 | ########################################## | 32 | ########################################## |
32 | # MAIN PROGRAM # | 33 | # MAIN PROGRAM # | ... | ... |
predict-annot/input/annot-input_bg_v4.txt
0 → 100644
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 | +-------------------------------- PARAMETERS -------------------------------- | ||
2 | +Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation | ||
3 | +File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner | ||
4 | +Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input | ||
5 | +File to save recontrsucted bg-sentences: annot-input_bg_v4.txt | ||
6 | +-------------------------------- PROCESSING -------------------------------- | ||
7 | +Number of sentences: 90904 | ||
8 | +==================================END=================================== |
1 | +-------------------------------- PARAMETERS -------------------------------- | ||
2 | +--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | ||
3 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | ||
4 | +--outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt | ||
5 | +--outputFileII Output tagged file II : annot-input_bg_outputII_v4.txt | ||
6 | +--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models | ||
7 | +--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | ||
8 | +--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping | ||
9 | +--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt | ||
10 | +--variant Run variant : 13 | ||
11 | +--S1 General features : True | ||
12 | +--S2 Inner/Complete word features : False | ||
13 | +--S3 Extended context features : False | ||
14 | +--S4 Semantic features : True | ||
15 | +--filteringStopWords Filtering stop words : False | ||
16 | +--filterSymbols Filtering punctuation marks : False | ||
17 | +Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False | ||
18 | +-------------------------------- PROCESSING -------------------------------- | ||
19 | +Reading CRF model... | ||
20 | +Reading CRF model done in: 0.009697s | ||
21 | +Processing corpus... | ||
22 | +Preprocessing file...annot-input_bg_v3.txt | ||
23 | +Sentences input data: 14716 | ||
24 | +Predicting tags with model | ||
25 | +Prediction done in: 1.732606s | ||
26 | +Tagging file | ||
27 | +Preprocessing file...annot-input_bg_v4.txt | ||
28 | +Sentences input data: 90904 | ||
29 | +Predicting tags with model | ||
30 | +Prediction done in: 26.221746s | ||
31 | +Tagging file | ||
32 | +Processing corpus done in: 58.477312s |
-
Please register or login to post a comment