upload

Estefani Gaytan Nunez
Commit ee43c20633af0c13a6a8b3c527cbdf717166d35d ee43c206 1 parent 87ac8726
Showing 17 changed files with 40 additions and 47 deletions
CoreNLP/bin/annotation/get-raw-sentences_v4.sh
CoreNLP/input/annotation/bg_sentences_v4.txt
CoreNLP/output/annotation/bg_sentences_v4.txt.ner
predict-annot/bin/tagging/tagging.py
predict-annot/input/annot-input_bg_v3.txt
predict-annot/input/annot-input_bg_v4.txt
predict-annot/mapping/bg_sentences_midx_v3.txt
predict-annot/mapping/bg_sentences_midx_v4.txt
predict-annot/output/annot-input_bg_outputII.txt
predict-annot/output/annot-input_bg_outputIII_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
predict-annot/output/annot-input_bg_outputII_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
predict-annot/output/annot-input_bg_outputI_v4.txt_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
predict-annot/output/annot-input_bg_outputI_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
predict-annot/reports/annot-input_bg_report_v3.txt
predict-annot/reports/annot-input_bg_report_v4.txt
predict-annot/reports/output_tagging_report.txt
predict-annot/reports/output_tagging_report_v4.txt
--- a/CoreNLP/bin/annotation/get-raw-sentences_v4.sh
View file @ee43c20
+++ b/CoreNLP/bin/annotation/get-raw-sentences_v4.sh
View file @ee43c20
@@ -34,7 +34,7 @@ echo
 echo
 echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
 #cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"}  length($2) > 3 { print  $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g'  | sed 's/-/\t/' | sed 's/-/\t/' )
- cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sed 's/\\null\\/null/g' | sort | uniq)
+ cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g'  | sort | uniq)
 echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
 wc $output
 echo "$cext" | cut -f1-3,5 > $mapping
--- a/CoreNLP/input/annotation/bg_sentences_v4.txt
View file @ee43c20
+++ b/CoreNLP/input/annotation/bg_sentences_v4.txt
View file @ee43c20
--- a/CoreNLP/output/annotation/bg_sentences_v4.txt.ner
View file @ee43c20
+++ b/CoreNLP/output/annotation/bg_sentences_v4.txt.ner
View file @ee43c20
--- a/predict-annot/bin/tagging/tagging.py
View file @ee43c20
+++ b/predict-annot/bin/tagging/tagging.py
View file @ee43c20
--- a/predict-annot/input/annot-input_bg_v3.txt deleted 100644 → 0
View file @87ac872
+++ b/predict-annot/input/annot-input_bg_v3.txt deleted 100644 → 0
View file @87ac872
--- a/predict-annot/input/annot-input_bg_v4.txt
View file @ee43c20
+++ b/predict-annot/input/annot-input_bg_v4.txt
View file @ee43c20
--- a/predict-annot/mapping/bg_sentences_midx_v3.txt deleted 100644 → 0
View file @87ac872
+++ b/predict-annot/mapping/bg_sentences_midx_v3.txt deleted 100644 → 0
View file @87ac872
--- a/predict-annot/mapping/bg_sentences_midx_v4.txt
View file @ee43c20
+++ b/predict-annot/mapping/bg_sentences_midx_v4.txt
View file @ee43c20
@@ -9328,7 +9328,7 @@ GSE12006	GSM303526	GPL3154-PMID:18940002	characteristics_ch1.1
 GSE12006	GSM303526	GPL3154-PMID:18940002	growth_protocol_ch1.1
 GSE12006	GSM303526	GPL3154-PMID:18940002	growth_protocol_ch1.2
 GSE12006	GSM303527	GPL3154-PMID:18940002	growth_protocol_ch1.4
- GSE12006	GSM303527	GPL3154-PMID:18940002
+ GSE12006	GSM303527	GPL3154-PMID:18940002	extract_protocol_ch1.3
 GSE12006	GSM303527	GPL3154-PMID:18940002	title.1
 GSE12006	GSM303527	GPL3154-PMID:18940002	source_name_ch1.1
 GSE12006	GSM303527	GPL3154-PMID:18940002	organism_ch1.1
@@ -9340,7 +9340,7 @@ GSE12006	GSM303527	GPL3154-PMID:18940002	characteristics_ch1.1
 GSE12006	GSM303527	GPL3154-PMID:18940002	growth_protocol_ch1.1
 GSE12006	GSM303527	GPL3154-PMID:18940002	growth_protocol_ch1.2
 GSE12006	GSM303528	GPL3154-PMID:18940002	growth_protocol_ch1.4
- GSE12006	GSM303528	GPL3154-PMID:18940002
+ GSE12006	GSM303528	GPL3154-PMID:18940002	extract_protocol_ch1.3
 GSE12006	GSM303528	GPL3154-PMID:18940002	title.1
 GSE12006	GSM303528	GPL3154-PMID:18940002	source_name_ch1.1
 GSE12006	GSM303528	GPL3154-PMID:18940002	organism_ch1.1
@@ -9352,7 +9352,7 @@ GSE12006	GSM303528	GPL3154-PMID:18940002	characteristics_ch1.1
 GSE12006	GSM303528	GPL3154-PMID:18940002	growth_protocol_ch1.1
 GSE12006	GSM303528	GPL3154-PMID:18940002	growth_protocol_ch1.2
 GSE12006	GSM303529	GPL3154-PMID:18940002	growth_protocol_ch1.4
- GSE12006	GSM303529	GPL3154-PMID:18940002
+ GSE12006	GSM303529	GPL3154-PMID:18940002	extract_protocol_ch1.3
 GSE12006	GSM303529	GPL3154-PMID:18940002	title.1
 GSE12006	GSM303529	GPL3154-PMID:18940002	source_name_ch1.1
 GSE12006	GSM303529	GPL3154-PMID:18940002	organism_ch1.1
--- a/predict-annot/output/annot-input_bg_outputII.txt deleted 100644 → 0
View file @87ac872
+++ b/predict-annot/output/annot-input_bg_outputII.txt deleted 100644 → 0
View file @87ac872
--- a/predict-annot/output/annot-input_bg_outputIII_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv 0 → 100644
View file @ee43c20
+++ b/predict-annot/output/annot-input_bg_outputIII_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv 0 → 100644
View file @ee43c20
--- a/predict-annot/output/annot-input_bg_outputII_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
View file @ee43c20
+++ b/predict-annot/output/annot-input_bg_outputII_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
View file @ee43c20
--- a/predict-annot/output/annot-input_bg_outputI_v4.txt_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv deleted 100644 → 0
View file @87ac872
+++ b/predict-annot/output/annot-input_bg_outputI_v4.txt_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv deleted 100644 → 0
View file @87ac872
--- a/predict-annot/output/annot-input_bg_outputI_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv 0 → 100644
View file @ee43c20
+++ b/predict-annot/output/annot-input_bg_outputI_v4_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv 0 → 100644
View file @ee43c20
--- a/predict-annot/reports/annot-input_bg_report_v3.txt deleted 100644 → 0
View file @87ac872
+++ b/predict-annot/reports/annot-input_bg_report_v3.txt deleted 100644 → 0
View file @87ac872
- -------------------------------- PARAMETERS --------------------------------
- Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
- File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner
- Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
- File to save recontrsucted bg-sentences: annot-input_bg_v3.txt
- -------------------------------- PROCESSING --------------------------------
- Number of sentences: 14716
- ==================================END===================================
--- a/predict-annot/reports/annot-input_bg_report_v4.txt
View file @ee43c20
+++ b/predict-annot/reports/annot-input_bg_report_v4.txt
View file @ee43c20
 -------------------------------- PARAMETERS --------------------------------
- Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
- File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
- Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
- File to save recontrsucted bg-sentences: annot-input_bg_v4.txt
+ --inputPath          Path of training data set        : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+ --outputPath         Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+ --outputFileI        Output tagged file I             : annot-input_bg_outputI_v4
+ --outputFileII       Output tagged file II            : annot-input_bg_outputII_v4
+ --outputFileII       Output tagged file III           : annot-input_bg_outputIII_v4
+ --modelPath          Path to read CRF model           : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
+ --modelName          Model name                       : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+ --infoPath           Path of GSE-GSM index file       : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
+ --infoFile	        GSE-GSM index file               : bg_sentences_midx_v4.txt
+ --variant	        Run variant                      : 13
+ --S1                 General features                 : True
+ --S2                 Inner/Complete word features     : False
+ --S3                 Extended context features        : False
+ --S4                 Semantic features                : True
+ --filteringStopWords Filtering stop words             : False
+ --filterSymbols      Filtering punctuation marks      : False
 -------------------------------- PROCESSING --------------------------------
- Number of sentences: 90904
- ==================================END===================================
+ Reading CRF model...
+ Reading CRF model done in: 0.009463s
+ Processing corpus...
+ Preprocessing file...annot-input_bg_v4.txt
+ Sentences input data: 90688
+ Predicting tags with model...
+ Prediction done in: 26.367272s
+ Tagging file...
+ Saving Ouput I...
+ Saving Ouput II...
+ Saving Ouput III...
+ Processing corpus done in: 56.584394s
--- a/predict-annot/reports/output_tagging_report.txt deleted 100644 → 0
View file @87ac872
+++ b/predict-annot/reports/output_tagging_report.txt deleted 100644 → 0
View file @87ac872
- -------------------------------- PARAMETERS --------------------------------
- --inputPath          Path of training data set        : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
- --outputPath         Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
- --outputFileI        Output tagged file I             : annot-input_bg_outputI.txt
- --outputFileII       Output tagged file II            : annot-input_bg_outputII.txt
- --modelPath          Path to read CRF model           : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
- --modelName          Model name                       : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
- --infoPath           Path of GSE-GSM index file       : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
- --infoFile	        GSE-GSM index file               : bg_sentences_midx.txt
- --variant	        Run variant                      : 13
- --S1                 General features                 : True
- --S2                 Inner/Complete word features     : False
- --S3                 Extended context features        : False
- --S4                 Semantic features                : True
- --filteringStopWords Filtering stop words             : False
- --filterSymbols      Filtering punctuation marks      : False
- Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
- -------------------------------- PROCESSING --------------------------------
- Reading CRF model...
- Reading CRF model done in: 0.008336s
- Processing corpus...
- Preprocessing file...annot-input_bg_v3.txt
- Sentences input data: 14716
- Predicting tags with model
- Prediction done in: 1.688127s
- Tagging file
- Processing corpus done in: 3.948320s
--- a/predict-annot/reports/output_tagging_report_v4.txt
View file @ee43c20
+++ b/predict-annot/reports/output_tagging_report_v4.txt
View file @ee43c20
@@ -17,10 +17,16 @@
 Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
 -------------------------------- PROCESSING --------------------------------
 Reading CRF model...
- Reading CRF model done in: 0.009804s
+ Reading CRF model done in: 0.009363s
 Processing corpus...
 Preprocessing file...annot-input_bg_v3.txt
 Sentences input data: 14716
 Predicting tags with model
- Prediction done in: 1.811103s
+ Prediction done in: 1.737334s
 Tagging file
+ Preprocessing file...annot-input_bg_v4.txt
+ Sentences input data: 90688
+ Predicting tags with model
+ Prediction done in: 26.434549s
+ Tagging file
+ Processing corpus done in: 58.304885s