Estefani Gaytan Nunez

upload

......@@ -34,7 +34,7 @@ echo
echo
echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
# Earlier version of the extraction pipeline, kept commented out for reference.
#cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
# Build a de-duplicated, tab-separated table from every line of the files
# listed (one path per line) in $index:
#   - grep -E ".*" emits every line; when more than one file is given, grep
#     prefixes each line with "<path>:", which the later 's/.tsv:/\t/' relies on.
#   - 's/\//\t/7' turns the 7th "/" of the line into a tab so that cut can
#     isolate the trailing path component(s) together with the line content.
#   - the two 's/-/\t/' calls split on the first two hyphens.
#   - double quotes are removed and the literal text \null\ is rewritten.
# NOTE(review): this first assignment is immediately overwritten by the next
# line; it appears to be the previous revision of the same statement — confirm
# whether it should still be executed.
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sed 's/\\null\\/null/g' | sort | uniq)
# Effective assignment. Compared with the line above, the "1.<tab>Neubauer"
# removal runs before cut, and \null\ is deleted instead of replaced by "null".
cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7' | sed 's/1.\tNeubauer//' | cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g'|sed 's/\\null\\//g' | sort | uniq)
# Column 4 of the table is written to $output, one entry per line, each
# followed by the end tag " PGCGROWTHCONDITIONS" ($_ is an unset awk variable,
# so $_ evaluates to $0, i.e. the whole input line).
echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
# Report line/word/byte counts of the generated file.
wc $output
# Columns 1-3 and 5 are written to $mapping, row-aligned with $output.
# NOTE(review): presumably these are the GSE, GSM, GPL-PMID and field-name
# identifiers of each sentence — confirm against the consumer of $mapping.
echo "$cext" | cut -f1-3,5 > $mapping
......
This diff could not be displayed because it is too large.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
......@@ -9328,7 +9328,7 @@ GSE12006 GSM303526 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303526 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303527 GPL3154-PMID:18940002
GSE12006 GSM303527 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303527 GPL3154-PMID:18940002 title.1
GSE12006 GSM303527 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 organism_ch1.1
......@@ -9340,7 +9340,7 @@ GSE12006 GSM303527 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303527 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303528 GPL3154-PMID:18940002
GSE12006 GSM303528 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303528 GPL3154-PMID:18940002 title.1
GSE12006 GSM303528 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 organism_ch1.1
......@@ -9352,7 +9352,7 @@ GSE12006 GSM303528 GPL3154-PMID:18940002 characteristics_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.1
GSE12006 GSM303528 GPL3154-PMID:18940002 growth_protocol_ch1.2
GSE12006 GSM303529 GPL3154-PMID:18940002 growth_protocol_ch1.4
GSE12006 GSM303529 GPL3154-PMID:18940002
GSE12006 GSM303529 GPL3154-PMID:18940002 extract_protocol_ch1.3
GSE12006 GSM303529 GPL3154-PMID:18940002 title.1
GSE12006 GSM303529 GPL3154-PMID:18940002 source_name_ch1.1
GSE12006 GSM303529 GPL3154-PMID:18940002 organism_ch1.1
......
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v3.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save recontrsucted bg-sentences: annot-input_bg_v3.txt
-------------------------------- PROCESSING --------------------------------
Number of sentences: 14716
==================================END===================================
-------------------------------- PARAMETERS --------------------------------
Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
File with CoreNLP-tagging bg-sentences: bg_sentences_v4.txt.ner
Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
File to save recontrsucted bg-sentences: annot-input_bg_v4.txt
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v4
--outputFileII Output tagged file II : annot-input_bg_outputII_v4
--outputFileII Output tagged file III : annot-input_bg_outputIII_v4
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
-------------------------------- PROCESSING --------------------------------
Number of sentences: 90904
==================================END===================================
Reading CRF model...
Reading CRF model done in: 0.009463s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 26.367272s
Tagging file...
Saving Ouput I...
Saving Ouput II...
Saving Ouput III...
Processing corpus done in: 56.584394s
......
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI.txt
--outputFileII Output tagged file II : annot-input_bg_outputII.txt
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.008336s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.688127s
Tagging file
Processing corpus done in: 3.948320s
......@@ -17,10 +17,16 @@
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009804s
Reading CRF model done in: 0.009363s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.811103s
Prediction done in: 1.737334s
Tagging file
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model
Prediction done in: 26.434549s
Tagging file
Processing corpus done in: 58.304885s
......