Estefani Gaytan Nunez

upload

...@@ -34,7 +34,7 @@ echo ...@@ -34,7 +34,7 @@ echo
34 echo 34 echo
35 echo "Add sentence-end-tag PGCGROWTHCONDITIONS" 35 echo "Add sentence-end-tag PGCGROWTHCONDITIONS"
36 #cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' ) 36 #cext=$(grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f8,9 | sort | uniq | awk 'BEGIN {FS="\t"} length($2) > 3 { print $_}' | sed 's/\\null\\/null/g'| sed 's/.tsv//g' | sed 's/-/\t/' | sed 's/-/\t/' )
37 -cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sort | uniq) 37 +cext=$(grep -E ".*" $(cat $index | tr '\n' ' ') | sed 's/\//\t/7'| cut -f2-3 | sed 's/-/\t/' | sed 's/-/\t/' | sed 's/.tsv:/\t/' | sed 's/\"//g' | sed 's/1.\tNeubauer//'| sed 's/\\null\\/null/g' | sort | uniq)
38 echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output 38 echo "$cext" | cut -f4 | awk '{ print $_ " PGCGROWTHCONDITIONS" }' > $output
39 wc $output 39 wc $output
40 echo "$cext" | cut -f1-3,5 > $mapping 40 echo "$cext" | cut -f1-3,5 > $mapping
......
This diff could not be displayed because it is too large.
...@@ -61,6 +61,7 @@ import training_validation_v14 as training ...@@ -61,6 +61,7 @@ import training_validation_v14 as training
61 # --variant 13 61 # --variant 13
62 62
63 #python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt 63 #python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
64 +#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
64 65
65 __author__ = 'egaytan' 66 __author__ = 'egaytan'
66 67
...@@ -241,7 +242,7 @@ if __name__ == "__main__": ...@@ -241,7 +242,7 @@ if __name__ == "__main__":
241 #print( DF(sentencesOutputDataI) ) 242 #print( DF(sentencesOutputDataI) )
242 #print( '\n'.join(sentencesOutputDataII) ) 243 #print( '\n'.join(sentencesOutputDataII) )
243 # Save tags 244 # Save tags
244 - with open(os.path.join(options.outputPath, options.outFileII), "w") as oFile: 245 + with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.txt'), "w") as oFile:
245 for line in sentencesOutputDataII: 246 for line in sentencesOutputDataII:
246 #print(line) 247 #print(line)
247 oFile.write(line + '\n') 248 oFile.write(line + '\n')
......
1 -# -*- coding: UTF-8 -*-
2 -
3 -import os
4 -from optparse import OptionParser
5 -from time import time
6 -from collections import Counter
7 -
8 -import nltk
9 -import sklearn
10 -import scipy.stats
11 -import sys
12 -
13 -#from sklearn.externals import joblib
14 -import joblib
15 -from sklearn.metrics import make_scorer
16 -#from sklearn.cross_validation import cross_val_score
17 -from sklearn.model_selection import cross_val_score
18 -#from sklearn.grid_search import RandomizedSearchCV
19 -from sklearn.model_selection import RandomizedSearchCV
20 -
21 -import sklearn_crfsuite
22 -from sklearn_crfsuite import scorers
23 -from sklearn_crfsuite import metrics
24 -
25 -from nltk.corpus import stopwords
26 -
27 -#################################
This diff could not be displayed because it is too large.
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
2 --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ 2 --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ 3 --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
4 --outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt 4 --outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
5 ---outputFileII Output tagged file II : annot-input_bg_outputII_v4.txt 5 +--outputFileII Output tagged file II : annot-input_bg_outputII_v4
6 --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models 6 --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
7 --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 7 --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
8 --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping 8 --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
...@@ -17,16 +17,16 @@ ...@@ -17,16 +17,16 @@
17 Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False 17 Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
18 -------------------------------- PROCESSING -------------------------------- 18 -------------------------------- PROCESSING --------------------------------
19 Reading CRF model... 19 Reading CRF model...
20 -Reading CRF model done in: 0.009697s 20 +Reading CRF model done in: 0.009390s
21 Processing corpus... 21 Processing corpus...
22 Preprocessing file...annot-input_bg_v3.txt 22 Preprocessing file...annot-input_bg_v3.txt
23 Sentences input data: 14716 23 Sentences input data: 14716
24 Predicting tags with model 24 Predicting tags with model
25 -Prediction done in: 1.732606s 25 +Prediction done in: 1.692121s
26 Tagging file 26 Tagging file
27 Preprocessing file...annot-input_bg_v4.txt 27 Preprocessing file...annot-input_bg_v4.txt
28 Sentences input data: 90904 28 Sentences input data: 90904
29 Predicting tags with model 29 Predicting tags with model
30 -Prediction done in: 26.221746s 30 +Prediction done in: 25.701133s
31 Tagging file 31 Tagging file
32 -Processing corpus done in: 58.477312s 32 +Processing corpus done in: 57.242562s
......