upload

Estefani Gaytan Nunez
Commit 5c170cf2d74d027f7e8135af220474231616e615 5c170cf2 1 parent e48c18e6
Showing 8 changed files with 55 additions and 9 deletions
predict-annot/bin/mapping/bg_map_index.sh
predict-annot/bin/preprocessing/built_bg_sentences.py
predict-annot/input/annot-input_bg.txt → predict-annot/input/annot-input_bg_v1.txt
predict-annot/input/annot-input_bg_v2.txt
predict-annot/mapping/bg_GSE_map_index.txt
predict-annot/mapping/bg_GSM_map_index.txt
predict-annot/reports/annot-input_bg_report_v1.txt
predict-annot/reports/annot-input_bg_report_v2.txt
--- a/predict-annot/bin/mapping/bg_map_index.sh 0 → 100644
View file @5c170cf
+++ b/predict-annot/bin/mapping/bg_map_index.sh 0 → 100644
View file @5c170cf
+ # Build the bg map index from the files listed in $index (one path per line).
+ # Pipeline: print every line of each listed file prefixed with its file name,
+ # strip double quotes, turn the ".tsv:" separator added by grep into a tab,
+ # split the path on "/" into tab-separated fields, keep fields 7 and 9,
+ # de-duplicate, and keep only rows whose second field is longer than 3 chars.
+ index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
+ # Was misspelled "mapfiel", which left $mapfile empty and broke the redirect below.
+ mapfile="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt"
+ grep -E ".*" $(cat $index | tr '\n' ' ')| sed 's/"//g'| sed 's/.tsv:/.tsv\t/' | tr '/' '\t'| cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"}  length($2) > 3 { print  $_}' > "$mapfile"
+ 
+ 
--- a/predict-annot/bin/preprocessing/built_bg_sentences.py
View file @5c170cf
+++ b/predict-annot/bin/preprocessing/built_bg_sentences.py
View file @5c170cf
@@ -9,18 +9,22 @@ from optparse import OptionParser
 # --inputPath=PATH    		Path of inputfile
 # --outputPath=PATH   		Path to place output files
 # --outputFile=File        	Output data set
+ # --minWordLen                  Minimum word length
+ # --minSenLen			Minimum sentence length
 #
 # Output
 # Tagged sentences reconstruction
 #
 # Examples
 # python built_bg_sentences.py
- # --inputPath               /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
- # --inputFile               annot-input_bg.txt
- # --outputFile              input_bg_sentences.txt
+ # --inputPath               /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
+ # --inputFile               bg_sentences_v2.txt.ner
+ # --outputFile              annot-input_bg.txt
 # --outputPath              /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
+ # --minWordLen		    2
+ # --minSenLen               1
 #
- #python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output --outputFile annot-input_bg.txt
+ #python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
 
 ##########################################
 #               MAIN PROGRAM             #
@@ -32,7 +36,10 @@ if __name__ == "__main__":
     parser.add_option("--inputPath",  dest="inputPath",  help="Path of output from CoreNLP",         metavar="PATH")
     parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files",   metavar="PATH")
     parser.add_option("--inputFile",  dest="inputFile",  help="File with CoreNLP-tagging sentences", metavar="FILE")
-     parser.add_option("--outputFile", dest="outputFile", help="File with training data set",         metavar="FILE")    
+     parser.add_option("--outputFile", dest="outputFile", help="File with training data set",         metavar="FILE")
+     parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
+     parser.add_option("--minSenLen",  dest="sL", help="Minimum sentence length", type="int")
+     
 
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -56,11 +63,15 @@ if __name__ == "__main__":
             if len(line.split('\t')) > 1:
                 w = line.split('\t')[1]
                 if w == "PGCGROWTHCONDITIONS":
-                     #End of sentence
-                     lista.append(sentence)
+                     if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:                         
+                          print( "EXCLUDE: " + sentence.lstrip() )
+                     else:
+                          #End of sentence
+                          lista.append(sentence.lstrip())
+                          #New sentence
+                          n = n+1                         
                     #New sentence
-                     sentence = ''
-                     n=n+1                    
+                     sentence = ''                   
                 else:
                     #Building and save tagging sentence
                     sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
--- a/predict-annot/input/annot-input_bg.txt → predict-annot/input/annot-input_bg_v1.txt
View file @5c170cf
+++ b/predict-annot/input/annot-input_bg.txt → predict-annot/input/annot-input_bg_v1.txt
View file @5c170cf
--- a/predict-annot/input/annot-input_bg_v2.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/input/annot-input_bg_v2.txt 0 → 100644
View file @5c170cf
--- a/predict-annot/mapping/bg_GSE_map_index.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/mapping/bg_GSE_map_index.txt 0 → 100644
View file @5c170cf
--- a/predict-annot/mapping/bg_GSM_map_index.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/mapping/bg_GSM_map_index.txt 0 → 100644
View file @5c170cf
--- a/predict-annot/reports/annot-input_bg_report_v1.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/reports/annot-input_bg_report_v1.txt 0 → 100644
View file @5c170cf
+ -------------------------------- PARAMETERS --------------------------------
+ Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
+ File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
+ Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
+ File to save recontrsucted bg-sentences: annot-input_bg_v1.txt
+ -------------------------------- PROCESSING --------------------------------
+ Number of sentences: 13903
+ ==================================END===================================
--- a/predict-annot/reports/annot-input_bg_report_v2.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/reports/annot-input_bg_report_v2.txt 0 → 100644
View file @5c170cf
+ -------------------------------- PARAMETERS --------------------------------
+ Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
+ File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
+ Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
+ File to save recontrsucted bg-sentences: annot-input_bg_v2.txt
+ -------------------------------- PROCESSING --------------------------------
+ EXCLUDE: 
+ EXCLUDE: 
+ EXCLUDE: .|.|.
+ EXCLUDE: \|\|SYM
+ EXCLUDE: C1|c1|NN
+ EXCLUDE: C2|c2|NN
+ EXCLUDE: F1|f1|NN
+ EXCLUDE: F2|f2|NN
+ EXCLUDE: LB|lb|NN
+ EXCLUDE: NA|NA|NNP
+ EXCLUDE: NC|nc|NN
+ EXCLUDE: V1|v1|NN
+ EXCLUDE: wt|wt|JJ
+ EXCLUDE: WT|WT|NNP
+ Number of sentences: 13889
+ ==================================END===================================