upload

Estefani Gaytan Nunez
Commit 5c170cf2d74d027f7e8135af220474231616e615 5c170cf2 1 parent e48c18e6
Showing 8 changed files with 55 additions and 9 deletions
predict-annot/bin/mapping/bg_map_index.sh
predict-annot/bin/preprocessing/built_bg_sentences.py
predict-annot/input/annot-input_bg.txt → predict-annot/input/annot-input_bg_v1.txt
predict-annot/input/annot-input_bg_v2.txt
predict-annot/mapping/bg_GSE_map_index.txt
predict-annot/mapping/bg_GSM_map_index.txt
predict-annot/reports/annot-input_bg_report_v1.txt
predict-annot/reports/annot-input_bg_report_v2.txt
--- a/predict-annot/bin/mapping/bg_map_index.sh 0 → 100644
View file @5c170cf
+++ b/predict-annot/bin/mapping/bg_map_index.sh 0 → 100644
View file @5c170cf
+# Build the bg map index file from every file listed in the index report.
+# grep prefixes each line with its file name when several files are given,
+# the path is then split on slashes into tab separated columns, columns 7 and 9
+# are kept, duplicates are removed and rows whose second column has
+# 3 characters or fewer are dropped.
+index="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/all-output-index.txt"
+mapfile="/home/egaytan/automatic-extraction-growth-conditions/extraction-geo/reports/bg_map_index.txt"
+# The command substitution stays unquoted on purpose so each listed path becomes its own grep argument.
+grep -E ".*" $(tr '\n' ' ' < "$index") | sed 's/"//g' | sed 's/.tsv:/.tsv\t/' | tr '/' '\t' | cut -f7,9 | sort | uniq | awk 'BEGIN {FS="\t"}  length($2) > 3 { print $0 }' > "$mapfile"
+
+
--- a/predict-annot/bin/preprocessing/built_bg_sentences.py
View file @5c170cf
+++ b/predict-annot/bin/preprocessing/built_bg_sentences.py
View file @5c170cf
@@ -9,18 +9,22 @@ from optparse import OptionParser
 # --inputPath=PATH    		Path of inputfile
 # --outputPath=PATH   		Path to place output files
 # --outputFile=File        	Output data set
+# --minWordLen                  Minimum word length
+# --minSenLen			Minimum sentence length
 #
 # Output
 # Tagged sentences reconstruction
 #
 # Examples
 # python built_bg_sentences.py
-# --inputPath               /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
+# --inputPath               /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
-# --inputFile               annot-input_bg.txt
+# --inputFile               bg_sentences_v2.txt.ner
-# --outputFile              input_bg_sentences.txt
+# --outputFile              annot-input_bg.txt
 # --outputPath              /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
+# --minWordLen		    2
+# --minSenLen               1
 #
-#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output --outputFile annot-input_bg.txt
+#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
 ##########################################
 #               MAIN PROGRAM             #
@@ -32,7 +36,10 @@ if __name__ == "__main__":
     parser.add_option("--inputPath",  dest="inputPath",  help="Path of output from CoreNLP",         metavar="PATH")
     parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files",   metavar="PATH")
     parser.add_option("--inputFile",  dest="inputFile",  help="File with CoreNLP-tagging sentences", metavar="FILE")
-    parser.add_option("--outputFile", dest="outputFile", help="File with training data set",         metavar="FILE")    
+    parser.add_option("--outputFile", dest="outputFile", help="File with training data set",         metavar="FILE")
+    parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int", default=0)
+    parser.add_option("--minSenLen",  dest="sL", help="Minimum sentence length", type="int", default=0)
+    
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -56,11 +63,15 @@ if __name__ == "__main__":
             if len(line.split('\t')) > 1:
                 w = line.split('\t')[1]
                 if w == "PGCGROWTHCONDITIONS":
-                    #End of sentence
+                    if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:                         
-                    lista.append(sentence)
+                         print( "EXCLUDE: " + sentence.lstrip() )
+                    else:
+                         #End of sentence
+                         lista.append(sentence.lstrip())
+                         #Count the accepted sentence
+                         n = n+1                         
                     #New sentence
-                    sentence = ''
+                    sentence = ''                   
-                    n=n+1                    
                 else:
                     #Building and save tagging sentence
                     sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
--- a/predict-annot/input/annot-input_bg.txt → predict-annot/input/annot-input_bg_v1.txt
View file @5c170cf
+++ b/predict-annot/input/annot-input_bg.txt → predict-annot/input/annot-input_bg_v1.txt
View file @5c170cf
--- a/predict-annot/input/annot-input_bg_v2.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/input/annot-input_bg_v2.txt 0 → 100644
View file @5c170cf
--- a/predict-annot/mapping/bg_GSE_map_index.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/mapping/bg_GSE_map_index.txt 0 → 100644
View file @5c170cf
--- a/predict-annot/mapping/bg_GSM_map_index.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/mapping/bg_GSM_map_index.txt 0 → 100644
View file @5c170cf
--- a/predict-annot/reports/annot-input_bg_report_v1.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/reports/annot-input_bg_report_v1.txt 0 → 100644
View file @5c170cf
+-------------------------------- PARAMETERS --------------------------------
+Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
+File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
+Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
+File to save recontrsucted bg-sentences: annot-input_bg_v1.txt
+-------------------------------- PROCESSING --------------------------------
+Number of sentences: 13903
+==================================END===================================
--- a/predict-annot/reports/annot-input_bg_report_v2.txt 0 → 100644
View file @5c170cf
+++ b/predict-annot/reports/annot-input_bg_report_v2.txt 0 → 100644
View file @5c170cf
+-------------------------------- PARAMETERS --------------------------------
+Path of CoreNLP output: /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
+File with CoreNLP-tagging bg-sentences: bg_sentences_v2.txt.ner
+Path to save data set: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
+File to save recontrsucted bg-sentences: annot-input_bg_v2.txt
+-------------------------------- PROCESSING --------------------------------
+EXCLUDE: 
+EXCLUDE: 
+EXCLUDE: .|.|.
+EXCLUDE: \|\|SYM
+EXCLUDE: C1|c1|NN
+EXCLUDE: C2|c2|NN
+EXCLUDE: F1|f1|NN
+EXCLUDE: F2|f2|NN
+EXCLUDE: LB|lb|NN
+EXCLUDE: NA|NA|NNP
+EXCLUDE: NC|nc|NN
+EXCLUDE: V1|v1|NN
+EXCLUDE: wt|wt|JJ
+EXCLUDE: WT|WT|NNP
+Number of sentences: 13889
+==================================END===================================