Estefani Gaytan Nunez

upload

{"key1": "keyinfo", "key2": "keyinfo2"}
\ No newline at end of file
@@ -25,6 +25,8 @@ from nltk.corpus import stopwords
import training_validation_v14 as training
import json
#-------------------------------------------------------------------------------
# Objective
# Tag the transformed input file with a CRF model, using sklearn-crfsuite.
@@ -61,10 +63,7 @@ import training_validation_v14 as training
# --variant 13
# Examples
#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --outputFileIII annot-input_bg_outputIII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/annot-input_bg_report_v4.txt
#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/output_tagging_report_v5.txt
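# A minimal sketch of the required arguments (placeholder paths and names, not from an actual run):
#python3 predict-annot/bin/tagging/tagging.py --inputPath <input-dir> --outputPath <output-dir> --outputFileI outI.txt --outputFileII outII --outputFileIII outIII --modelPath <models-dir> --modelName <model-name> --infoPath <mapping-dir> --infoFile <GSE-GSM-index>.txt --variant 13 --S1 --S4 > tagging_report.txt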
__author__ = 'egaytan'
##########################################
@@ -84,12 +83,10 @@ if __name__ == "__main__":
parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)
(options, args) = parser.parse_args()
if len(args) > 0:
@@ -109,39 +106,29 @@ if __name__ == "__main__":
print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
print("--infoFile GSE-GSM index file : " + str(options.idx ))
print("--variant Run variant : " + str(options.variant ))
print("--S1 General features : " + str(options.S1 ))
print("--S2 Inner/Complete word features : " + str(options.S2 ))
print("--S1 Inner word features set : " + str(options.S1 ))
print("--S2 Complete word features : " + str(options.S2 ))
print("--S3 Extended context features : " + str(options.S3 ))
print("--S4 Semantic features : " + str(options.S4 ))
print("--filteringStopWords Filtering stop words : " + str(options.filterStopWords ))
print("--filterSymbols Filtering punctuation marks : " + str(options.filterSymbols ))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
#print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
########################################### PROCESSING ##########################################
print('-------------------------------- PROCESSING --------------------------------')
# Read index mapping GSE file information
idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
########################################### Read CRF model ##########################################
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
########################################### Reading sentences ##########################################
print('Processing corpus...')
t0 = time()
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
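# For reference, Output I wraps every predicted mention in XML-like tags built from
# these labels; an illustrative (made-up) tagged sentence would look like:
#   <Med> LB medium </Med> supplemented with glucose at <Temp> 37 C </Temp>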
#####################################################################################
########################################### Preprocessing ###########################################
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
@@ -149,74 +136,60 @@ if __name__ == "__main__":
print("Preprocessing file..." + str(file))
sentencesInputData = []
sentencesOutputDataI = []
# Preprocessing input sentences
with open(os.path.join(options.inputPath, file), "r") as iFile:
lines = iFile.readlines()
sentencesInputData = [ line.strip('\n').split() for line in lines ]
# Save input sentences
X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
print("Sentences input data: " + str(len(sentencesInputData)))
########################################### Predicting tags ###########################################
t1 = time()
print("Predicting tags with model...")
y_pred = crf.predict(X_input)
print("Prediction done in: %fs" % (time() - t1))
print("Prediction done in: %fs" % (time() - t1))
########################################### Tagging with CRF model ###########################################
print("Tagging file...")
lidx = 0
for line, tagLine in zip(lines, y_pred):
# unique tags
Ltags = set(labels).intersection(set(tagLine))
# Skip untagged sentence
if Ltags == {'O'}: continue
line = line.strip('\n')
# starting empty sentence
outputLine = ''
# tag behind
tb = 'O'
# per sentence word count
i = 0
# Exception for one word sentences
if len(tagLine) == 1:
if tagLine[0] in labels:
# add start tagging signature
start = '<' + tagLine[0] + '> '
# add end tagging signature
end = '</' + tagLine[0] + '>'
word = line.split('|')[0] + ' '
# save output tagged sentence
outputLine = start + word + end
else:
outputLine = line.split(' ')[0]
# Saving Sentence Output I
sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
# Increase sentence counter
lidx += 1
# Continue with the next sentence
continue
# Tagging sentences
for word, tag in zip(line.split(' '), tagLine):
# start tagging
if tag in labels and tb != tag:
outputLine += '<' + tag + '> '
outputLine += word.split('|')[0] + ' '
tb = tag
i += 1
continue
# end tagging
@@ -224,32 +197,24 @@ if __name__ == "__main__":
if i+1 == len(tagLine):
# end sentence
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '> '
tb = 'O'
i += 1
continue
elif tag != tagLine[i+1]:
# end tag
outputLine += word.split('|')[0] + ' '
outputLine += '</' + tag + '> '
tb = 'O'
i += 1
continue
# word tagged
outputLine += word.split('|')[0] + ' '
i += 1
# Saving Sentence Output I
sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
lidx += 1
print("\n".join(sentencesOutputDataI[1:3]))
########################################### Save Output I ##########################################
print("Saving Ouput I...")
with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
@@ -259,19 +224,29 @@ if __name__ == "__main__":
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
oFileI.write(oline + '\n')
########################################### Save Output II ##########################################
print("Saving Ouput II...")
with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
for line in sentencesOutputDataI:
oline = line.replace('LDR','(')
oline = oline.replace('RDR',')')
for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
lline = oline.split('\t')[0:-2] + [ttex, tag]
nline = '\t'.join(lline)
oFileII.write(nline + '\n')
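# Illustrative (made-up) example of what the regex above extracts from a tagged line:
#   oline = 'GSE123\tGSM456\t<Med> LB medium </Med> grown to <OD> OD600 of 0.3 </OD>\tMed, OD'
#   re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline)
#   -> [(' LB medium ', 'Med'), (' OD600 of 0.3 ', 'OD')]
# so Output II gets one row per extracted mention: index fields + mention text + label.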
########################################### Save Output III ##########################################
print("Saving Ouput III...")
with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
for line, tagLine in zip(lines, y_pred):
oline = [ w.split('|')[0].replace('LDR','(').replace('RDR',')') + '|' + tag for w, tag in zip(line.split(' '), tagLine)]
oFileIII.write(' '.join(oline) + '\n')
########################################### Save Probs ##########################################
y_probs = crf.predict_marginals(X_input)
# from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
json.dump(y_probs, fp)
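# Reading the marginals back (sketch): predict_marginals returns one dict of
# label -> probability per token, so y_probs is a list (sentences) of lists (tokens)
# of dicts and round-trips through JSON:
#   with open(os.path.join(options.outputPath, 'crf_probs.json')) as fp:
#       y_probs = json.load(fp)
#   y_probs[0][0]  # e.g. {'O': 0.98, 'Med': 0.01, ...} (illustrative values)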
print("Processing corpus done in: %fs" % (time() - t0))
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v5
--outputFileIII Output tagged file III : annot-input_bg_outputIII_v5
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 10
--S1 Inner word features set : False
--S2 Complete word features : True
--S3 Extended context features : False
--S4 Semantic features : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009485s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 27.326342s
Tagging file...
Saving Output I...
Saving Output II...
Saving Output III...
Processing corpus done in: 247.353067s