Estefani Gaytan Nunez

upload

+{"key1": "keyinfo", "key2": "keyinfo2"}
\ No newline at end of file
@@ -25,6 +25,8 @@ from nltk.corpus import stopwords

import training_validation_v14 as training

+import json
+
#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
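Note: at its core the script is a standard sklearn-crfsuite load-and-predict loop. A minimal sketch, assuming a joblib-saved model and the token|lemma|POS input format used below (the file name and the single 'word' feature are placeholders, not the repo's real feature set):

    import joblib
    corpus = [['LB|lb|NN', 'broth|broth|NN']]   # token|lemma|POS triples, one list per sentence
    crf = joblib.load('model.mod')              # placeholder path; real models live under CRF/models
    X = [[{'word': tok.split('|')[0]} for tok in sent] for sent in corpus]
    y_pred = crf.predict(X)                     # one tag sequence per sentence, e.g. [['Med', 'Med']]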
@@ -61,10 +63,7 @@ import training_validation_v14 as training
# --variant 13

#Examples
-#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
-#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
-#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --outputFileII annot-input_bg_outputIII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/annot-input_bg_report_v4.txt
-
+#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/output_tagging_report_v5.txt
__author__ = 'egaytan'

##########################################
@@ -84,12 +83,10 @@ if __name__ == "__main__":
    parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
    parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
    parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
-    parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
-    parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
+    parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
+    parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
-    parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
-    parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
@@ -109,39 +106,29 @@ if __name__ == "__main__":
    print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath))
    print("--infoFile GSE-GSM index file : " + str(options.idx))
    print("--variant Run variant : " + str(options.variant))
-    print("--S1 General features : " + str(options.S1))
-    print("--S2 Inner/Complete word features : " + str(options.S2))
+    print("--S1 Inner word features set : " + str(options.S1))
+    print("--S2 Complete word features : " + str(options.S2))
    print("--S3 Extended context features : " + str(options.S3))
    print("--S4 Semantic features : " + str(options.S4))
-    print("--filteringStopWords Filtering stop words : " + str(options.filterStopWords))
-    print("--filterSymbols Filtering punctuation marks : " + str(options.filterSymbols))
-

-    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
-               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
-    #print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
    ########################################### PROCESSING ##########################################
    print('-------------------------------- PROCESSING --------------------------------')

-    stopwords = [word for word in stopwords.words('english')]
    # Read index mapping GSE file information
    idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()

-
    ########################################### Read CRF model ##########################################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

-
    ########################################### Reading sentences ##########################################
    print('Processing corpus...')
    t0 = time()
    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])

-
-    #####################################################################################
+    ########################################### Preprocessing ###########################################
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
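Note: training.sent2features comes from training_validation_v14, which is not part of this diff. sklearn-crfsuite only requires each sentence as a list of per-token feature dicts, so a hypothetical stand-in with the rough shape (the real --S1/--S2/--S3/--S4 feature names live in the training module) looks like:

    # hypothetical stand-in for training.sent2features
    def word2features(sent, i):
        token, lemma, pos = sent[i].split('|')[:3]
        return {
            'word': token,           # complete-word features (--S2 flavour)
            'lemma': lemma,
            'postag': pos,
            'prefix3': token[:3],    # inner-word features (--S1 flavour)
        }

    def sent2features_sketch(sent):
        return [word2features(sent, i) for i in range(len(sent))]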
@@ -149,74 +136,60 @@ if __name__ == "__main__":
        print("Preprocessing file..." + str(file))
        sentencesInputData = []
        sentencesOutputDataI = []
-        sentencesOutputDataII = []
+        # Preprocessing input sentences
        with open(os.path.join(options.inputPath, file), "r") as iFile:
-            lines = iFile.readlines()
-            for line in lines:
-                listLine = []
-                for token in line.strip('\n').split():
-                    if options.filterStopWords:
-                        listToken = token.split('|')
-                        lemma = listToken[1]
-                        if lemma in stopwords:
-                            continue
-                    if options.filterSymbols:
-                        listToken = token.split('|')
-                        lemma = listToken[1]
-                        if lemma in symbols:
-                            if lemma == ',':
-                                print("Comma , identified")
-                            continue
-                    listLine.append(token)
-                sentencesInputData.append(listLine)
+            # Keep the raw lines as well: iFile is exhausted here and closed after
+            # this block, so the tagging and Output III loops below reuse this list
+            lines = [line.strip('\n') for line in iFile]
+            sentencesInputData = [line.split() for line in lines]
+        # Save input sentences
        X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
        print("Sentences input data: " + str(len(sentencesInputData)))

-
        ########################################### Predicting tags ###########################################
        t1 = time()
        print("Predicting tags with model...")
        y_pred = crf.predict(X_input)
-        print("Prediction done in: %fs" % (time() - t1))

+        print("Prediction done in: %fs" % (time() - t1))

        ########################################### Tagging with CRF model ###########################################
        print("Tagging file...")
        lidx = 0
        for line, tagLine in zip(lines, y_pred):
+            # unique tags
            Ltags = set(labels).intersection(set(tagLine))
-            outputLine = ''
+            # Skip untagged sentences (the intersection is empty when no label was predicted)
+            if not Ltags:
+                lidx += 1
+                continue
            line = line.strip('\n')
-
+            # starting empty sentence
+            outputLine = ''
+            # tag behind
            tb = 'O'
+            # per sentence word count
            i = 0
-            ########################## one word sentences ##########################
-            if len(tagLine)==1:
+            # Exception for one word sentences
+            if len(tagLine) == 1:
                if tagLine[0] in labels:
+                    # add start tagging signature
                    start = '<' + tagLine[0] + '> '
-                    end = '</' + tagLine[0] + '/>'
+                    # add end tagging signature
+                    end = '</' + tagLine[0] + '>'
                    word = line.split('|')[0] + ' '
+                    # save output tagged sentence
                    outputLine = start + word + end
                else:
                    outputLine = line.split(' ')[0]
-                ########################## Saving Sentence Ouput I ##########################
+                # Saving Sentence Output I
                sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
-                ########################## Saving Sentence Ouput II ##########################
-                sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
+                # Increase sentence counter
                lidx += 1
+                # Continue with the next sentence
                continue
-
-            sentence = ''
-            sb = False
+            # Tagging sentences
            for word,tag in zip(line.split(' '), tagLine):
                # start tagging
                if tag in labels and tb != tag:
-                    # start tagging
                    outputLine += '<' + tag + '> '
-                    sb = True
-                    sentence = word.split('|')[0] + ' '
-                    tb = tag
                    outputLine += word.split('|')[0] + ' '
+                    tb = tag
                    i += 1
                    continue
                # end tagging
@@ -224,32 +197,24 @@ if __name__ == "__main__":
                if i+1==len(tagLine):
                    # end sentence
                    outputLine += word.split('|')[0] + ' '
-                    outputLine += '</' + tag + '/> '
+                    outputLine += '</' + tag + '> '
-                    ########################## Saving Sentence Ouput II ##########################
-                    sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' + tag)
-                    sb = False
                    tb = 'O'
                    i += 1
                    continue
                elif tag!=tagLine[i+1]:
                    # start new tag
                    outputLine += word.split('|')[0] + ' '
-                    outputLine += '</' + tag + '/> '
+                    outputLine += '</' + tag + '> '
-                    ########################## Saving Sentence Ouput II ##########################
-                    sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' + tag)
-                    sb = False
                    tb = 'O'
                    i += 1
                    continue
                # word tagged
                outputLine += word.split('|')[0] + ' '
                i += 1
-                if sb:
-                    sentence += word.split('|')[0] + ' '
-            ########################## Saving Sentence Ouput I ##########################
+            # Saving Sentence Output I
            sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
            lidx += 1
-
+        print("\n".join(sentencesOutputDataI[1:3]))
        ########################################### Save Output I ##########################################
        print("Saving Output I...")
        with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
@@ -259,19 +224,29 @@ if __name__ == "__main__":
                oline = line.replace('LDR','(')
                oline = oline.replace('RDR',')')
                oFileI.write(oline + '\n')
+
        ########################################### Save Output II ##########################################
        print("Saving Output II...")
        with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
-            for line in sentencesOutputDataII:
-                #print(line)
+            for line in sentencesOutputDataI:
                oline = line.replace('LDR','(')
                oline = oline.replace('RDR',')')
-                oFileII.write(oline + '\n')
+                # one TSV row per tagged span: (text, tag) extracted from the tagged sentence
+                for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
+                    lline = oline.split('\t')[0:-2] + [ttex, tag]
+                    nline = '\t'.join(lline)
+                    oFileII.write(nline + '\n')
+
        ########################################### Save Output III ##########################################
        print("Saving Output III...")
        with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
            for line, tagLine in zip(lines, y_pred):
                oline = [ w.split('|')[0].replace('LDR','(').replace('RDR',')')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]

                oFileIII.write(' '.join(oline) + '\n')
+
+        ########################################### Save Probs ##########################################
+        y_probs = crf.predict_marginals(X_input)
+        # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
+        with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
+            json.dump(y_probs, fp)
        print("Processing corpus done in: %fs" % (time() - t0))
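Note on the new Output II writer: the regular expression pulls every (text, tag) span out of a tagged sentence, and oline.split('\t')[0:-2] keeps the index columns in front, so each tagged span becomes one TSV row. A quick illustration with made-up index values:

    import re
    oline = 'GSE000\tGSM000\t<Med> LB broth </Med> grown to <OD> 0.6 </OD>\tMed, OD'
    re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline)
    # -> [(' LB broth ', 'Med'), (' 0.6 ', 'OD')]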
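Note on the new Save Probs block: crf.predict_marginals returns, for every token, a dict mapping each label to its marginal probability, which json.dump serializes as-is. Reading the file back is symmetric; a sketch, with the output directory hard-coded for illustration:

    import json, os
    outputPath = '/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/'  # value of --outputPath
    with open(os.path.join(outputPath, 'crf_probs.json')) as fp:
        y_probs = json.load(fp)
    # y_probs[s][t] is {label: marginal probability} for token t of sentence s
    best = [[max(tok, key=tok.get) for tok in sent] for sent in y_probs]   # argmax label per token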
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
---outputFileI Output tagged file I : annot-input_bg_outputI_v4
+--outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt
---outputFileII Output tagged file II : annot-input_bg_outputII_v4
+--outputFileII Output tagged file II : annot-input_bg_outputII_v5
---outputFileIII Output tagged file III : annot-input_bg_outputIII_v4
+--outputFileIII Output tagged file III : annot-input_bg_outputIII_v5
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
---variant Run variant : 13
+--variant Run variant : 10
---S1 General features : True
+--S1 Inner word features set : False
---S2 Inner/Complete word features : False
+--S2 Complete word features : True
--S3 Extended context features : False
---S4 Semantic features : True
+--S4 Semantic features : False
---filteringStopWords Filtering stop words : False
---filterSymbols Filtering punctuation marks : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
-Reading CRF model done in: 0.009463s
+Reading CRF model done in: 0.009485s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
-Prediction done in: 26.367272s
+Prediction done in: 27.326342s
Tagging file...
+
Saving Output I...
Saving Output II...
Saving Output III...
-Processing corpus done in: 56.584394s
+Processing corpus done in: 247.353067s