Estefani Gaytan Nunez

upload

{"key1": "keyinfo", "key2": "keyinfo2"}
\ No newline at end of file
# -*- coding: UTF-8 -*-
import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
import training_validation_v14 as training
import json
#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --outputPath Output path to place output files
# --outputFileI Output tagged file I
# --outputFileII Output tagged file II
# --modelPath Path to CRF model
# --modelName Model name
# --infoPath Path of GSE-GSM index file
# --infoFile GSE-GSM index file
# --variant Part of S2 variant
# --S1 Inner word features set
# --S2 Complete word features
# --S3 Extended context features
# --S4 Semantic features
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --outputFileI annot-input_bg_outputI.txt
# --outputFileII annot-input_bg_outputII.txt
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
# --infoFile bg_sentences_midx.txt
# --variant 13
#Examples
#predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/annot-input_bg_report_v4.txt > predict-annot/reports/output_tagging_report_v5.txt
__author__ = 'egaytan'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # ########################## Defining parameters ##########################
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
    parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
    parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
    parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
    parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
    parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
    parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
    (options, args) = parser.parse_args()

    # This tool takes no positional arguments. parser.error() prints the
    # message and exits with status 2, so no extra sys.exit() is needed.
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")

    # ############################ DISP PARAMETERS ############################
    print('-------------------------------- PARAMETERS --------------------------------')
    print("--inputPath Path of training data set : " + str(options.inputPath))
    print("--outputPath Output path to place output files: " + str(options.outputPath))
    print("--outputFileI Output tagged file I : " + str(options.outFileI))
    print("--outputFileII Output tagged file II : " + str(options.outFileII))
    # BUGFIX: label previously said "--outputFileII" for the third file.
    print("--outputFileIII Output tagged file III : " + str(options.outFileIII))
    print("--modelPath Path to read CRF model : " + str(options.modelPath))
    print("--modelName Model name : " + str(options.modelName))
    print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath))
    print("--infoFile GSE-GSM index file : " + str(options.idx))
    print("--variant Run variant : " + str(options.variant))
    print("--S1 Inner word features set : " + str(options.S1))
    print("--S2 Complete word features : " + str(options.S2))
    print("--S3 Extended context features : " + str(options.S3))
    print("--S4 Semantic features : " + str(options.S4))

    # ############################## PROCESSING ###############################
    print('-------------------------------- PROCESSING --------------------------------')

    # Load index mapping GSE file information: one index line per input
    # sentence, used to prefix each tagged output line.
    # BUGFIX: file handle was previously leaked; use a context manager.
    with open(os.path.join(options.infoPath, options.idx), "r") as idxFile:
        idx = idxFile.readlines()

    # ############################ Read CRF model #############################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    # ########################### Reading sentences ###########################
    print('Processing corpus...')
    t0 = time()
    # Entity labels the model may emit; every other tag is 'O' (outside).
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain',
              'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air',
              'Vess', 'pH']

    # ############################ Preprocessing ##############################
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputDataI = []
            # Preprocessing input sentences: each line is one sentence of
            # 'word|lemma|pos'-style tokens separated by spaces.
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                sentencesInputData = [line.strip('\n').split() for line in lines]
            # Build feature dicts for every sentence (delegated to the
            # training module so train/predict features stay in sync).
            X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant)
                       for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

            # ########################## Predicting tags ##########################
            t1 = time()
            print("Predicting tags with model...")
            y_pred = crf.predict(X_input)
            print("Prediction done in: %fs" % (time() - t1))

            # ###################### Tagging with CRF model #######################
            print("Tagging file...")
            # lidx indexes the GSE-GSM mapping; it must advance once per
            # sentence, including skipped ones, or the mapping desynchronizes.
            lidx = 0
            for line, tagLine in zip(lines, y_pred):
                # Unique entity tags present in this sentence's prediction.
                Ltags = set(labels).intersection(set(tagLine))
                # Skip untagged sentence.
                # BUGFIX: the old check compared against {'O'}, but 'O' is
                # never in the labels intersection, so untagged sentences were
                # never skipped; also advance lidx to keep the index aligned.
                if not Ltags:
                    lidx += 1
                    continue
                line = line.strip('\n')
                # Start an empty tagged sentence.
                outputLine = ''
                # Per-sentence word counter (parallel to tagLine).
                i = 0
                # Exception for one-word sentences.
                if len(tagLine) == 1:
                    if tagLine[0] in labels:
                        # Wrap the single word in <Tag> ... </Tag>.
                        start = '<' + tagLine[0] + '> '
                        end = '</' + tagLine[0] + '>'
                        word = line.split('|')[0] + ' '
                        outputLine = start + word + end
                    else:
                        outputLine = line.split(' ')[0]
                    # Saving Sentence Output I.
                    sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                    # Increase sentence counter and continue with the next one.
                    lidx += 1
                    continue
                # Tag behind (previous token's tag).
                tb = 'O'
                # Tagging sentences: emit '<Tag> ' when an entity starts and
                # '</Tag> ' when it ends, copying only the surface word
                # (first '|'-separated field) of each token.
                for word, tag in zip(line.split(' '), tagLine):
                    # General start tagging: a new entity begins here.
                    if tag in labels and tb != tag:
                        # Close the previous entity if it is still open.
                        if tb in labels and outputLine[-2:] != '> ':
                            outputLine += '</' + tb + '> '
                        # Start new tagging.
                        outputLine += '<' + tag + '> '
                        outputLine += word.split('|')[0] + ' '
                        # Close immediately for a single-word entity.
                        # BUGFIX: was a bare except; only IndexError (i+1 past
                        # the end of tagLine) is expected here.
                        try:
                            if tag != tagLine[i + 1]:
                                outputLine += '</' + tag + '> '
                        except IndexError:
                            if i + 1 == len(tagLine):
                                outputLine += '</' + tag + '> '
                        i += 1
                        tb = tag
                        continue
                    # General close tagging: entity continues from previous token.
                    elif tag in labels:
                        # End-of-sentence case: close the entity here.
                        if i + 1 == len(tagLine):
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            i += 1
                            tb = tag
                            continue
                        # Next token leaves the entity: close it here.
                        elif tag != tagLine[i + 1]:
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            i += 1
                            tb = tag
                            continue
                    # Fallthrough ('O' token, or mid-entity token): close a
                    # dangling previous entity, then copy the word as-is.
                    if tb != tag and tb in labels and outputLine[-2:] != '> ':
                        outputLine += '</' + tb + '> '
                    outputLine += word.split('|')[0] + ' '
                    i += 1
                    # Save the previous tag.
                    tb = tag
                # Saving Sentence Output I.
                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                lidx += 1

            # ########################## Save Output I ############################
            # Output I: one TSV line per sentence that contains at least one
            # closed tag, with Penn-Treebank bracket tokens restored.
            print("Saving Ouput I...")
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    if re.findall('</', line):
                        oline = line.replace('-LRB-', '(')
                        oline = oline.replace('-RRB-', ')')
                        oFileI.write(oline + '\n')

            # ########################## Save Output II ###########################
            # Output II: one TSV line per tagged span (text, tag), keeping the
            # sentence's index columns.
            print("Saving Ouput II...")
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
                for line in sentencesOutputDataI:
                    oline = line.replace('-LRB-', '(')
                    oline = oline.replace('-RRB-', ')')
                    for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                        lline = oline.split('\t')[0:-2] + [ttex, tag]
                        nline = '\t'.join(lline)
                        oFileII.write(nline + '\n')

            # ########################## Save Output III ##########################
            # Output III: the raw corpus re-emitted as 'word|tag' tokens.
            print("Saving Ouput III...")
            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
                for line, tagLine in zip(lines, y_pred):
                    # BUGFIX: '-LRB-' was replaced twice and '-RRB-' never
                    # replaced, so right brackets stayed escaped in Output III.
                    oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag
                             for w, tag in zip(line.split(' '), tagLine)]
                    oFileIII.write(' '.join(oline) + '\n')

            # ############################ Save Probs #############################
            # Per-token marginal probabilities, serialized as JSON.
            # NOTE(review): the filename is fixed, so processing several input
            # files overwrites previous probabilities — confirm this is intended.
            # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
            y_probs = crf.predict_marginals(X_input)
            with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
                json.dump(y_probs, fp)

    print("Passing corpus done in: %fs" % (time() - t0))
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v6
--outputFileII Output tagged file II : annot-input_bg_outputII_v6
--outputFileII Output tagged file III : annot-input_bg_outputIII_v6
--outputFileI Output tagged file I : annot-input_bg_outputI_v7
--outputFileII Output tagged file II : annot-input_bg_outputII_v7
--outputFileII Output tagged file III : annot-input_bg_outputIII_v7
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
......@@ -15,14 +15,14 @@
--S4 Semantic features : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009524s
Reading CRF model done in: 0.009408s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 27.458162s
Prediction done in: 27.324524s
Tagging file...
Saving Ouput I...
Saving Ouput II...
Saving Ouput III...
Passing corpus done in: 257.970281s
Passing corpus done in: 261.721646s
......