Author: Estefani Gaytan Nunez

upload

# -*- coding: UTF-8 -*-
import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
import training_validation_v14 as training
import json
#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --outputPath Output path to place output files
# --outputFileI Output tagged file I
# --outputFileII Output tagged file II
# --modelPath Path to CRF model
# --modelName Model name
# --infoPath Path of GSE-GSM index file
# --infoFile GSE-GSM index file
# --variant Part of S2 variant
# --S1 Inner word features set
# --S2 Complete word features
# --S3 Extended context features
# --S4 Semantic features
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --outputFileI annot-input_bg_outputI.txt
# --outputFileII annot-input_bg_outputII.txt
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
# --infoFile bg_sentences_midx.txt
# --variant 13
#Examples
#predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/annot-input_bg_report_v4.txt > predict-annot/reports/output_tagging_report_v5.txt
__author__ = 'egaytan'

##########################################
#              MAIN PROGRAM              #
##########################################
# Tag a transformed corpus (one sentence per line of space-separated
# word|...|feature tokens) with a pre-trained sklearn-crfsuite CRF model.
# Produces three tagged output files plus a JSON file of per-token
# marginal probabilities.

if __name__ == "__main__":
    # =========================== Defining parameters ===========================
    parser = OptionParser()
    parser.add_option("--inputPath",     dest="inputPath",  help="Path of training data set",        metavar="PATH")
    parser.add_option("--outputPath",    dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI",   dest="outFileI",   help="Output tagged file I",             metavar="FILE")
    parser.add_option("--outputFileII",  dest="outFileII",  help="Output tagged file II",            metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III",           metavar="FILE")
    parser.add_option("--modelPath",     dest="modelPath",  help="Path to read CRF model",           metavar="PATH")
    parser.add_option("--modelName",     dest="modelName",  help="Model name",                       metavar="TEXT")
    parser.add_option("--infoPath",      dest="infoPath",   help="Path of GSE-GSM index file",       metavar="PATH")
    parser.add_option("--infoFile",      dest="idx",        help="GSE-GSM index file",               metavar="FILE")
    parser.add_option("--variant",       dest="variant",    help="Run variant",                      metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features",       action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features",    action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features",         action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits the process itself,
        # so the unreachable sys.exit(1) that followed has been removed.
        parser.error("Any parameter given.")

    # ============================= DISP PARAMETERS =============================
    print('-------------------------------- PARAMETERS --------------------------------')
    print("--inputPath Path of training data set : " + str(options.inputPath))
    print("--outputPath Output path to place output files: " + str(options.outputPath))
    print("--outputFileI Output tagged file I : " + str(options.outFileI))
    print("--outputFileII Output tagged file II : " + str(options.outFileII))
    # BUGFIX: label previously said "--outputFileII" for file III.
    print("--outputFileIII Output tagged file III : " + str(options.outFileIII))
    print("--modelPath Path to read CRF model : " + str(options.modelPath))
    print("--modelName Model name : " + str(options.modelName))
    print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath))
    print("--infoFile GSE-GSM index file : " + str(options.idx))
    print("--variant Run variant : " + str(options.variant))
    print("--S1 Inner word features set : " + str(options.S1))
    print("--S2 Complete word features : " + str(options.S2))
    print("--S3 Extended context features : " + str(options.S3))
    print("--S4 Semantic features : " + str(options.S4))

    # =============================== PROCESSING ================================
    print('-------------------------------- PROCESSING --------------------------------')

    # Read index mapping each sentence to its GSE-GSM identifiers.
    # (Previously opened without closing; now a context manager.)
    with open(os.path.join(options.infoPath, options.idx), "r") as idxFile:
        idx = idxFile.readlines()

    # ----------------------------- Read CRF model -----------------------------
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    # ---------------------------- Reading sentences ---------------------------
    print('Processing corpus...')
    t0 = time()
    # Entity tag inventory recognised by the model; 'O' (outside) is implicit.
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain',
              'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air',
              'Vess', 'pH']

    # ------------------------------ Preprocessing -----------------------------
    # Walk the input directory and tag every file found.
    for path, dirs, files in os.walk(options.inputPath):
        for fname in files:
            print("Preprocessing file..." + str(fname))
            sentencesOutputDataI = []
            # Each input line is one sentence of space-separated tokens.
            with open(os.path.join(options.inputPath, fname), "r") as iFile:
                lines = iFile.readlines()
            sentencesInputData = [line.strip('\n').split() for line in lines]
            # Build the feature dictionaries the CRF expects.
            X_input = [training.sent2features(s, options.S1, options.S2,
                                              options.S3, options.S4,
                                              options.variant)
                       for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

            # --------------------------- Predicting tags ----------------------
            t1 = time()
            print("Predicting tags with model...")
            y_pred = crf.predict(X_input)
            print("Prediction done in: %fs" % (time() - t1))

            # ----------------------- Tagging with CRF model -------------------
            print("Tagging file...")
            lidx = 0  # sentence counter; indexes into the GSE-GSM map `idx`
            for line, tagLine in zip(lines, y_pred):
                # Unique entity tags predicted for this sentence.
                Ltags = set(labels).intersection(set(tagLine))
                # NOTE(review): 'O' is never in `labels`, so `Ltags` can never
                # equal {'O'} and no sentence is skipped here. Preserved
                # verbatim (changing it would also desynchronise lidx/idx).
                if Ltags == {'O'}: continue
                line = line.strip('\n')
                outputLine = ''   # tagged sentence under construction
                tb = 'O'          # tag of the previous token ("tag behind")
                i = 0             # per-sentence word counter
                # Special case: one-word sentences.
                if len(tagLine) == 1:
                    if tagLine[0] in labels:
                        # Wrap the single word in <tag> ... </tag>.
                        start = '<' + tagLine[0] + '> '
                        end = '</' + tagLine[0] + '>'
                        word = line.split('|')[0] + ' '
                        outputLine = start + word + end
                    else:
                        outputLine = line.split(' ')[0]
                    # Saving sentence for Output I.
                    sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                    lidx += 1
                    # Continue with the next sentence.
                    continue
                # General case: walk words and predicted tags in lockstep.
                for word, tag in zip(line.split(' '), tagLine):
                    # Opening a new tag span.
                    if tag in labels and tb != tag:
                        # NOTE(review): this emits an opening '<tb>' where a
                        # closing '</tb>' looks intended; preserved verbatim
                        # pending confirmation against expected output.
                        if tb in labels and outputLine[-2:] != '> ':
                            outputLine += '<' + tb + '> '
                        outputLine += '<' + tag + '> '
                        outputLine += word.split('|')[0] + ' '
                        tb = tag
                        i += 1
                        continue
                    # Inside a tag span: close it at the sentence end or when
                    # the next word carries a different tag.
                    elif tag in labels:
                        if i + 1 == len(tagLine):
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                        elif tag != tagLine[i + 1]:
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                    # Untagged word (or continuing span fell through above).
                    # NOTE(review): comparing the last 2 chars to the longer
                    # string '<tb> ' is always True; preserved verbatim.
                    if tb != tag and tb in labels and outputLine[-2:] != '<' + tb + '> ':
                        outputLine += '</' + tb + '> '
                    outputLine += word.split('|')[0] + ' '
                    i += 1
                # Saving sentence for Output I.
                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                lidx += 1

            # ----------------------------- Save Output I ----------------------
            # Output I: one TSV line per sentence containing at least one tag.
            print("Saving Output I...")
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    if re.findall('</', line):
                        oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                        oFileI.write(oline + '\n')

            # ----------------------------- Save Output II ---------------------
            # Output II: one TSV line per extracted (text, tag) pair.
            print("Saving Output II...")
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
                for line in sentencesOutputDataI:
                    oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                    for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                        lline = oline.split('\t')[0:-2] + [ttex, tag]
                        oFileII.write('\t'.join(lline) + '\n')

            # ---------------------------- Save Output III ---------------------
            # Output III: original tokens re-emitted as word|tag.
            print("Saving Output III...")
            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
                for line, tagLine in zip(lines, y_pred):
                    # BUGFIX: the second replace previously targeted '-LRB-'
                    # again, so '-RRB-' tokens were never converted to ')'.
                    oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag
                             for w, tag in zip(line.split(' '), tagLine)]
                    oFileIII.write(' '.join(oline) + '\n')

            # ------------------------------ Save Probs ------------------------
            # Per-token marginal probabilities, dumped as JSON.
            y_probs = crf.predict_marginals(X_input)
            with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
                json.dump(y_probs, fp)

    print("Passing corpus done in: %fs" % (time() - t0))
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v5
--outputFileII Output tagged file III : annot-input_bg_outputIII_v5
--outputFileI Output tagged file I : annot-input_bg_outputI_v6
--outputFileII Output tagged file II : annot-input_bg_outputII_v6
--outputFileII Output tagged file III : annot-input_bg_outputIII_v6
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
... (diff hunk marker — intervening parameter lines omitted) ...
--S4 Semantic features : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009225s
Reading CRF model done in: 0.009524s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 27.733279s
Prediction done in: 27.458162s
Tagging file...
GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 library_strategy.1 <Technique> ChIP-Seq </Technique> Technique
GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 growth_protocol_ch1.1 Cultures of Caulobacter -LRB- TLS1631-TLS1633 -RRB- were grown at 30oC in PYE and supplemented with antibiotics , as necessary , at appropriate concentrations . To deplete wild-type non-tagged ParB , exponential-phase cells were washed off xylose and re-introduced to PYE +0.2 % glucose for an additional <Supp> 5 hours </Supp> . After 4 hours , vanillate was added to induce the expression of flag-parB -LRB- WT -RRB- or flag-parB -LRB- G101S/R104A -RRB- for an hour . Cultures of Escherichia coli -LRB- TLS1637-TLS1650 -RRB- were grown at 30oC in LB and supplemented with antibiotics , as necessary , at appropriate concentrations . IPTG -LRB- 0.5 mM -RRB- was added to induce the production of T18-ParB -LRB- WT -RRB- or T18-ParB -LRB- G101S -RRB- . After an hour , formadehyde -LRB- 1 % final concentration -RRB- were added to fix cells for ChIP-seq . Supp
Saving Ouput I...
Saving Ouput II...
Saving Ouput III...
Pssing corpus done in: 258.328259s
Passing corpus done in: 257.970281s
......
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v4
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009363s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.737334s
Tagging file
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model
Prediction done in: 26.434549s
Tagging file
Processing corpus done in: 58.304885s