Author: Estefani Gaytan Nunez

upload

# -*- coding: UTF-8 -*-
import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
import training_validation_v14 as training
import json
#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --outputPath Output path to place output files
# --outputFileI Output tagged file I
# --outputFileII Output tagged file II
# --modelPath Path to CRF model
# --modelName Model name
# --infoPath Path of GSE-GSM index file
# --infoFile GSE-GSM index file
# --variant Part of S2 variant
# --S1 Inner word features set
# --S2 Complete word features
# --S3 Extended context features
# --S4 Semantic features
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --outputFileI annot-input_bg_outputI.txt
# --outputFileII annot-input_bg_outputII.txt
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
# --infoFile bg_sentences_midx.txt
# --variant 13
#Examples
#predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/annot-input_bg_report_v4.txt > predict-annot/reports/output_tagging_report_v5.txt
__author__ = 'egaytan'

##########################################
#              MAIN PROGRAM              #
##########################################
# Tag a transformed corpus (one sentence per line of space-separated
# word|...|feature tokens) with a pre-trained sklearn-crfsuite CRF model.
# Produces three tagged output files plus a JSON file of per-token
# marginal probabilities.

if __name__ == "__main__":
    # =========================== Defining parameters ===========================
    parser = OptionParser()
    parser.add_option("--inputPath",     dest="inputPath",  help="Path of training data set",        metavar="PATH")
    parser.add_option("--outputPath",    dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI",   dest="outFileI",   help="Output tagged file I",             metavar="FILE")
    parser.add_option("--outputFileII",  dest="outFileII",  help="Output tagged file II",            metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III",           metavar="FILE")
    parser.add_option("--modelPath",     dest="modelPath",  help="Path to read CRF model",           metavar="PATH")
    parser.add_option("--modelName",     dest="modelName",  help="Model name",                       metavar="TEXT")
    parser.add_option("--infoPath",      dest="infoPath",   help="Path of GSE-GSM index file",       metavar="PATH")
    parser.add_option("--infoFile",      dest="idx",        help="GSE-GSM index file",               metavar="FILE")
    parser.add_option("--variant",       dest="variant",    help="Run variant",                      metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features",       action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features",    action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features",         action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits the process itself,
        # so the unreachable sys.exit(1) that followed has been removed.
        parser.error("Any parameter given.")

    # ============================= DISP PARAMETERS =============================
    print('-------------------------------- PARAMETERS --------------------------------')
    print("--inputPath Path of training data set : " + str(options.inputPath))
    print("--outputPath Output path to place output files: " + str(options.outputPath))
    print("--outputFileI Output tagged file I : " + str(options.outFileI))
    print("--outputFileII Output tagged file II : " + str(options.outFileII))
    # BUGFIX: label previously said "--outputFileII" for file III.
    print("--outputFileIII Output tagged file III : " + str(options.outFileIII))
    print("--modelPath Path to read CRF model : " + str(options.modelPath))
    print("--modelName Model name : " + str(options.modelName))
    print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath))
    print("--infoFile GSE-GSM index file : " + str(options.idx))
    print("--variant Run variant : " + str(options.variant))
    print("--S1 Inner word features set : " + str(options.S1))
    print("--S2 Complete word features : " + str(options.S2))
    print("--S3 Extended context features : " + str(options.S3))
    print("--S4 Semantic features : " + str(options.S4))

    # =============================== PROCESSING ================================
    print('-------------------------------- PROCESSING --------------------------------')

    # Read index mapping each sentence to its GSE-GSM identifiers.
    # (Previously opened without closing; now a context manager.)
    with open(os.path.join(options.infoPath, options.idx), "r") as idxFile:
        idx = idxFile.readlines()

    # ----------------------------- Read CRF model -----------------------------
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    # ---------------------------- Reading sentences ---------------------------
    print('Processing corpus...')
    t0 = time()
    # Entity tag inventory recognised by the model; 'O' (outside) is implicit.
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain',
              'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air',
              'Vess', 'pH']

    # ------------------------------ Preprocessing -----------------------------
    # Walk the input directory and tag every file found.
    for path, dirs, files in os.walk(options.inputPath):
        for fname in files:
            print("Preprocessing file..." + str(fname))
            sentencesOutputDataI = []
            # Each input line is one sentence of space-separated tokens.
            with open(os.path.join(options.inputPath, fname), "r") as iFile:
                lines = iFile.readlines()
            sentencesInputData = [line.strip('\n').split() for line in lines]
            # Build the feature dictionaries the CRF expects.
            X_input = [training.sent2features(s, options.S1, options.S2,
                                              options.S3, options.S4,
                                              options.variant)
                       for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

            # --------------------------- Predicting tags ----------------------
            t1 = time()
            print("Predicting tags with model...")
            y_pred = crf.predict(X_input)
            print("Prediction done in: %fs" % (time() - t1))

            # ----------------------- Tagging with CRF model -------------------
            print("Tagging file...")
            lidx = 0  # sentence counter; indexes into the GSE-GSM map `idx`
            for line, tagLine in zip(lines, y_pred):
                # Unique entity tags predicted for this sentence.
                Ltags = set(labels).intersection(set(tagLine))
                # NOTE(review): 'O' is never in `labels`, so `Ltags` can never
                # equal {'O'} and no sentence is skipped here. Preserved
                # verbatim (changing it would also desynchronise lidx/idx).
                if Ltags == {'O'}: continue
                line = line.strip('\n')
                outputLine = ''   # tagged sentence under construction
                tb = 'O'          # tag of the previous token ("tag behind")
                i = 0             # per-sentence word counter
                # Special case: one-word sentences.
                if len(tagLine) == 1:
                    if tagLine[0] in labels:
                        # Wrap the single word in <tag> ... </tag>.
                        start = '<' + tagLine[0] + '> '
                        end = '</' + tagLine[0] + '>'
                        word = line.split('|')[0] + ' '
                        outputLine = start + word + end
                    else:
                        outputLine = line.split(' ')[0]
                    # Saving sentence for Output I.
                    sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                    lidx += 1
                    # Continue with the next sentence.
                    continue
                # General case: walk words and predicted tags in lockstep.
                for word, tag in zip(line.split(' '), tagLine):
                    # Opening a new tag span.
                    if tag in labels and tb != tag:
                        # NOTE(review): this emits an opening '<tb>' where a
                        # closing '</tb>' looks intended; preserved verbatim
                        # pending confirmation against expected output.
                        if tb in labels and outputLine[-2:] != '> ':
                            outputLine += '<' + tb + '> '
                        outputLine += '<' + tag + '> '
                        outputLine += word.split('|')[0] + ' '
                        tb = tag
                        i += 1
                        continue
                    # Inside a tag span: close it at the sentence end or when
                    # the next word carries a different tag.
                    elif tag in labels:
                        if i + 1 == len(tagLine):
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                        elif tag != tagLine[i + 1]:
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                    # Untagged word (or continuing span fell through above).
                    # NOTE(review): comparing the last 2 chars to the longer
                    # string '<tb> ' is always True; preserved verbatim.
                    if tb != tag and tb in labels and outputLine[-2:] != '<' + tb + '> ':
                        outputLine += '</' + tb + '> '
                    outputLine += word.split('|')[0] + ' '
                    i += 1
                # Saving sentence for Output I.
                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                lidx += 1

            # ----------------------------- Save Output I ----------------------
            # Output I: one TSV line per sentence containing at least one tag.
            print("Saving Output I...")
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    if re.findall('</', line):
                        oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                        oFileI.write(oline + '\n')

            # ----------------------------- Save Output II ---------------------
            # Output II: one TSV line per extracted (text, tag) pair.
            print("Saving Output II...")
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
                for line in sentencesOutputDataI:
                    oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                    for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                        lline = oline.split('\t')[0:-2] + [ttex, tag]
                        oFileII.write('\t'.join(lline) + '\n')

            # ---------------------------- Save Output III ---------------------
            # Output III: original tokens re-emitted as word|tag.
            print("Saving Output III...")
            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
                for line, tagLine in zip(lines, y_pred):
                    # BUGFIX: the second replace previously targeted '-LRB-'
                    # again, so '-RRB-' tokens were never converted to ')'.
                    oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag
                             for w, tag in zip(line.split(' '), tagLine)]
                    oFileIII.write(' '.join(oline) + '\n')

            # ------------------------------ Save Probs ------------------------
            # Per-token marginal probabilities, dumped as JSON.
            y_probs = crf.predict_marginals(X_input)
            with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
                json.dump(y_probs, fp)

    print("Passing corpus done in: %fs" % (time() - t0))
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v5
--outputFileII Output tagged file III : annot-input_bg_outputIII_v5
--outputFileI Output tagged file I : annot-input_bg_outputI_v6
--outputFileII Output tagged file II : annot-input_bg_outputII_v6
--outputFileII Output tagged file III : annot-input_bg_outputIII_v6
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
... (diff hunk marker — intervening parameter lines omitted) ...
--S4 Semantic features : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009225s
Reading CRF model done in: 0.009524s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 27.733279s
Prediction done in: 27.458162s
Tagging file...
GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 library_strategy.1 <Technique> ChIP-Seq </Technique> Technique
GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 growth_protocol_ch1.1 Cultures of Caulobacter -LRB- TLS1631-TLS1633 -RRB- were grown at 30oC in PYE and supplemented with antibiotics , as necessary , at appropriate concentrations . To deplete wild-type non-tagged ParB , exponential-phase cells were washed off xylose and re-introduced to PYE +0.2 % glucose for an additional <Supp> 5 hours </Supp> . After 4 hours , vanillate was added to induce the expression of flag-parB -LRB- WT -RRB- or flag-parB -LRB- G101S/R104A -RRB- for an hour . Cultures of Escherichia coli -LRB- TLS1637-TLS1650 -RRB- were grown at 30oC in LB and supplemented with antibiotics , as necessary , at appropriate concentrations . IPTG -LRB- 0.5 mM -RRB- was added to induce the production of T18-ParB -LRB- WT -RRB- or T18-ParB -LRB- G101S -RRB- . After an hour , formadehyde -LRB- 1 % final concentration -RRB- were added to fix cells for ChIP-seq . Supp
Saving Ouput I...
Saving Ouput II...
Saving Ouput III...
Pssing corpus done in: 258.328259s
Passing corpus done in: 257.970281s
......
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
--outputFileII Output tagged file II : annot-input_bg_outputII_v4
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009363s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 1.737334s
Tagging file
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model
Prediction done in: 26.434549s
Tagging file
Processing corpus done in: 58.304885s