Estefani Gaytan Nunez

upload

{"key1": "keyinfo", "key2": "keyinfo2"}
\ No newline at end of file
# -*- coding: UTF-8 -*-
import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
import training_validation_v14 as training
import json
#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --outputPath Output path to place output files
# --outputFileI Output tagged file I
# --outputFileII Output tagged file II
# --modelPath Path to CRF model
# --modelName Model name
# --infoPath Path of GSE-GSM index file
# --infoFile GSE-GSM index file
# --variant Part of S2 variant
# --S1 Inner word features set
# --S2 Complete word features
# --S3 Extended context features
# --S4 Semantic features
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --outputFileI annot-input_bg_outputI.txt
# --outputFileII annot-input_bg_outputII.txt
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
# --infoFile bg_sentences_midx.txt
# --variant 13
#Examples
#predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/annot-input_bg_report_v4.txt > predict-annot/reports/output_tagging_report_v5.txt
__author__ = 'egaytan'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # ########################## Defining parameters ##########################
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
    parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
    parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
    parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
    parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
    parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
    parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
    (options, args) = parser.parse_args()

    # This tool takes no positional arguments. parser.error() prints the
    # message and exits with status 2, so no extra sys.exit() is needed.
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")

    # ############################ DISP PARAMETERS ############################
    print('-------------------------------- PARAMETERS --------------------------------')
    print("--inputPath Path of training data set : " + str(options.inputPath))
    print("--outputPath Output path to place output files: " + str(options.outputPath))
    print("--outputFileI Output tagged file I : " + str(options.outFileI))
    print("--outputFileII Output tagged file II : " + str(options.outFileII))
    # BUGFIX: label previously said "--outputFileII" for the third file.
    print("--outputFileIII Output tagged file III : " + str(options.outFileIII))
    print("--modelPath Path to read CRF model : " + str(options.modelPath))
    print("--modelName Model name : " + str(options.modelName))
    print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath))
    print("--infoFile GSE-GSM index file : " + str(options.idx))
    print("--variant Run variant : " + str(options.variant))
    print("--S1 Inner word features set : " + str(options.S1))
    print("--S2 Complete word features : " + str(options.S2))
    print("--S3 Extended context features : " + str(options.S3))
    print("--S4 Semantic features : " + str(options.S4))

    # ############################## PROCESSING ###############################
    print('-------------------------------- PROCESSING --------------------------------')

    # Load index mapping GSE file information: one index line per input
    # sentence, used to prefix each tagged output line.
    # BUGFIX: file handle was previously leaked; use a context manager.
    with open(os.path.join(options.infoPath, options.idx), "r") as idxFile:
        idx = idxFile.readlines()

    # ############################ Read CRF model #############################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    # ########################### Reading sentences ###########################
    print('Processing corpus...')
    t0 = time()
    # Entity labels the model may emit; every other tag is 'O' (outside).
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain',
              'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air',
              'Vess', 'pH']

    # ############################ Preprocessing ##############################
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputDataI = []
            # Preprocessing input sentences: each line is one sentence of
            # 'word|lemma|pos'-style tokens separated by spaces.
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                sentencesInputData = [line.strip('\n').split() for line in lines]
            # Build feature dicts for every sentence (delegated to the
            # training module so train/predict features stay in sync).
            X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant)
                       for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

            # ########################## Predicting tags ##########################
            t1 = time()
            print("Predicting tags with model...")
            y_pred = crf.predict(X_input)
            print("Prediction done in: %fs" % (time() - t1))

            # ###################### Tagging with CRF model #######################
            print("Tagging file...")
            # lidx indexes the GSE-GSM mapping; it must advance once per
            # sentence, including skipped ones, or the mapping desynchronizes.
            lidx = 0
            for line, tagLine in zip(lines, y_pred):
                # Unique entity tags present in this sentence's prediction.
                Ltags = set(labels).intersection(set(tagLine))
                # Skip untagged sentence.
                # BUGFIX: the old check compared against {'O'}, but 'O' is
                # never in the labels intersection, so untagged sentences were
                # never skipped; also advance lidx to keep the index aligned.
                if not Ltags:
                    lidx += 1
                    continue
                line = line.strip('\n')
                # Start an empty tagged sentence.
                outputLine = ''
                # Per-sentence word counter (parallel to tagLine).
                i = 0
                # Exception for one-word sentences.
                if len(tagLine) == 1:
                    if tagLine[0] in labels:
                        # Wrap the single word in <Tag> ... </Tag>.
                        start = '<' + tagLine[0] + '> '
                        end = '</' + tagLine[0] + '>'
                        word = line.split('|')[0] + ' '
                        outputLine = start + word + end
                    else:
                        outputLine = line.split(' ')[0]
                    # Saving Sentence Output I.
                    sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                    # Increase sentence counter and continue with the next one.
                    lidx += 1
                    continue
                # Tag behind (previous token's tag).
                tb = 'O'
                # Tagging sentences: emit '<Tag> ' when an entity starts and
                # '</Tag> ' when it ends, copying only the surface word
                # (first '|'-separated field) of each token.
                for word, tag in zip(line.split(' '), tagLine):
                    # General start tagging: a new entity begins here.
                    if tag in labels and tb != tag:
                        # Close the previous entity if it is still open.
                        if tb in labels and outputLine[-2:] != '> ':
                            outputLine += '</' + tb + '> '
                        # Start new tagging.
                        outputLine += '<' + tag + '> '
                        outputLine += word.split('|')[0] + ' '
                        # Close immediately for a single-word entity.
                        # BUGFIX: was a bare except; only IndexError (i+1 past
                        # the end of tagLine) is expected here.
                        try:
                            if tag != tagLine[i + 1]:
                                outputLine += '</' + tag + '> '
                        except IndexError:
                            if i + 1 == len(tagLine):
                                outputLine += '</' + tag + '> '
                        i += 1
                        tb = tag
                        continue
                    # General close tagging: entity continues from previous token.
                    elif tag in labels:
                        # End-of-sentence case: close the entity here.
                        if i + 1 == len(tagLine):
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            i += 1
                            tb = tag
                            continue
                        # Next token leaves the entity: close it here.
                        elif tag != tagLine[i + 1]:
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            i += 1
                            tb = tag
                            continue
                    # Fallthrough ('O' token, or mid-entity token): close a
                    # dangling previous entity, then copy the word as-is.
                    if tb != tag and tb in labels and outputLine[-2:] != '> ':
                        outputLine += '</' + tb + '> '
                    outputLine += word.split('|')[0] + ' '
                    i += 1
                    # Save the previous tag.
                    tb = tag
                # Saving Sentence Output I.
                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                lidx += 1

            # ########################## Save Output I ############################
            # Output I: one TSV line per sentence that contains at least one
            # closed tag, with Penn-Treebank bracket tokens restored.
            print("Saving Ouput I...")
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    if re.findall('</', line):
                        oline = line.replace('-LRB-', '(')
                        oline = oline.replace('-RRB-', ')')
                        oFileI.write(oline + '\n')

            # ########################## Save Output II ###########################
            # Output II: one TSV line per tagged span (text, tag), keeping the
            # sentence's index columns.
            print("Saving Ouput II...")
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
                for line in sentencesOutputDataI:
                    oline = line.replace('-LRB-', '(')
                    oline = oline.replace('-RRB-', ')')
                    for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                        lline = oline.split('\t')[0:-2] + [ttex, tag]
                        nline = '\t'.join(lline)
                        oFileII.write(nline + '\n')

            # ########################## Save Output III ##########################
            # Output III: the raw corpus re-emitted as 'word|tag' tokens.
            print("Saving Ouput III...")
            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
                for line, tagLine in zip(lines, y_pred):
                    # BUGFIX: '-LRB-' was replaced twice and '-RRB-' never
                    # replaced, so right brackets stayed escaped in Output III.
                    oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag
                             for w, tag in zip(line.split(' '), tagLine)]
                    oFileIII.write(' '.join(oline) + '\n')

            # ############################ Save Probs #############################
            # Per-token marginal probabilities, serialized as JSON.
            # NOTE(review): the filename is fixed, so processing several input
            # files overwrites previous probabilities — confirm this is intended.
            # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
            y_probs = crf.predict_marginals(X_input)
            with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
                json.dump(y_probs, fp)

    print("Passing corpus done in: %fs" % (time() - t0))
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI_v6
--outputFileII Output tagged file II : annot-input_bg_outputII_v6
--outputFileII Output tagged file III : annot-input_bg_outputIII_v6
--outputFileI Output tagged file I : annot-input_bg_outputI_v7
--outputFileII Output tagged file II : annot-input_bg_outputII_v7
--outputFileII Output tagged file III : annot-input_bg_outputIII_v7
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
......@@ -15,14 +15,14 @@
--S4 Semantic features : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.009524s
Reading CRF model done in: 0.009408s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
Prediction done in: 27.458162s
Prediction done in: 27.324524s
Tagging file...
Saving Ouput I...
Saving Ouput II...
Saving Ouput III...
Passing corpus done in: 257.970281s
Passing corpus done in: 261.721646s
......