Estefani Gaytan Nunez

upload

# -*- coding: UTF-8 -*-
import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
......@@ -29,25 +30,37 @@ import training_validation_v14 as training
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --modelPath Path to CRF model
# --modelName Model name
# --outputPath=PATH Output path to place output files
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# --inputPath=PATH Path of transformed files x|y|z
# --outputPath Output path to place output files
# --outputFileI Output tagged file I
# --outputFileII Output tagged file II
# --modelPath Path to CRF model
# --modelName Model name
# --infoPath Path of GSE-GSM index file
# --infoFile GSE-GSM index file
# --variant Run variant
# --S1 General features
# --S2 Inner/Complete word features
# --S3 Extended context features
# --S4 Semantic features
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# python3 tagging.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --filterSymbols
# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --outputFileI annot-input_bg_outputI.txt
# --outputFileII annot-input_bg_outputII.txt
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
# --infoFile bg_sentences_midx.txt
# --variant 13
#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
__author__ = 'egaytan'
......@@ -60,9 +73,13 @@ if __name__ == "__main__":
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
......@@ -75,14 +92,25 @@ if __name__ == "__main__":
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + options.inputPath)
print("Mode name: " + str(options.modelName))
print("Model path: " + options.modelPath)
print("Path to place output files: " + options.outputPath)
print("Filtering stop words: " + str(options.filterStopWords))
print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
print("Run variant: " + str(options.variant))
print("--inputPath Path of training data set : " + str(options.inputPath ))
print("--outputPath Output path to place output files: " + str(options.outputPath ))
print("--outputFileI Output tagged file I : " + str(options.outFileI ))
print("--outputFileII Output tagged file II : " + str(options.outFileII ))
print("--modelPath Path to read CRF model : " + str(options.modelPath ))
print("--modelName Model name : " + str(options.modelName ))
print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
print("--infoFile GSE-GSM index file : " + str(options.idx ))
print("--variant Run variant : " + str(options.variant ))
print("--S1 General features : " + str(options.S1 ))
print("--S2 Inner/Complete word features : " + str(options.S2 ))
print("--S3 Extended context features : " + str(options.S3 ))
print("--S4 Semantic features : " + str(options.S4 ))
print("--filteringStopWords Filtering stop words : " + str(options.filterStopWords ))
print("--filterSymbols Filtering punctuation marks : " + str(options.filterSymbols ))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
......@@ -92,7 +120,9 @@ if __name__ == "__main__":
print('-------------------------------- PROCESSING --------------------------------')
stopwords = [word for word in stopwords.words('english')]
# Read index
idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
# Read CRF model
t0 = time()
print('Reading CRF model...')
......@@ -108,8 +138,9 @@ if __name__ == "__main__":
# For each file in dir
for file in files:
print("Preprocessing file..." + str(file))
sentencesInputData = []
sentencesOutputData = []
sentencesInputData = []
sentencesOutputDataI = []
sentencesOutputDataII = []
with open(os.path.join(options.inputPath, file), "r") as iFile:
lines = iFile.readlines()
for line in lines:
......@@ -142,10 +173,12 @@ if __name__ == "__main__":
# Tagging with CRF model
print("Tagging file")
lidx = 0
for line, tagLine in zip(lines, y_pred):
Ltags = set(labels).intersection(set(tagLine))
outputLine = ''
line = line.strip('\n')
line = line.strip('\n')
#print("\nLine: " + str(line))
#print ("CRF tagged line: " + str(tagLine))
tb = 'O'
......@@ -153,20 +186,25 @@ if __name__ == "__main__":
if len(tagLine)==1:
if tagLine[0] in labels:
start = '<' + tagLine[0] + '> '
end = '<' + tagLine[0] + '/>'
word = line.split('|')[0] + ' '
end = '</' + tagLine[0] + '/>'
word = line.split('|')[0] + ' '
outputLine = start + word + end
else:
outputLine = line.split(' ')[0]
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData.append([outputLine, ', '.join(Ltags)])
sentencesOutputDataI.append([outputLine, ', '.join(Ltags)])
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
continue
sentence = ''
sb = False
for word,tag in zip(line.split(' '), tagLine):
# start tagging
if tag in labels and tb == 'O':
if tag in labels and tb != tag:
# start tagging
outputLine += '<' + tag + '> '
sb = True
sentence = word.split('|')[0] + ' '
tb = tag
outputLine += word.split('|')[0] + ' '
i += 1
......@@ -174,40 +212,38 @@ if __name__ == "__main__":
# end tagging
elif tb in labels:
if i+1==len(tagLine):
# end tagging
# end sentence
outputLine += word.split('|')[0] + ' '
outputLine += '<' + tag + '/> '
outputLine += '</' + tag + '/> '
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
sb = False
tb = 'O'
i += 1
continue
elif tagLine[i+1]=='O':
# end tagging
elif tag!=tagLine[i+1]:
# start new tag
outputLine += word.split('|')[0] + ' '
outputLine += '<' + tag + '/> '
outputLine += '</' + tag + '/> '
sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
sb = False
tb = 'O'
i += 1
continue
# word tagged
outputLine += word.split('|')[0] + ' '
i += 1
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData.append([outputLine, ', '.join(Ltags)])
if sb:
sentence+= word.split('|')[0] + ' '
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputDataI.append([outputLine, ', '.join(Ltags)])
lidx += 1
print( DF(sentencesOutputData) )
#print( DF(sentencesOutputDataI) )
#print( '\n'.join(sentencesOutputDataII) )
# Save tags
'''
with open(os.path.join(options.outputPath, file), "w") as oFile:
for line in sentencesOutputData:
with open(os.path.join(options.outputPath, options.outFileII), "w") as oFile:
for line in sentencesOutputDataII:
#print(line)
oFile.write(line + '\n')
print("Processing corpus done in: %fs" % (time() - t0))
'''
......
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
Mode name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
Filtering stop words: False
Levels: S1: FalseS2: FalseS3: FalseS4: False
Run variant: None
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
--outputFileI Output tagged file I : annot-input_bg_outputI.txt
--outputFileII Output tagged file II : annot-input_bg_outputII.txt
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx.txt
--variant Run variant : 13
--S1 General features : True
--S2 Inner/Complete word features : False
--S3 Extended context features : False
--S4 Semantic features : True
--filteringStopWords Filtering stop words : False
--filterSymbols Filtering punctuation marks : False
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.008342s
Reading CRF model done in: 0.008336s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 0.983480s
Prediction done in: 1.688127s
Tagging file
0 1
0 <Gtype> antibody : Flag <Gtype/> Gtype
1 <Gversion> ChIP-Seq <Gversion/> Gversion
2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
3 <Gtype> developmental stage : mixed population... Gtype
4 DNA was isolated using the Qiagen Cell Lysis a...
5 Escherichia coli
6 Escherichia coli AB1157
7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
9 Genome _ build : NC _ 000913.3
10 Genome _ build : NC _ 011916.1
11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
... ... ...
14686 <Phase> ESBL019 Coliform <Phase/> Phase
14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
14688 ESBL019 Reverted
14689 <Phase> ESBL019 Transition <Phase/> Phase
14690 Escherichia coli
14691 Four morphologic states of ESBL019 were used d...
14692 <Gtype> morphology : Coliform <Gtype/> Gtype
14693 <Gtype> morphology : Filamented <Gtype/> Gtype
14694 morphology : Reverted -LRB- reverted back from...
14695 morphology : Transition -LRB- from Coli into F...
14696 RNA isolation was performed using an RNeasy mi...
14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
14698 The E. coli isolate ESBL019 was originally iso...
14699 Escherichia coli
14700 lexA 10 ' after UV vs. 0 ' , MG1655
14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
14703 lexA 20 ' after UV vs. 0 ' , MG1655
14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
14706 lexA 40 ' after UV vs. 0 ' , MG1655
14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
14708 lexA 5 ' after UV vs. 0 ' , MG1655
14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
14711 lexA 60 ' after UV vs. 0 ' , MG1655
14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
14714 lexA vs. wt , before UV treatment , MG1655
14715 untreated cells , 25 ug total RNA
[14716 rows x 2 columns]
Processing corpus done in: 3.948320s
......