Estefani Gaytan Nunez

upload

......@@ -9,9 +9,11 @@ import random
#
# Input parameters
# --inputPath=PATH Path of inputfile
# --inputFile CoreNLP output file with tagged sentences
# --outputPath=PATH Path to place output files
# --trainingFile=testFile Output training data set
# --testFile=testFile Output test data set
# --index Select a limit CoreNLP output column
#
# Output
# training and test data set
......@@ -23,7 +25,7 @@ import random
# --trainingFile training-data-set-70_v4.txt
# --testFile test-data-set-30_v4.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
#
# --index 5
#
# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
......
......@@ -11,6 +11,7 @@ from optparse import OptionParser
# --outputFile=File Output data set
# --minWordLen Minimum word length
# --minSenLen Minimum sentence length
# --index Select a limit CoreNLP output column
#
# Output
# Tagged sentences reconstruction
......@@ -23,6 +24,7 @@ from optparse import OptionParser
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
# --minWordLen 2
# --minSenLen 1
# --index 5
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
......@@ -39,7 +41,7 @@ if __name__ == "__main__":
parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
parser.add_option("--minSenLen", dest="sL", help="Minimum word length", type="int")
parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int)
(options, args) = parser.parse_args()
if len(args) > 0:
......@@ -58,23 +60,26 @@ if __name__ == "__main__":
lista = []
#First sentence
sentence = ''
#count
i = 0
with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
for line in input_file:
if len(line.split('\t')) > 1:
w = line.split('\t')[1]
if w == "PGCGROWTHCONDITIONS":
i = i + 1
if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
print( "EXCLUDE: " + sentence.lstrip() )
print( "EXCLUDE: " + str(i) + "line" + sentence.lstrip() )
else:
#End of sentence
lista.append(sentence.lstrip())
                        #New sentence
                        n = n+1
                        #New sentence
sentence = ''
sentence = ''
else:
#Building and save tagging sentence
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]))
print("Number of sentences: " + str(n))
......
# -*- coding: UTF-8 -*-
import os
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
import training_validation_v14 as training
#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --modelPath Path to CRF model
# --modelName Model name
# --outputPath=PATH Output path to place output files
# --filterStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# python3 tagging.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --filterSymbols
# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
__author__ = 'egaytan'
##########################################
#            MAIN PROGRAM                #
##########################################
# Loads a trained sklearn-crfsuite CRF model, walks every file under
# --inputPath, predicts a tag per token, and wraps tagged token spans in
# <Label> ... <Label/> markers, printing the result as a pandas DataFrame.
if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
    parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
    parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
    # S1-S4 toggle the feature groups passed to training.sent2features below.
    parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
    parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
    parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)
    (options, args) = parser.parse_args()
    # Only named options are accepted; any positional argument aborts the run.
    # NOTE(review): parser.error() already exits, so sys.exit(1) is unreachable.
    if len(args) > 0:
        parser.error("Any parameter given.")
        sys.exit(1)
    # Echo the effective configuration so the redirected report is self-describing.
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    # NOTE(review): "Mode name" looks like a typo for "Model name" in this log line.
    print("Mode name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
    print("Run variant: " + str(options.variant))
    # Punctuation lemmas dropped from input when --filterSymbols is set.
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
    print('-------------------------------- PROCESSING --------------------------------')
    # Rebinds the imported nltk `stopwords` corpus module name to a plain list
    # of English stop words; the corpus object is no longer reachable after this.
    stopwords = [word for word in stopwords.words('english')]
    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    # '.mod' is appended here, so --modelName must be passed WITHOUT the
    # extension (the example command in the header includes '.mod' — presumably
    # that example is stale; verify against actual usage).
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))
    # Reading sentences
    print('Processing corpus...')
    t0 = time()
    # Entity labels recognized by the model; any other predicted tag is treated as 'O'.
    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputData = []
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                # Each line is one sentence of 'word|lemma|...' tokens; build the
                # token lists fed to the feature extractor, optionally dropping
                # stop-word and symbol lemmas (lemma = field 1 of each token).
                for line in lines:
                    listLine = []
                    for token in line.strip('\n').split():
                        if options.filterStopWords:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            if lemma in stopwords:
                                continue
                        if options.filterSymbols:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Coma , identificada")
                                continue
                        listLine.append(token)
                    sentencesInputData.append(listLine)
                # Project-local feature extraction (training_validation_v14).
                X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
                print("Sentences input data: " + str(len(sentencesInputData)))
                # Predicting tags
                t1 = time()
                print("Predicting tags with model")
                y_pred = crf.predict(X_input)
                print("Prediction done in: %fs" % (time() - t1))
                # Tagging with CRF model
                # NOTE(review): zip pairs the ORIGINAL lines with predictions made
                # on the FILTERED token lists; if any token was filtered out above,
                # words and tags drift out of alignment — confirm filters are off
                # when this matters.
                print("Tagging file")
                for line, tagLine in zip(lines, y_pred):
                    # Labels present anywhere in this sentence (second output column).
                    Ltags = set(labels).intersection(set(tagLine))
                    outputLine = ''
                    line = line.strip('\n')
                    #print("\nLine: " + str(line))
                    #print ("CRF tagged line: " + str(tagLine))
                    # tb = label of the currently open tag span ('O' = none open);
                    # i  = index of the current token within tagLine.
                    tb = 'O'
                    i = 0
                    # Single-token sentence: emit it fully wrapped (or bare) and skip
                    # the span-tracking loop below.
                    if len(tagLine)==1:
                        if tagLine[0] in labels:
                            start = '<' + tagLine[0] + '> '
                            end = '<' + tagLine[0] + '/>'
                            word = line.split('|')[0] + ' '
                            outputLine = start + word + end
                        else:
                            outputLine = line.split(' ')[0]
                        #print(outputLine + '\t' + ', '.join(Ltags))
                        sentencesOutputData.append([outputLine, ', '.join(Ltags)])
                        continue
                    # Walk word/tag pairs, opening '<Label> ' when a labeled span
                    # starts and closing with '<Label/> ' when it ends (at sentence
                    # end or before an 'O' tag). Only the surface form (field 0 of
                    # 'word|lemma|...') is emitted.
                    for word,tag in zip(line.split(' '), tagLine):
                        # start tagging
                        if tag in labels and tb == 'O':
                            # start tagging
                            outputLine += '<' + tag + '> '
                            tb = tag
                            outputLine += word.split('|')[0] + ' '
                            i += 1
                            continue
                        # end tagging
                        elif tb in labels:
                            if i+1==len(tagLine):
                                # end tagging
                                # NOTE(review): closes with the CURRENT token's tag,
                                # not tb — if the span's label changed mid-span the
                                # open/close markers may mismatch; confirm intended.
                                outputLine += word.split('|')[0] + ' '
                                outputLine += '<' + tag + '/> '
                                tb = 'O'
                                i += 1
                                continue
                            elif tagLine[i+1]=='O':
                                # end tagging
                                outputLine += word.split('|')[0] + ' '
                                outputLine += '<' + tag + '/> '
                                tb = 'O'
                                i += 1
                                continue
                        # word tagged
                        outputLine += word.split('|')[0] + ' '
                        i += 1
                    #print(outputLine + '\t' + ', '.join(Ltags))
                    sentencesOutputData.append([outputLine, ', '.join(Ltags)])
                # Results are only printed; the save-to-file section below is
                # disabled (dead triple-quoted string), including the final
                # "Processing corpus done" timing line.
                print( DF(sentencesOutputData) )
                # Save tags
                '''
                with open(os.path.join(options.outputPath, file), "w") as oFile:
                    for line in sentencesOutputData:
                        oFile.write(line + '\n')
                print("Processing corpus done in: %fs" % (time() - t0))
                '''
# -*- coding: UTF-8 -*-
import os
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
#from sklearn.externals import joblib
import joblib
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
#################################
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
Mode name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
Filtering stop words: False
Levels: S1: FalseS2: FalseS3: FalseS4: False
Run variant: None
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.008342s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 0.983480s
Tagging file
0 1
0 <Gtype> antibody : Flag <Gtype/> Gtype
1 <Gversion> ChIP-Seq <Gversion/> Gversion
2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
3 <Gtype> developmental stage : mixed population... Gtype
4 DNA was isolated using the Qiagen Cell Lysis a...
5 Escherichia coli
6 Escherichia coli AB1157
7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
9 Genome _ build : NC _ 000913.3
10 Genome _ build : NC _ 011916.1
11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
... ... ...
14686 <Phase> ESBL019 Coliform <Phase/> Phase
14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
14688 ESBL019 Reverted
14689 <Phase> ESBL019 Transition <Phase/> Phase
14690 Escherichia coli
14691 Four morphologic states of ESBL019 were used d...
14692 <Gtype> morphology : Coliform <Gtype/> Gtype
14693 <Gtype> morphology : Filamented <Gtype/> Gtype
14694 morphology : Reverted -LRB- reverted back from...
14695 morphology : Transition -LRB- from Coli into F...
14696 RNA isolation was performed using an RNeasy mi...
14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
14698 The E. coli isolate ESBL019 was originally iso...
14699 Escherichia coli
14700 lexA 10 ' after UV vs. 0 ' , MG1655
14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
14703 lexA 20 ' after UV vs. 0 ' , MG1655
14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
14706 lexA 40 ' after UV vs. 0 ' , MG1655
14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
14708 lexA 5 ' after UV vs. 0 ' , MG1655
14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
14711 lexA 60 ' after UV vs. 0 ' , MG1655
14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
14714 lexA vs. wt , before UV treatment , MG1655
14715 untreated cells , 25 ug total RNA
[14716 rows x 2 columns]