Estefani Gaytan Nunez

upload

......@@ -9,9 +9,11 @@ import random
#
# Input parameters
# --inputPath=PATH Path of inputfile
# --inputFile CoreNLP output file with tagged sentences
# --outputPath=PATH Path to place output files
# --trainingFile=trainingFile Output training data set
# --testFile=testFile Output test data set
# --index Limit of CoreNLP output columns to include
#
# Output
# training and test data set
......@@ -23,7 +25,7 @@ import random
# --trainingFile training-data-set-70_v4.txt
# --testFile test-data-set-30_v4.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
#
# --index 5
#
# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
......
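For context, a minimal sketch of the 70/30 split this script performs; the function and variable names below are illustrative assumptions, not the script's actual code:

import random

def split_70_30(sentences, seed=42):
    # shuffle the tagged sentences reproducibly (the seed is an illustrative choice)
    random.seed(seed)
    random.shuffle(sentences)
    # first 70% of sentences for training, remaining 30% for test
    cut = int(len(sentences) * 0.7)
    return sentences[:cut], sentences[cut:]

# training_sentences, test_sentences = split_70_30(tagged_sentences)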
......@@ -11,6 +11,7 @@ from optparse import OptionParser
# --outputFile=File Output data set
# --minWordLen Minimum word length
# --minSenLen Minimum sentence length
# --index Limit of CoreNLP output columns to include
#
# Output
# Tagged sentences reconstruction
......@@ -23,6 +24,7 @@ from optparse import OptionParser
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
# --minWordLen 2
# --minSenLen 1
# --index 5
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
......@@ -39,7 +41,7 @@ if __name__ == "__main__":
parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
parser.add_option("--minSenLen", dest="sL", help="Minimum word length", type="int")
parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int)
(options, args) = parser.parse_args()
if len(args) > 0:
......@@ -58,23 +60,26 @@ if __name__ == "__main__":
lista = []
#First sentence
sentence = ''
#count
i = 0
with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
for line in input_file:
if len(line.split('\t')) > 1:
w = line.split('\t')[1]
if w == "PGCGROWTHCONDITIONS":
i = i + 1
if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
print( "EXCLUDE: " + sentence.lstrip() )
print( "EXCLUDE: " + str(i) + "line" + sentence.lstrip() )
else:
#End of sentence
lista.append(sentence.lstrip())
#New sentence
n = n+1
#New sentence
sentence = ''
sentence = ''
else:
#Build and save the tagged sentence
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]))
print("Number of sentences: " + str(n))
......
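To make the reconstruction above concrete, a hedged example of the column slicing controlled by --index (the CoNLL line is invented and the column layout -- index, word, lemma, POS, NER -- is an assumption):

line = "3\tEscherichia\tescherichia\tFW\tORGANISM"
index = 5
token = '|'.join(line.split('\t')[1:index])
# token == 'Escherichia|escherichia|FW|ORGANISM'

Tokens are appended to the growing sentence until the PGCGROWTHCONDITIONS marker is reached; a sentence is excluded when its token count is at most --minSenLen and its first word is at most --minWordLen characters long.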
# -*- coding: UTF-8 -*-
import os
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
import training_validation_v14 as training
#-------------------------------------------------------------------------------
# Objective
# Tag a transformed file with a CRF model using sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --modelPath Path to CRF model
# --modelName Model name
# --outputPath=PATH Output path to place output files
# --filterStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# python3 tagging.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
# --filterSymbols
# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
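# For reference, a hedged illustration of the formats involved (token values invented):
#   input token  : antibody|antibody|NN|O                     (word|lemma|postag|ner)
#   tagged output: <Gtype> antibody : Flag <Gtype/>   Gtype   (entity wrapped in <Tag> ... <Tag/>)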
__author__ = 'egaytan'
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + options.inputPath)
print("Mode name: " + str(options.modelName))
print("Model path: " + options.modelPath)
print("Path to place output files: " + options.outputPath)
print("Filtering stop words: " + str(options.filterStopWords))
print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
print("Run variant: " + str(options.variant))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
print('-------------------------------- PROCESSING --------------------------------')
stopwords = [word for word in stopwords.words('english')]
# Read CRF model
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
# Reading sentences
print('Processing corpus...')
t0 = time()
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print("Preprocessing file..." + str(file))
sentencesInputData = []
sentencesOutputData = []
with open(os.path.join(options.inputPath, file), "r") as iFile:
lines = iFile.readlines()
for line in lines:
listLine = []
for token in line.strip('\n').split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.filterSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
if lemma == ',':
print("Coma , identificada")
continue
listLine.append(token)
sentencesInputData.append(listLine)
X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
print("Sentences input data: " + str(len(sentencesInputData)))
# Predicting tags
t1 = time()
print("Predicting tags with model")
y_pred = crf.predict(X_input)
print("Prediction done in: %fs" % (time() - t1))
# Tagging with CRF model
print("Tagging file")
for line, tagLine in zip(lines, y_pred):
Ltags = set(labels).intersection(set(tagLine))
outputLine = ''
line = line.strip('\n')
#print("\nLine: " + str(line))
#print ("CRF tagged line: " + str(tagLine))
tb = 'O'
i = 0
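# tb keeps the tag of the previous token ('O' = outside any entity);
# i indexes the current position in tagLine as the words of the line are traversed.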
if len(tagLine)==1:
if tagLine[0] in labels:
start = '<' + tagLine[0] + '> '
end = '<' + tagLine[0] + '/>'
word = line.split('|')[0] + ' '
outputLine = start + word + end
else:
outputLine = line.split('|')[0]
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData.append([outputLine, ', '.join(Ltags)])
continue
for word,tag in zip(line.split(' '), tagLine):
# start tagging
if tag in labels and tb == 'O':
# start tagging
outputLine += '<' + tag + '> '
tb = tag
outputLine += word.split('|')[0] + ' '
i += 1
continue
# end tagging
elif tb in labels:
if i+1==len(tagLine):
# end tagging
outputLine += word.split('|')[0] + ' '
outputLine += '<' + tag + '/> '
tb = 'O'
i += 1
continue
elif tagLine[i+1]=='O':
# end tagging
outputLine += word.split('|')[0] + ' '
outputLine += '<' + tag + '/> '
tb = 'O'
i += 1
continue
# word tagged
outputLine += word.split('|')[0] + ' '
i += 1
#print(outputLine + '\t' + ', '.join(Ltags))
sentencesOutputData.append([outputLine, ', '.join(Ltags)])
print( DF(sentencesOutputData) )
# Save tags
'''
with open(os.path.join(options.outputPath, file), "w") as oFile:
for line in sentencesOutputData:
oFile.write(line + '\n')
print("Processing corpus done in: %fs" % (time() - t0))
'''
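A hedged sketch of how the commented-out block above could write the tagged sentences; the tab separator between the tagged line and its label summary is an assumption:

with open(os.path.join(options.outputPath, file), "w") as oFile:
    for outputLine, tags in sentencesOutputData:
        # one tagged sentence per line, followed by the entity labels found in it
        oFile.write(outputLine + '\t' + tags + '\n')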
# -*- coding: UTF-8 -*-
import os
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
#from sklearn.externals import joblib
import joblib
from sklearn.metrics import make_scorer
#from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import cross_val_score
#from sklearn.grid_search import RandomizedSearchCV
from sklearn.model_selection import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
#################################
# -*- coding: UTF-8 -*-
import os # Access the operating system
#from itertools import chain # Not used
from optparse import OptionParser # Command-line option parsing
from time import time # Return the time in seconds since the epoch as a float
from collections import Counter # Dict subclass for counting hashable objects
#import re # Not used
import nltk # Natural Language Toolkit platform to work with human language data
import sklearn # Free software machine learning
import scipy.stats # library of statistical functions
import sys # to exit from Python.
import joblib # provide lightweight pipelining
from sklearn.metrics import make_scorer # Make a scorer from a performance metric or loss function
from sklearn.model_selection import cross_val_score # Evaluate a score by cross-validation
from sklearn.model_selection import RandomizedSearchCV # Randomized search on hyper parameters
import sklearn_crfsuite # Thin CRFsuite
from sklearn_crfsuite import scorers # Added scorers.sequence_accuracy
from sklearn_crfsuite import metrics # Add flat recall score to metrics
from pandas import DataFrame as DF # Construct dataframe objects
from nltk.corpus import stopwords # To exclude stop words
#-------------------------------------------------------------------------------
# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# (1) --inputPath Path of training and test data set
# (2) --outputPath Output path to place output files
# (3) --trainingFile File with training data set
# (4) --testFile File with test data set
# (5) --reportName Name of the run (used in report and model file names)
# (6) --variant Variant of the inner-word feature set (10-13)
# (7) --nrules Number of crf transitions
# (8) --S1 Inner word features set
# (9) --S2 Complete word features
# (10) --S3 Extended context features
# (11) --S4 Semantic features
# (12) --excludeStopWords
# (13) --excludeSymbols
# Output
# 1) Best model
# 2) Report
# Examples
# python3 training_validation_v14.0.1.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/input/
# --trainingFile training-data-set-70-NER.txt
# --testFile test-data-set-30-NER.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --nrules 500
# --reportName Run1
# --variant 11
# --S1
# --S2
# --S3
# --S4
# python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt
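# For reference: each token in the training/test files is expected to be a '|'-joined
# CoreNLP column string whose last field is the class label, e.g. (invented token,
# column layout assumed): Escherichia|escherichia|FW|ORGANISM|Strain
# word2features() below reads word, lemma, POS tag and NER from the first four fields,
# and sent2labels() takes the last field as the CRF label.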
##################################################################
# FEATURES #
##################################################################
#================== COMPLETE WORD FEATURES ======================#
def isGreek(word):
## The complete word is a Greek letter
alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
if word in alphabet:
return True
else:
return False
#================ INNER OF THE WORD FEATURES ====================#
def hGreek(word):
## Check whether the word contains at least one Greek letter
alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω','α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
# Greek letters found in the word
matches = [letter for letter in word if letter in alphabet]
if (len(matches) > 0):
return(True)
else: return(False)
def hNumber(word):
## At least one digit
for l in word:
if l.isdigit():
return True
return False
def hUpper(word):
## At least one uppercase letter
for l in word:
if l.isupper(): return True
return False
def hLower(word):
## At least one lowercase letter
for l in word:
if l.islower(): return True
return False
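# Illustrative checks for the predicates above (hypothetical inputs):
#   isGreek('α')     -> True      hGreek('ΔsoxR') -> True
#   hNumber('rpoB2') -> True      hUpper('Fnr')   -> True
#   hLower('LB')     -> False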
#============================FEATURES===========================#
def word2features(sent, i, S1, S2, S3, S4, v): #SA, v
## Getting word features
## Saving CoreNLP annotations
listElem = sent[i].split('|')
## Split CoreNLP output by columns
word = listElem[0]
lemma = listElem[1]
postag = listElem[2]
ner = listElem[3]
#=========================== G =============================#
## NAME LEVEL G
## FEATURE TYPE General features
## Adding to features dictionary
features = {
## basal features
'lemma': lemma,
'postag': postag
}
## Previous lemma and postag
## needs more than one word in the sentence
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## Saving CoreNLP annotations
lemma0 = listElem[1]
postag0 = listElem[2]
## Adding features to dictionary
features.update({
# Previous lemma
'-1:lemma': lemma0,
# Previous postag
'-1:postag': postag0,
})
## Next lemma and postag
## not the last word
if i < len(sent) - 1:
## Next word
listElem = sent[i + 1].split('|')
## Saving CoreNLP annotations
lemma2 = listElem[1]
postag2 = listElem[2]
## Adding to features dictionary
features.update({
# Next lemma
'+1:lemma': lemma2,
# Next postag
'+1:postag': postag2,
})
#=========================== S1 =============================#
## NAME LEVEL S1
## FEATURE TYPE Inner word features
if S1:
## Adding features to dictionary
features.update({
'hUpper' : hUpper(word),
'hLower' : hLower(word),
'hGreek' : hGreek(word),
'symb' : word.isalnum()
})
#========== Variants of inner words features ============#
if v == 10:
#word first character
features['word[:1]']= word[:1]
#word second character
if len(word)>1:
features['word[:2]']= word[:2]
if v == 11:
#lemma and postag first character
features['lemma[:1]']= lemma[:1]
features['postag[:1]']= postag[:1]
#lemma and postag second character
if len(lemma)>1:
features['lemma[:2]']= lemma[:2]
if len(postag)>1:
features['postag[:2]']= postag[:2]
if v == 12:
#word first character
features['word[:1]']= word[:1]
#word second character
if len(word)>1:
features['word[:2]']= word[:2]
#postag first character
features['postag[:1]']= postag[:1]
#postag second character
if len(postag)>1:
features['postag[:2]']= postag[:2]
if v == 13:
#lemma first character
features['lemma[:1]']= lemma[:1]
#lemma second character
if len(lemma)>1:
features['lemma[:2]']= lemma[:2]
#=========================== S2 =============================#
## NAME LEVEL S2
## FEATURE TYPE Complete word features
if S2:
#Add features to dictionary
features.update({
'word' : word,
'isUpper' : word.isupper(),
'isLower' : word.islower(),
'isGreek' : isGreek(word),
'isNumber' : word.isdigit()
})
## Previous word
## sentence needs more than one word
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## Saving CoreNLP annotations
word0 = listElem[0]
features['-1:word']= word0
## Next word
## not the last word
if i < len(sent)-1:
## Split CoreNLP output by columns
listElem = sent[i + 1].split('|')
## Saving CoreNLP annotations
word2 = listElem[0]
features['+1:word']= word2
#=========================== S3 =============================#
## NAME LEVEL S3
## FEATURE TYPE Extended context features
if S3:
## more than two words in sentence
if i > 1:
## Split CoreNLP output by columns
listElem = sent[i - 2].split('|')
## Saving CoreNLP annotations
## lemma and postag two positions back
lemma01 = listElem[1]
postag01 = listElem[2]
features['-2:lemma']= lemma01
features['-2:postag']= postag01
## not among the last two words
if i < len(sent) - 2:
## Split CoreNLP output by columns
listElem = sent[i + 2].split('|')
## Saving CoreNLP annotations
lemma02 = listElem[1]
postag02 = listElem[2]
## lemma and postag two positions ahead
features['+2:lemma']= lemma02
features['+2:postag']= postag02
#=========================== S4 =============================#
## NAME LEVEL S4
## FEATURE TYPE NER
if S4:
## more than one word in sentence
if i > 0:
## Split CoreNLP output by columns
listElem = sent[i - 1].split('|')
## ============== Previous ner =====================##
## Saving CoreNLP annotations according column position
ner0 = listElem[3]
## Adding to features dictionary
features['-1:ner'] = ner0
## is not the last word
if i < len(sent) - 1:
## Split CoreNLP output by columns
listElem = sent[i + 1].split('|')
## ================ Next ner ======================##
## Saving CoreNLP annotations according column position
ner2 = listElem[3]
## Adding to features dictionary
features['+1:ner'] = ner2
if i > 1:
## Split CoreNLP output by columns
listElem = sent[i - 2].split('|')
## Saving CoreNLP annotations
## ============ Previous-2 ner ====================##
ner01 = listElem[3]
features['-2:ner']= ner01
## not among the last two words
if i < len(sent) - 2:
## Split CoreNLP output by columns
listElem = sent[i + 2].split('|')
## Saving CoreNLP annotations
ner02 = listElem[3]
## ============== Next-2 ner ======================##
features['+2:ner']= ner02
return features
def sent2features(sent, S1, S2, S3, S4, v):
## Iterate over the sentence and extract features for each word
return [word2features(sent, i, S1, S2, S3, S4, v) for i in range(len(sent))]
def sent2labels(sent):
## The label is the last '|'-separated field of each word token
return [elem.split('|')[-1] for elem in sent]
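# Minimal usage sketch (the token strings are illustrative, not taken from the corpus):
#   sent = ['OD600|od600|NN|NUMBER|OD', 'of|of|IN|O|O', '0.3|0.3|CD|NUMBER|OD']
#   sent2labels(sent)  ->  ['OD', 'O', 'OD']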
def sent2tokens(sent):
return [token for token, postag, label in sent]
def print_transitions(trans_features, f):
for (label_from, label_to), weight in trans_features:
f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
def print_state_features(state_features, f):
for (attr, label), weight in state_features:
f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
__author__ = 'egaytan'
##################################################################
# MAIN PROGRAM #
##################################################################
if __name__ == "__main__":
## Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
parser.add_option("--reportName", dest="reportName", help="Report number run", metavar="FILE")
parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE")
parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path of test and training data sets: " + options.inputPath)
print("Path of outputs: " + options.outputPath)
print("File with training data set: " + str(options.trainingFile))
print("File with test data set: " + str(options.testFile))
print("reportName: " + str(options.reportName))
print("Exclude stop words: " + str(options.excludeStopWords))
print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4))
print("Run variant: " + str(options.variant))
print("Number of rules on report file: " + str(options.nrules))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Exclude symbols: " + str(options.excludeSymbols))
print('-------------------------------- PROCESSING --------------------------------')
print('Reading corpus...')
t0 = time()
sentencesTrainingData = []
sentencesTestData = []
stopwords = [word for word in stopwords.words('english')]
with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
continue
listLine.append(token)
sentencesTrainingData.append(listLine)
print(" Sentences training data: " + str(len(sentencesTrainingData)))
with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
for line in iFile.readlines():
listLine = []
line = line.strip('\n')
for token in line.split():
if options.excludeStopWords:
listToken = token.split('|')
lemma = listToken[1]
if lemma in stopwords:
continue
if options.excludeSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
continue
listLine.append(token)
sentencesTestData.append(listLine)
print(" Sentences test data: " + str(len(sentencesTestData)))
print("Reading corpus done in: %fs" % (time() - t0))
print('-------------------------------- FEATURES --------------------------------')
Dtraining = sent2features(sentencesTrainingData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
Dtest = sent2features(sentencesTestData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2]
print('-------------------------- Features Training ---------------------------')
print(DF(list(Dtraining.items())))
print('--------------------------- Features Test -----------------------------')
print(DF(list(Dtest.items())))
t0 = time()
X_train = [sent2features(s, options.S1, options.S2, options.S3, options.S4, int(options.variant)) for s in sentencesTrainingData]
y_train = [sent2labels(s) for s in sentencesTrainingData]
X_test = [sent2features(s, options.S1, options.S2, options.S3, options.S4, int(options.variant)) for s in sentencesTestData]
# print X_test
y_test = [sent2labels(s) for s in sentencesTestData]
'''
Fixed parameters
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=100,
all_possible_transitions=True
)
'''
# Hyperparameter Optimization
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
max_iterations=100,
all_possible_transitions=True
)
params_space = {
'c1': scipy.stats.expon(scale=0.5),
'c2': scipy.stats.expon(scale=0.05),
}
# Original: labels = list(crf.classes_)
# Original: labels.remove('O')
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels)
# search
rs = RandomizedSearchCV(crf, params_space,
cv=5,
verbose=3,
n_jobs=-1,
n_iter=100,
scoring=f1_scorer,
random_state=42)
rs.fit(X_train, y_train)
# Fixed parameters
# crf.fit(X_train, y_train)
# Best hyperparameters
# crf = rs.best_estimator_
nameReport = str(options.reportName) + '_v'+ str(options.variant) + '.txt'
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
oFile.write("********** TRAINING AND TESTING REPORT **********\n")
oFile.write("Training file: " + options.trainingFile + '\n')
oFile.write('\n')
oFile.write('best params:' + str(rs.best_params_) + '\n')
oFile.write('best CV score:' + str(rs.best_score_) + '\n')
oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
print("Training done in: %fs" % (time() - t0))
t0 = time()
# Update best crf
crf = rs.best_estimator_
# Saving model
print(" Saving training model...")
t1 = time()
nameModel = 'model_' + str(options.reportName) + '_v'+ str(options.variant) + '_S1_' + str(options.S1) + '_S2_' + str(options.S2) + '_S3_' + str(options.S3) + '_S4_' + str(options.S4) + '_' + str(options.reportName) + '_v' + str(options.variant) +'.mod'
joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
print(" Saving training model done in: %fs" % (time() - t1))
# Evaluation against test data
y_pred = crf.predict(X_test)
print("*********************************")
print("Prediction done in: %fs" % (time() - t0))
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
oFile.write('\n')
oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
oFile.write('\n')
# labels = list(crf.classes_)
sorted_labels = sorted(
labels,
key=lambda name: (name[1:], name[0])
)
oFile.write(metrics.flat_classification_report( y_test, y_pred, labels=sorted_labels, digits=3))
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
print_state_features(Counter(crf.state_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop negative:\n")
print_state_features(Counter(crf.state_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
-------------------------------- PARAMETERS --------------------------------
Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
Model name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
Filtering stop words: False
Levels: S1: False S2: False S3: False S4: False
Run variant: None
Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
Reading CRF model done in: 0.008342s
Processing corpus...
Preprocessing file...annot-input_bg_v3.txt
Sentences input data: 14716
Predicting tags with model
Prediction done in: 0.983480s
Tagging file
0 1
0 <Gtype> antibody : Flag <Gtype/> Gtype
1 <Gversion> ChIP-Seq <Gversion/> Gversion
2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
3 <Gtype> developmental stage : mixed population... Gtype
4 DNA was isolated using the Qiagen Cell Lysis a...
5 Escherichia coli
6 Escherichia coli AB1157
7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
9 Genome _ build : NC _ 000913.3
10 Genome _ build : NC _ 011916.1
11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
... ... ...
14686 <Phase> ESBL019 Coliform <Phase/> Phase
14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
14688 ESBL019 Reverted
14689 <Phase> ESBL019 Transition <Phase/> Phase
14690 Escherichia coli
14691 Four morphologic states of ESBL019 were used d...
14692 <Gtype> morphology : Coliform <Gtype/> Gtype
14693 <Gtype> morphology : Filamented <Gtype/> Gtype
14694 morphology : Reverted -LRB- reverted back from...
14695 morphology : Transition -LRB- from Coli into F...
14696 RNA isolation was performed using an RNeasy mi...
14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
14698 The E. coli isolate ESBL019 was originally iso...
14699 Escherichia coli
14700 lexA 10 ' after UV vs. 0 ' , MG1655
14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
14703 lexA 20 ' after UV vs. 0 ' , MG1655
14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
14706 lexA 40 ' after UV vs. 0 ' , MG1655
14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
14708 lexA 5 ' after UV vs. 0 ' , MG1655
14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
14711 lexA 60 ' after UV vs. 0 ' , MG1655
14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
14714 lexA vs. wt , before UV treatment , MG1655
14715 untreated cells , 25 ug total RNA
[14716 rows x 2 columns]