Carlos-Francisco Méndez-Cruz

Conditional Random Fields

# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import re
__author__ = 'CMendezC'
# Objective: take text-annotated-abstracts-original.txt as input,
# write each abstract to a separate file with annotation tags removed,
# and collect a dictionary of gene names for tagging after the NLP pipeline.
# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path.
# 4) --dicPath Dictionary path.
# 5) --dicFile Dictionary file.
# Execution:
# python3 prepare-abstracts.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --inputFile text-annotated-abstracts.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
# --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries
# --dicFile genes.txt
# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries --dicFile genes.txt
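# Expected input format, one abstract per line (illustrative line with a
# made-up PMID; <g>, <d>, <i> are the annotation tags stripped by reTags below):
# 12345678|t|The <g>marA</g> regulator activates <g>micF</g> in Escherichia coli.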
if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--dicPath", dest="dicPath",
                      help="Dictionary path", metavar="PATH")
    parser.add_option("--dicFile", dest="dicFile",
                      help="Dictionary file", metavar="FILE")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)
    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file: " + str(options.inputFile))
    print("Output path: " + str(options.outputPath))
    print("Dictionary path: " + str(options.dicPath))
    print("Dictionary file: " + str(options.dicFile))
    filesWritten = 0
    t0 = time()
    hashGenes = {}
    # PMID prefix such as "12345678|t|" at the start of each line
    rePmid = re.compile(r'([\d]+)\|t\|')
    # Gene mentions annotated as <g>...</g>
    reGene = re.compile(r'<g>([^<]+)</g>')
    # All annotation tags to strip from the text
    reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')
    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
        print("Reading file... " + options.inputFile)
        for line in iFile:
            line = line.strip('\r\n')
            # Count every annotated gene mention
            for gene in reGene.findall(line):
                if gene not in hashGenes:
                    hashGenes[gene] = 1
                else:
                    hashGenes[gene] += 1
            line = reTags.sub('', line)
            result = rePmid.match(line)
            if result:
                line = rePmid.sub('', line)
                # One output file per abstract, named by PMID
                with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
                    oFile.write(line)
                    filesWritten += 1
            else:
                print("Warning: line without PMID")
    # Write the collected gene dictionary, one name per line
    with open(os.path.join(options.dicPath, options.dicFile), "w", encoding="utf-8", errors="replace") as dFile:
        for gene in hashGenes.keys():
            dFile.write("{}\n".format(gene))
    print("Files written: {}".format(filesWritten))
    print("Done in: %fs" % (time() - t0))
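As a quick illustration, a minimal sketch (made-up PMID and gene names) of how the three regular expressions above process one input line:

sample = "12345678|t|The <g>marA</g> regulator activates <g>micF</g>."
# reGene.findall(sample)  -> ['marA', 'micF']   (counted into hashGenes)
# reTags.sub('', sample)  -> "12345678|t|The marA regulator activates micF."
# rePmid then captures "12345678", so the untagged text goes to 12345678.txt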
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
__author__ = 'CMendezC'
# Objective: join transformed files to build the training and test data sets
# Parameters:
# 1) --inputPath Path to read files.
# 2) --trainingFile File name for training data.
# 3) --testFile File name for test data.
# 4) --outputPath Path to write files.
# Output:
# 1) Files created.
# Execution:
# python3.4 prepare-training-test.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# python3.4 prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File for training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File for test examples", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)
    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File for training examples: " + str(options.trainingFile))
    print("File for test examples: " + str(options.testFile))
    print("Path to write output files: " + str(options.outputPath))
    t0 = time()
    trainingDataset = []
    testDataset = []
    # The first 70 files found go to training, files 71-100 to test.
    # Note this assumes exactly 100 input files and relies on os.walk order.
    counter = 1
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            if counter <= 70:
                print("   Joining file {} {} to training data set".format(counter, file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        trainingDataset.append(line)
            elif counter > 70 and counter <= 100:
                print("   Joining file {} {} to test data set".format(counter, file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        testDataset.append(line)
            counter += 1
    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in trainingDataset:
            oFile.write("{}\n".format(line))
    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in testDataset:
            oFile.write("{}\n".format(line))
    print("Done in: %fs" % (time() - t0))
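The hard-coded 70/100 boundary gives a true 70/30 split only when the transformed directory holds exactly 100 files, and the assignment depends on os.walk order. A minimal sketch of a proportional alternative (the split_files helper and the 0.7 ratio are illustrative, not part of the original script):

import os

def split_files(inputPath, ratio=0.7):
    # Sort for a deterministic split instead of relying on os.walk order
    allFiles = sorted(
        os.path.join(path, f)
        for path, dirs, files in os.walk(inputPath)
        for f in files
    )
    cut = int(len(allFiles) * ratio)
    return allFiles[:cut], allFiles[cut:]

trainingFiles, testFiles = split_files(options.inputPath)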
@@ -49,12 +49,6 @@ from nltk.corpus import stopwords
#################################
#           FUNCTIONS           #
#################################

# True if the word ends with a character that is not a vowel, an uppercase
# letter, or a digit (e.g., a lowercase consonant)
def endsConLow(word):
    miregex = re.compile(r'[^aeiouA-Z0-9]$')
    if miregex.search(word):
        return True
    else:
        return False
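# Illustrative behavior (made-up tokens):
#   endsConLow('operon')  -> True   (ends with the lowercase consonant 'n')
#   endsConLow('marA')    -> False  (ends with an uppercase letter)
#   endsConLow('sigma70') -> False  (ends with a digit)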
def word2features(sent, i):
    listElem = sent[i].split('|')
@@ -63,21 +57,9 @@ def word2features(sent, i):
    postag = listElem[2]
    features = {
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        # 'word.isupper()': word.isupper(),
        'word': word,
        'lemma': lemma,
        'postag': postag,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[:1]': word[:1],
        'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
        # 'lemma': lemma,
        # 'postag': postag,
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
@@ -86,8 +68,8 @@ def word2features(sent, i):
    postag1 = listElem[2]
    features.update({
        '-1:word': word1,
        '-1:lemma': lemma1,
        '-1:postag': postag1,
        # '-1:lemma': lemma1,
        # '-1:postag': postag1,
    })
    if i < len(sent) - 1:
@@ -97,55 +79,10 @@ def word2features(sent, i):
    postag1 = listElem[2]
    features.update({
        '+1:word': word1,
        '+1:lemma': lemma1,
        '+1:postag': postag1,
        # '+1:lemma': lemma1,
        # '+1:postag': postag1,
    })
    '''
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word': word2,
            '-2:lemma': lemma2,
        })
    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word': word2,
            '+2:lemma': lemma2,
        })
    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word': word3,
                '-3:lemma': lemma3,
            })
        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word': word3,
                '+3:lemma': lemma3,
            })
    '''
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
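For intuition, a minimal sketch (the word|lemma|postag triples are invented) of the feature dictionary built for the middle token; exactly which keys appear depends on which side of the diff above is active:

sent = ['The|the|DT', 'fliA|flia|NN', 'gene|gene|NN']
features = word2features(sent, 1)
# features includes, among others:
# {'word': 'fliA', 'lemma': 'flia', 'postag': 'NN',
#  'word[-3:]': 'liA', 'word[:2]': 'fl',
#  'endsConLow()=False': False,
#  '-1:word': 'The', '+1:word': 'gene', ...}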
@@ -271,24 +208,13 @@ if __name__ == "__main__":
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Fixed parameters
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     c1=0.1,
    #     c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )

    # Hyperparameter optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }
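The exponential priors above bias the randomized search toward small regularization values, with means of 0.5 for c1 and 0.05 for c2 (the same priors used in the sklearn-crfsuite tutorial). A quick illustrative check of typical draws (output is random):

import scipy.stats
print(scipy.stats.expon(scale=0.5).rvs(size=5))   # typical c1 candidates, mean ~0.5
print(scipy.stats.expon(scale=0.05).rvs(size=5))  # typical c2 candidates, mean ~0.05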
    # Original: labels = list(crf.classes_)
    # Original: labels.remove('O')
@@ -298,18 +224,8 @@ if __name__ == "__main__":
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)
    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            # n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)
    # Fixed parameters
    # crf.fit(X_train, y_train)
    crf.fit(X_train, y_train)
    # Best hyperparameters
    # crf = rs.best_estimator_
@@ -319,16 +235,13 @@ if __name__ == "__main__":
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
        # A plain CRF has no best_params_/best_score_; report what the fitted
        # model actually exposes
        oFile.write('params:' + str(crf.get_params()) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(crf.size_ / 1000000))
    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_
    # Saving model
    print("    Saving training model...")
    t1 = time()
@@ -337,7 +250,7 @@ if __name__ == "__main__":
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print("    Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    # Evaluation against evaluation data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(