Carlos-Francisco Méndez-Cruz

Conditional Random Fields

# -*- coding: UTF-8 -*-
import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
from trainingTesting_Sklearn_crfsuite import word2features
from trainingTesting_Sklearn_crfsuite import sent2features
# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
# from trainingTesting_Sklearn_crfsuite import hasDigit
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --modelPath Path to CRF model
# --modelName Model name
# --outputPath=PATH Output path to place output files
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# Sentences
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
#################################
# FUNCTIONS #
#################################
def word2features(sent, i):
listElem = sent[i].split('|')
word = listElem[0]
lemma = listElem[1]
postag = listElem[2]
features = {
# Suffixes
#'word[-3:]': word[-3:],
#'word[-2:]': word[-2:],
#'word[-1:]': word[-1:],
#'word.isupper()': word.isupper(),
#'word': word,
#'lemma': lemma,
#'postag': postag,
'lemma[-3:]': lemma[-3:],
'lemma[-2:]': lemma[-2:],
'lemma[-1:]': lemma[-1:],
'lemma[+3:]': lemma[:3],
'lemma[+2:]': lemma[:2],
'lemma[+1:]': lemma[:1],
#'word[:3]': word[:3],
#'word[:2]': word[:2],
#'word[:1]': word[:1],
#'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
}
if i > 0:
listElem = sent[i - 1].split('|')
word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
#'-1:word': word1,
'-1:lemma': lemma1,
'-1:postag': postag1,
})
if i < len(sent) - 1:
listElem = sent[i + 1].split('|')
word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
#'+1:word': word1,
'+1:lemma': lemma1,
'+1:postag': postag1,
})
'''
if i > 1:
listElem = sent[i - 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'-2:word': word2,
'-2:lemma': lemma2,
})
if i < len(sent) - 2:
listElem = sent[i + 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'+2:word': word2,
'+2:lemma': lemma2,
})
trigrams = False
if trigrams:
if i > 2:
listElem = sent[i - 3].split('|')
word3 = listElem[0]
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'-3:word': word3,
'-3:lemma': lemma3,
})
if i < len(sent) - 3:
listElem = sent[i + 3].split('|')
word3 = listElem[0]
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'+3:word': word3,
'+3:lemma': lemma3,
})
'''
return features
__author__ = 'CMendezC'
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path to place output files",
metavar="PATH")
parser.add_option("--modelPath", dest="modelPath",
help="Path to read CRF model",
metavar="PATH")
parser.add_option("--modelName", dest="modelName",
help="Model name", metavar="TEXT")
parser.add_option("--filterStopWords", default=False,
action="store_true", dest="filterStopWords",
help="Filtering stop words")
parser.add_option("--filterSymbols", default=False,
action="store_true", dest="filterSymbols",
help="Filtering punctuation marks")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + options.inputPath)
print("Mode name: " + str(options.modelName))
print("Model path: " + options.modelPath)
print("Path to place output files: " + options.outputPath)
print("Filtering stop words: " + str(options.filterStopWords))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
# symbols = [sym.decode('utf-8') for sym in ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
# '}', '[', ']', '*', '%', '$', '#', '&', '°']]
# symbols = [u'.', u',', u':', u';', u'?', u'!', u'\'', u'"', u'<', u'>', u'(', u')', u'-', u'_', u'/', u'\\', u'¿', u'¡', u'+', u'{',
# u'}', u'[', u']', u'*', u'%', u'$', u'#', u'&', u'°', u'`']
print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
print('-------------------------------- PROCESSING --------------------------------')
stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
# Read CRF model
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
print('Processing corpus...')
t0 = time()
# labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print(" Preprocessing file..." + str(file))
sentencesInputData = []
sentencesOutputData = []
with open(os.path.join(options.inputPath, file), "r") as iFile:
lines = iFile.readlines()
for line in lines:
listLine = []
# line = line.decode("utf-8")
for token in line.strip('\n').split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original if lemma in stopwords.words('english'):
if lemma in stopwords:
continue
if options.filterSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
if lemma == ',':
print("Coma , identificada")
continue
listLine.append(token)
sentencesInputData.append(listLine)
print(" Sentences input data: " + str(len(sentencesInputData)))
# print sentencesInputData[0]
# print(sent2features(sentencesInputData[0])[0])
# print(sent2labels(sentencesInputData[0]))
X_input = [sent2features(s) for s in sentencesInputData]
print(sent2features(sentencesInputData[0])[0])
# y_test = [sent2labels(s) for s in sentencesInputData]
# Predicting tags
t1 = time()
print(" Predicting tags with model")
y_pred = crf.predict(X_input)
#print y_pred[0]
print(" Prediction done in: %fs" % (time() - t1))
exit
# Tagging with CRF model
print(" Tagging file")
for line, tagLine in zip(lines, y_pred):
outputLine = ''
idx_tagLine = 0
line = line.strip('\n')
print("\nLine: " + str(line))
print ("CRF tagged line: " + str(tagLine))
for token in line.split():
listToken = token.split('|')
word = listToken[0]
lemma = listToken[1]
tag = listToken[2]
if options.filterStopWords:
if lemma in stopwords:
outputLine += token + ' '
continue
if options.filterSymbols:
if lemma in symbols:
if lemma == ',':
print("Coma , identificada")
outputLine += token + ' '
continue
CRFtag = tagLine[idx_tagLine]
#if (tag not in labels) and (CRFtag != 'O'):
# print "*** CRF change token {} to {}".format(token, CRFtag)
# outputLine += word + '|' + lemma + '|' + CRFtag + ' '
#else:
# outputLine += word + '|' + lemma + '|' + tag + ' '
#idx_tagLine += 1
sentencesOutputData.append(outputLine.rstrip())
with open(os.path.join(options.outputPath, file), "w") as oFile:
for line in sentencesOutputData:
oFile.write(line + '\n')
print("Processing corpus done in: %fs" % (time() - t0))