Conditional Random Fields

Carlos-Francisco Méndez-Cruz
Commit a6dee85cbb441c215afdec281daabfa2a95a9bd1 a6dee85c 1 parent bebea4ad
Showing 1 changed file with 280 additions and 0 deletions
tagging_Sklearn_crfsuite.py
--- a/tagging_Sklearn_crfsuite.py 0 → 100644
View file @a6dee85
+++ b/tagging_Sklearn_crfsuite.py 0 → 100644
View file @a6dee85
+ # -*- coding: UTF-8 -*-
+ 
+ import os
+ from itertools import chain
+ from optparse import OptionParser
+ from time import time
+ from collections import Counter
+ 
+ import nltk
+ import sklearn
+ import scipy.stats
+ import sys
+ 
+ from sklearn.externals import joblib
+ from sklearn.metrics import make_scorer
+ from sklearn.cross_validation import cross_val_score
+ from sklearn.grid_search import RandomizedSearchCV
+ 
+ import sklearn_crfsuite
+ from sklearn_crfsuite import scorers
+ from sklearn_crfsuite import metrics
+ 
+ from nltk.corpus import stopwords
+ from trainingTesting_Sklearn_crfsuite import word2features
+ from trainingTesting_Sklearn_crfsuite import sent2features
+ # from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
+ # from trainingTesting_Sklearn_crfsuite import hasDigit
+ 
+ # Objective
+ # Tagging transformed file with CRF model with sklearn-crfsuite.
+ #
+ # Input parameters
+ # --inputPath=PATH    Path of transformed files x|y|z
+ # --modelPath        Path to CRF model
+ # --modelName    Model name
+ # --outputPath=PATH    Output path to place output files
+ # --filterStopWords   Filtering stop words
+ # --filterSymbols      Filtering punctuation marks
+ 
+ # Output
+ # 1) Tagged files in transformed format
+ 
+ # Examples
+ # Sentences
+ # C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
+ # C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
+ 
+ #################################
+ #           FUNCTIONS           #
+ #################################
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    ``sent`` is a sequence of tokens encoded as ``word|lemma|postag``
    strings. The returned dictionary holds:

    * the last 3/2/1 characters of the lemma (``lemma[-N:]`` keys),
    * the first 3/2/1 characters of the lemma (``lemma[+N:]`` keys),
    * lemma and POS tag of the previous token (``-1:`` keys), when there
      is one,
    * lemma and POS tag of the next token (``+1:`` keys), when there is one.

    Only the immediate neighbours contribute context features.

    Raises IndexError when the current token or a used neighbour has fewer
    than three ``|``-separated fields.

    NOTE(review): the file also imports a ``word2features`` from
    ``trainingTesting_Sklearn_crfsuite``; this definition shadows that name
    in this module -- confirm which one is meant to be used.
    """
    fields = sent[i].split('|')
    # The POS tag of the current token is read but not used as a feature;
    # reading it keeps the failure on tokens with fewer than three fields.
    lemma, _postag = fields[1], fields[2]

    features = {}
    # Suffixes of the lemma
    features['lemma[-3:]'] = lemma[-3:]
    features['lemma[-2:]'] = lemma[-2:]
    features['lemma[-1:]'] = lemma[-1:]
    # Prefixes of the lemma (the key names use "+N" for "first N characters")
    features['lemma[+3:]'] = lemma[:3]
    features['lemma[+2:]'] = lemma[:2]
    features['lemma[+1:]'] = lemma[:1]

    # Left context: previous token, unless this is the first one
    if i > 0:
        previous = sent[i - 1].split('|')
        features['-1:lemma'] = previous[1]
        features['-1:postag'] = previous[2]

    # Right context: next token, unless this is the last one
    lastIndex = len(sent) - 1
    if i < lastIndex:
        following = sent[i + 1].split('|')
        features['+1:lemma'] = following[1]
        features['+1:postag'] = following[2]

    return features
+ 
__author__ = 'CMendezC'


def _isFiltered(token, stopWordSet, symbolSet):
    """Return True when ``token`` must be hidden from the CRF model.

    ``token`` is a ``word|lemma|tag`` string; only its lemma is checked.
    Pass an empty set to disable the corresponding filter.
    """
    lemma = token.split('|')[1]
    if lemma in stopWordSet:
        return True
    if lemma in symbolSet:
        if lemma == ',':
            # Debug trace kept from the original script
            # (Spanish for "comma identified").
            print("Coma , identificada")
        return True
    return False


def _visibleTokens(line, stopWordSet, symbolSet):
    """Return the tokens of ``line`` that are passed to the CRF model."""
    return [token for token in line.strip('\n').split()
            if not _isFiltered(token, stopWordSet, symbolSet)]


def _tagLine(line, tagLine, stopWordSet, symbolSet, labels):
    """Merge the CRF predictions ``tagLine`` back into ``line``.

    Filtered tokens are copied unchanged and consume no prediction. For every
    other token the next prediction is taken; it replaces the token's tag
    only when the token does not already carry one of ``labels`` and the
    prediction is not ``'O'``.

    Returns the rebuilt line (tokens joined by single spaces). Raises
    IndexError when ``tagLine`` has fewer predictions than the line has
    non-filtered tokens.
    """
    outputTokens = []
    idx_tagLine = 0
    for token in line.split():
        if _isFiltered(token, stopWordSet, symbolSet):
            outputTokens.append(token)
            continue
        listToken = token.split('|')
        word, lemma, tag = listToken[0], listToken[1], listToken[2]
        CRFtag = tagLine[idx_tagLine]
        # One prediction per non-filtered token: keep both sequences aligned.
        idx_tagLine += 1
        if (tag not in labels) and (CRFtag != 'O'):
            print("*** CRF change token {} to {}".format(token, CRFtag))
            outputTokens.append(word + '|' + lemma + '|' + CRFtag)
        else:
            outputTokens.append(token)
    return ' '.join(outputTokens)


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of transformed files x|y|z", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--modelPath", dest="modelPath",
                      help="Path to read CRF model",
                      metavar="PATH")
    parser.add_option("--modelName", dest="modelName",
                      help="Model name", metavar="TEXT")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filtering stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filtering punctuation marks")

    (options, args) = parser.parse_args()
    # parser.error() prints the message and exits by itself.
    if args:
        parser.error("Unexpected positional arguments: " + ' '.join(args))
    for requiredOption in ("inputPath", "outputPath", "modelPath", "modelName"):
        if getattr(options, requiredOption) is None:
            parser.error("Missing required parameter --" + requiredOption)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    print("Model name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # Sets give constant-time membership tests; an empty set disables a filter.
    # The stop-word list is only loaded when it is needed, and byte strings
    # are decoded while text strings are kept as they are.
    stopWordSet = frozenset()
    if options.filterStopWords:
        stopWordSet = frozenset(
            word.decode('utf-8') if isinstance(word, bytes) else word
            for word in stopwords.words('english'))
    symbolSet = frozenset(symbols) if options.filterSymbols else frozenset()

    # Tags that are already final: tokens carrying one of them keep their tag.
    # NOTE(review): list and merge rule restored from the commented-out code
    # of the original script -- confirm this is the intended behaviour.
    labels = ['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    # NOTE: joblib.load unpickles the file; only load models from a trusted source.
    crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    print('Processing corpus...')
    t0 = time()
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # Mirror the input sub-directory below the output path so that files
        # found in nested directories can be read and do not overwrite each other.
        relativeDir = os.path.relpath(path, options.inputPath)
        outputDir = os.path.normpath(os.path.join(options.outputPath, relativeDir))
        # For each file in dir
        for fileName in files:
            print("   Preprocessing file..." + str(fileName))
            with open(os.path.join(path, fileName), "r") as iFile:
                lines = iFile.readlines()

            # One sentence per line, without the filtered tokens
            sentencesInputData = [_visibleTokens(line, stopWordSet, symbolSet)
                                  for line in lines]
            print("   Sentences input data: " + str(len(sentencesInputData)))

            X_input = [sent2features(s) for s in sentencesInputData]
            # Show the features of the first token as a sanity check, if any.
            if X_input and X_input[0]:
                print(X_input[0][0])

            # Predicting tags
            t1 = time()
            print("   Predicting tags with model")
            y_pred = crf.predict(X_input) if X_input else []
            print("      Prediction done in: %fs" % (time() - t1))

            # Tagging with CRF model
            print("   Tagging file")
            sentencesOutputData = []
            for line, tagLine in zip(lines, y_pred):
                line = line.strip('\n')
                print("\nLine: " + str(line))
                print("CRF tagged line: " + str(tagLine))
                sentencesOutputData.append(
                    _tagLine(line, tagLine, stopWordSet, symbolSet, labels))

            if not os.path.isdir(outputDir):
                os.makedirs(outputDir)
            with open(os.path.join(outputDir, fileName), "w") as oFile:
                oFile.writelines(outputLine + '\n' for outputLine in sentencesOutputData)

    print("Processing corpus done in: %fs" % (time() - t0))