Showing 1 changed file with 280 additions and 0 deletions
tagging_Sklearn_crfsuite.py
0 → 100644
# -*- coding: UTF-8 -*-

import os
import sys
from optparse import OptionParser
from time import time

from sklearn.externals import joblib

import sklearn_crfsuite

from nltk.corpus import stopwords
from trainingTesting_Sklearn_crfsuite import word2features
from trainingTesting_Sklearn_crfsuite import sent2features
# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
# from trainingTesting_Sklearn_crfsuite import hasDigit
# Objective
# Tag transformed files with a CRF model trained with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH      Path of transformed files (one sentence per line, tokens as word|lemma|tag)
# --modelPath           Path to read CRF model
# --modelName           Model name
# --outputPath=PATH     Output path to place output files
# --filterStopWords     Filter stop words
# --filterSymbols       Filter punctuation marks

# Output
# 1) Tagged files in transformed format

# Examples
# Sentences
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
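
# Expected input line format (hypothetical example; one sentence per line,
# space-separated tokens, each token as word|lemma|tag):
# The|the|DT FhlA|fhla|TF protein|protein|NN activates|activate|VBZ ...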

#################################
#           FUNCTIONS           #
#################################
def word2features(sent, i):
    # Note: this local definition shadows the word2features imported above.
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]

    features = {
        # Suffixes
        #'word[-3:]': word[-3:],
        #'word[-2:]': word[-2:],
        #'word[-1:]': word[-1:],
        #'word.isupper()': word.isupper(),
        #'word': word,
        #'lemma': lemma,
        #'postag': postag,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        # Prefixes
        'lemma[+3:]': lemma[:3],
        'lemma[+2:]': lemma[:2],
        'lemma[+1:]': lemma[:1],
        #'word[:3]': word[:3],
        #'word[:2]': word[:2],
        #'word[:1]': word[:1],
        #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
        })

    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
        })

    '''
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word': word2,
            '-2:lemma': lemma2,
        })

    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word': word2,
            '+2:lemma': lemma2,
        })

    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word': word3,
                '-3:lemma': lemma3,
            })

        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word': word3,
                '+3:lemma': lemma3,
            })
    '''
    return features

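# The companion module presumably defines sent2features as the standard
# sklearn-crfsuite helper. A local sketch is given here so that the local
# word2features above is the one actually applied; the imported sent2features
# would otherwise call the companion module's own word2features.
def sent2features(sent):
    # Build one feature dict per token of the word|lemma|tag sequence.
    return [word2features(sent, i) for i in range(len(sent))]

# Illustrative output (hypothetical input):
# sent = ['Transcription|transcription|NN', 'factor|factor|NN']
# word2features(sent, 0) ->
# {'lemma[-3:]': 'ion', 'lemma[-2:]': 'on', 'lemma[-1:]': 'n',
#  'lemma[+3:]': 'tra', 'lemma[+2:]': 'tr', 'lemma[+1:]': 't',
#  '+1:lemma': 'factor', '+1:postag': 'NN'}
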
__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of transformed input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--modelPath", dest="modelPath",
                      help="Path to read CRF model",
                      metavar="PATH")
    parser.add_option("--modelName", dest="modelName",
                      help="Model name", metavar="TEXT")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filter stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filter punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    print("Model name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    # symbols = [sym.decode('utf-8') for sym in ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
    #            '}', '[', ']', '*', '%', '$', '#', '&', '°']]
    # symbols = [u'.', u',', u':', u';', u'?', u'!', u'\'', u'"', u'<', u'>', u'(', u')', u'-', u'_', u'/', u'\\', u'¿', u'¡', u'+', u'{',
    #            u'}', u'[', u']', u'*', u'%', u'$', u'#', u'&', u'°', u'`']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # English stop words, decoded once up front (named stopWords to avoid
    # shadowing the imported nltk stopwords module)
    stopWords = [word.decode('utf-8') for word in stopwords.words('english')]

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))
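
    # The model file is assumed to have been persisted by the training script
    # with joblib, e.g. (hypothetical, mirroring the load call above):
    #   joblib.dump(crf, os.path.join(modelPath, 'models', modelName + '.mod'))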

    print('Processing corpus...')
    t0 = time()
    # Labels assigned by the CRF model (used below to decide when to overwrite a tag)
    labels = ['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("   Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputData = []
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                for line in lines:
                    listLine = []
                    # line = line.decode("utf-8")
                    for token in line.strip('\n').split():
                        if options.filterStopWords:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            # Original: if lemma in stopwords.words('english'):
                            if lemma in stopWords:
                                continue
                        if options.filterSymbols:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Comma , identified")
                                continue
                        listLine.append(token)
                    sentencesInputData.append(listLine)
                print("   Sentences input data: " + str(len(sentencesInputData)))
                # print sentencesInputData[0]
                # print(sent2features(sentencesInputData[0])[0])
                # print(sent2labels(sentencesInputData[0]))
                X_input = [sent2features(s) for s in sentencesInputData]
                print(sent2features(sentencesInputData[0])[0])
                # y_test = [sent2labels(s) for s in sentencesInputData]
                # Predicting tags
                t1 = time()
                print("   Predicting tags with model")
                y_pred = crf.predict(X_input)
                #print y_pred[0]
                print("   Prediction done in: %fs" % (time() - t1))

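                # y_pred holds one predicted tag sequence per input sentence,
                # aligned with X_input, e.g. (illustrative labels only):
                #   [['O', 'TF', 'O'], ['O', 'O', 'PRO', 'O'], ...]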
                # Tagging with CRF model
                print("   Tagging file")
                for line, tagLine in zip(lines, y_pred):
                    outputLine = ''
                    idx_tagLine = 0
                    line = line.strip('\n')
                    print("\nLine: " + str(line))
                    print("CRF tagged line: " + str(tagLine))
                    for token in line.split():
                        listToken = token.split('|')
                        word = listToken[0]
                        lemma = listToken[1]
                        tag = listToken[2]
                        if options.filterStopWords:
                            if lemma in stopWords:
                                outputLine += token + ' '
                                continue
                        if options.filterSymbols:
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Comma , identified")
                                outputLine += token + ' '
                                continue
                        CRFtag = tagLine[idx_tagLine]
                        # Replace the original tag with the CRF prediction when the
                        # token is not already labeled and the CRF proposes a label
                        if (tag not in labels) and (CRFtag != 'O'):
                            print("*** CRF change token {} to {}".format(token, CRFtag))
                            outputLine += word + '|' + lemma + '|' + CRFtag + ' '
                        else:
                            outputLine += word + '|' + lemma + '|' + tag + ' '
                        idx_tagLine += 1
                    sentencesOutputData.append(outputLine.rstrip())
            with open(os.path.join(options.outputPath, file), "w") as oFile:
                for line in sentencesOutputData:
                    oFile.write(line + '\n')

    print("Processing corpus done in: %fs" % (time() - t0))