Showing 1 changed file with 280 additions and 0 deletions

tagging_Sklearn_crfsuite.py 0 → 100644
# -*- coding: UTF-8 -*-

import os
import sys
from optparse import OptionParser
from time import time

from sklearn.externals import joblib

# The serialized model is a sklearn_crfsuite CRF estimator
import sklearn_crfsuite

from nltk.corpus import stopwords
from trainingTesting_Sklearn_crfsuite import sent2features
# Objective
# Tag transformed files with a CRF model using sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH       Path of transformed files (token format: word|lemma|postag)
# --modelPath            Path to the CRF model
# --modelName            Model name
# --outputPath=PATH      Output path to place output files
# --filterStopWords      Filter stop words
# --filterSymbols        Filter punctuation marks

# Output
# 1) Tagged files in transformed format

# Examples
# Sentences
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt

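# Illustrative example (hypothetical sentence) of the transformed input format:
# each line is a sentence of whitespace-separated tokens carrying three
# pipe-separated fields, where the third field is read as the POS tag during
# feature extraction:
#   FhlA|FhlA|NNP activates|activate|VBZ transcription|transcription|NN .|.|.
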
#################################
#           FUNCTIONS           #
#################################
def word2features(sent, i):
    # Note: sent2features (imported above) uses the word2features defined in
    # trainingTesting_Sklearn_crfsuite, not this local copy; the feature names
    # here must mirror the ones used at training time.
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]

    features = {
        # Suffixes
        #'word[-3:]': word[-3:],
        #'word[-2:]': word[-2:],
        #'word[-1:]': word[-1:],
        #'word.isupper()': word.isupper(),
        #'word': word,
        #'lemma': lemma,
        #'postag': postag,
        # Lemma suffixes; the '[+N:]' keys actually hold prefixes
        # (key names must match those used when the model was trained)
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'lemma[+3:]': lemma[:3],
        'lemma[+2:]': lemma[:2],
        'lemma[+1:]': lemma[:1],
        #'word[:3]': word[:3],
        #'word[:2]': word[:2],
        #'word[:1]': word[:1],
        #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
    }
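    # Illustrative only: for a hypothetical token 'protein|protein|NN',
    # the dict built above would be:
    #   {'lemma[-3:]': 'ein', 'lemma[-2:]': 'in', 'lemma[-1:]': 'n',
    #    'lemma[+3:]': 'pro', 'lemma[+2:]': 'pr', 'lemma[+1:]': 'p'}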
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
        })

    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
        })

    '''
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word': word2,
            '-2:lemma': lemma2,
        })

    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word': word2,
            '+2:lemma': lemma2,
        })

    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word': word3,
                '-3:lemma': lemma3,
            })

        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word': word3,
                '+3:lemma': lemma3,
            })
    '''
    return features

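# For reference, a minimal sketch of the imported sent2features, assuming it
# follows the standard sklearn-crfsuite pattern (the actual definition lives
# in trainingTesting_Sklearn_crfsuite and is the one used below):
#
#   def sent2features(sent):
#       return [word2features(sent, i) for i in range(len(sent))]
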
__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of transformed files to tag", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--modelPath", dest="modelPath",
                      help="Path to read CRF model",
                      metavar="PATH")
    parser.add_option("--modelName", dest="modelName",
                      help="Model name", metavar="TEXT")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filter stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filter punctuation marks")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2
        parser.error("This script does not accept positional arguments.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    print("Model name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # Cache English stop words once instead of calling stopwords.words() per token
    # (rebinding 'stopwords' shadows the imported corpus module, no longer needed)
    stopwords = [word.decode('utf-8') for word in stopwords.words('english')]

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))
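    # Assumption: the .mod file is a fitted sklearn_crfsuite CRF estimator
    # serialized by the training script with joblib.dump(crf, path); loading it
    # restores a model whose predict() accepts lists of per-sentence feature
    # dict sequences, as built below.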

    print('Processing corpus...')
    t0 = time()
    # labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print(" Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputData = []
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                for line in lines:
                    listLine = []
                    # line = line.decode("utf-8")
                    for token in line.strip('\n').split():
                        if options.filterStopWords:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            # Original: if lemma in stopwords.words('english'):
                            if lemma in stopwords:
                                continue
                        if options.filterSymbols:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Comma , identified")
                                continue
                        listLine.append(token)
                    sentencesInputData.append(listLine)
            print(" Sentences input data: " + str(len(sentencesInputData)))
            # print sentencesInputData[0]
            # print(sent2features(sentencesInputData[0])[0])
            # print(sent2labels(sentencesInputData[0]))
            X_input = [sent2features(s) for s in sentencesInputData]
            print(sent2features(sentencesInputData[0])[0])
            # y_test = [sent2labels(s) for s in sentencesInputData]
            # Predicting tags
            t1 = time()
            print(" Predicting tags with model")
            y_pred = crf.predict(X_input)
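            # y_pred holds one tag sequence per sentence, each a list of label
            # strings aligned with the (filtered) tokens, e.g. a hypothetical
            # result: [['O', 'TF', 'O'], ['O', 'O']]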
            #print y_pred[0]
            print(" Prediction done in: %fs" % (time() - t1))

            # Tagging with CRF model
            print(" Tagging file")
            for line, tagLine in zip(lines, y_pred):
                outputLine = ''
                idx_tagLine = 0
                line = line.strip('\n')
                print("\nLine: " + str(line))
                print("CRF tagged line: " + str(tagLine))
                for token in line.split():
                    listToken = token.split('|')
                    word = listToken[0]
                    lemma = listToken[1]
                    tag = listToken[2]
                    # Filtered tokens were not tagged by the CRF, so copy them
                    # through without consuming a predicted tag
                    if options.filterStopWords:
                        if lemma in stopwords:
                            outputLine += token + ' '
                            continue
                    if options.filterSymbols:
                        if lemma in symbols:
                            if lemma == ',':
                                print("Comma , identified")
                            outputLine += token + ' '
                            continue
                    # Replace the original tag with the CRF-predicted tag
                    CRFtag = tagLine[idx_tagLine]
                    #if (tag not in labels) and (CRFtag != 'O'):
                    #    outputLine += word + '|' + lemma + '|' + CRFtag + ' '
                    #else:
                    #    outputLine += word + '|' + lemma + '|' + tag + ' '
                    outputLine += word + '|' + lemma + '|' + CRFtag + ' '
                    idx_tagLine += 1
                sentencesOutputData.append(outputLine.rstrip())
            with open(os.path.join(options.outputPath, file), "w") as oFile:
                for line in sentencesOutputData:
                    oFile.write(line + '\n')

    print("Processing corpus done in: %fs" % (time() - t0))