Carlos-Francisco Méndez-Cruz

Conditional Random Fields

# -*- coding: UTF-8 -*-

import os

from optparse import OptionParser
from time import time

from sklearn.externals import joblib

import sklearn_crfsuite

from nltk.corpus import stopwords
from trainingTesting_Sklearn_crfsuite import sent2features
# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
# from trainingTesting_Sklearn_crfsuite import hasDigit

# Objective
# Tag transformed files with a CRF model using sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH      Path of transformed files (tokens in word|lemma|postag format)
# --modelPath=PATH      Path to read the CRF model
# --modelName=NAME      Model name
# --outputPath=PATH     Output path to place output files
# --filterStopWords     Filter stop words
# --filterSymbols       Filter punctuation marks
#
# Output
# 1) Tagged files in transformed format
#
# Examples
# Sentences
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
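
# Each input line is one sentence of whitespace-separated tokens, each token in
# word|lemma|postag form. A hypothetical example line (not taken from the corpus):
# FhlA|fhla|NN activates|activate|VBZ transcription|transcription|NN .|.|.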

#################################
#           FUNCTIONS           #
#################################
def word2features(sent, i):
    # Build the feature dict for token i; each token is a word|lemma|postag string.
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]

    features = {
        # Suffixes
        # 'word[-3:]': word[-3:],
        # 'word[-2:]': word[-2:],
        # 'word[-1:]': word[-1:],
        # 'word.isupper()': word.isupper(),
        # 'word': word,
        # 'lemma': lemma,
        # 'postag': postag,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        # Prefixes (feature names kept exactly as used at training time)
        'lemma[+3:]': lemma[:3],
        'lemma[+2:]': lemma[:2],
        'lemma[+1:]': lemma[:1],
        # 'word[:3]': word[:3],
        # 'word[:2]': word[:2],
        # 'word[:1]': word[:1],
        # 'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            # '-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
        })

    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            # '+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
        })

    '''
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word': word2,
            '-2:lemma': lemma2,
        })

    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word': word2,
            '+2:lemma': lemma2,
        })

    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word': word3,
                '-3:lemma': lemma3,
            })

        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word': word3,
                '+3:lemma': lemma3,
            })
    '''
    return features

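# The imported sent2features is assumed to follow the usual sklearn-crfsuite
# idiom of mapping a sentence to one feature dict per token; a minimal sketch
# of that assumption (kept commented out so the import above is not shadowed):
# def sent2features(sent):
#     return [word2features(sent, i) for i in range(len(sent))]
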
__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of transformed files to tag", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--modelPath", dest="modelPath",
                      help="Path to read CRF model",
                      metavar="PATH")
    parser.add_option("--modelName", dest="modelName",
                      help="Model name", metavar="TEXT")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filter stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filter punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments were given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    print("Model name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # English stop words, decoded once (named stopWords so the nltk stopwords module is not shadowed)
    stopWords = [word.decode('utf-8') for word in stopwords.words('english')]

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    print('Processing corpus...')
    t0 = time()
    # labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print(" Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputData = []
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                for line in lines:
                    listLine = []
                    # line = line.decode("utf-8")
                    for token in line.strip('\n').split():
                        if options.filterStopWords:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            # Original: if lemma in stopwords.words('english'):
                            if lemma in stopWords:
                                continue
                        if options.filterSymbols:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Comma , identified")
                                continue
                        listLine.append(token)
                    sentencesInputData.append(listLine)
            print(" Sentences input data: " + str(len(sentencesInputData)))
            # print(sentencesInputData[0])
            # print(sent2features(sentencesInputData[0])[0])
            # print(sent2labels(sentencesInputData[0]))
            X_input = [sent2features(s) for s in sentencesInputData]
            print(sent2features(sentencesInputData[0])[0])
            # y_test = [sent2labels(s) for s in sentencesInputData]
            # Predicting tags
            t1 = time()
            print(" Predicting tags with model")
            y_pred = crf.predict(X_input)
            # print(y_pred[0])
            print(" Prediction done in: %fs" % (time() - t1))
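
            # y_pred is assumed to follow the sklearn-crfsuite convention: one
            # list of predicted tags per sentence, aligned with the (possibly
            # filtered) tokens, e.g. a hypothetical y_pred[0] == ['O', 'TF', 'O']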

            # Tagging with CRF model
            print(" Tagging file")
            for line, tagLine in zip(lines, y_pred):
                outputLine = ''
                idx_tagLine = 0
                line = line.strip('\n')
                print("\nLine: " + str(line))
                print("CRF tagged line: " + str(tagLine))
                for token in line.split():
                    listToken = token.split('|')
                    word = listToken[0]
                    lemma = listToken[1]
                    tag = listToken[2]
                    # Filtered tokens keep their original tag and do not consume a prediction
                    if options.filterStopWords:
                        if lemma in stopWords:
                            outputLine += token + ' '
                            continue
                    if options.filterSymbols:
                        if lemma in symbols:
                            if lemma == ',':
                                print("Comma , identified")
                            outputLine += token + ' '
                            continue
                    # Replace the original tag with the CRF prediction
                    CRFtag = tagLine[idx_tagLine]
                    outputLine += word + '|' + lemma + '|' + CRFtag + ' '
                    idx_tagLine += 1
                sentencesOutputData.append(outputLine.rstrip())
            with open(os.path.join(options.outputPath, file), "w") as oFile:
                for line in sentencesOutputData:
                    oFile.write(line + '\n')

    print("Processing corpus done in: %fs" % (time() - t0))