Carlos-Francisco Méndez-Cruz

Training validation script

@@ -76,7 +76,7 @@ def word2features(sent, i):
         'word[:3]': word[:3],
         'word[:2]': word[:2],
         'word[:1]': word[:1],
-        'endsConLow()': endsConLow(word),
+        'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
         }
     '''
     if i > 0:
...
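
The hunk above changes how the endsConLow feature is emitted. The old key 'endsConLow()' mapped to a boolean value, and python-crfsuite converts boolean values into feature weights 1.0/0.0, so a False outcome contributed nothing to the model. Embedding the value in the key ('endsConLow()=True' / 'endsConLow()=False') gives CRFsuite two distinct binary features, each with its own learned weight. endsConLow itself is not defined in the file below; judging by the name ("con" being Spanish for "with"), it is presumably a predicate testing whether a token ends with a lowercase letter. A minimal sketch under that assumption:

    import re

    def endsConLow(word):
        # Hypothetical reconstruction: True if the token ends
        # with a lowercase letter.
        return re.search(r'[a-z]$', word) is not None

The rest of the diff removes the old Python 2 training and evaluation script, reproduced below.
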
-# -*- coding: UTF-8 -*-
-
-import os
-from itertools import chain
-from optparse import OptionParser
-from time import time
-from collections import Counter
-
-import nltk
-import sklearn
-import scipy.stats
-import sys
-
-from sklearn.externals import joblib
-from sklearn.metrics import make_scorer
-from sklearn.cross_validation import cross_val_score
-from sklearn.grid_search import RandomizedSearchCV
-
-import sklearn_crfsuite
-from sklearn_crfsuite import scorers
-from sklearn_crfsuite import metrics
-
-from nltk.corpus import stopwords
-
-
-# Objective
-# Training and evaluation of CRFs with sklearn-crfsuite.
-#
-# Input parameters
-# --inputPath=PATH     Path of training and test data set
-# --trainingFile       File with training data set
-# --testFile           File with test data set
-# --outputPath=PATH    Output path to place output files
-# --filterStopWords    Filter stop words
-# --filterSymbols      Filter punctuation marks
-
-# Output
-# 1) Best model
-
-# Examples
-# Sentences
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt
-
-# Aspects
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
-# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt
-
-#################################
-#           FUNCTIONS           #
-#################################
-
-def wordSize(text):
-    lWord = len(text)
-    if lWord == 1:
-        return '1'
-    elif lWord == 2:
-        return '2'
-    elif lWord == 3:
-        return '3'
-    elif lWord == 4:
-        return '4'
-    elif lWord == 5:
-        return '5'
-    elif 6 <= lWord <= 10:
-        return '6-10'
-    elif 11 <= lWord <= 15:
-        return '11-15'
-    elif 16 <= lWord <= 20:
-        return '16-20'
-    elif 21 <= lWord <= 30:
-        return '21-30'
-    else:
-        return '>30'
-
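wordSize buckets raw token lengths into a small set of categorical values, keeping the feature space bounded instead of growing one feature per exact length. For example (same function as above):

    print(wordSize("Fis"))        # '3'
    print(wordSize("regulator"))  # '6-10'
    print(wordSize("x" * 25))     # '21-30'
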
-def hasUpperLower(text):
-    has = False
-    if len(text) < 3:
-        return False
-    regexUp = nltk.re.compile('[A-Z]')
-    regexLo = nltk.re.compile('[a-z]')
-    if (regexUp.search(text) is not None) and (regexLo.search(text) is not None):
-        has = True
-    return has
-
-def hasDigit(text):
-    has = False
-    if len(text) < 3:
-        return False
-    myRegex = nltk.re.compile('[0-9]')
-    if myRegex.search(text) is not None:
-        has = True
-    return has
-
-
-def hasNonAlphaNum(text):
-    has = False
-    if len(text) < 3:
-        return False
-    myRegex = nltk.re.compile(r'\W')
-    if myRegex.search(text) is not None:
-        has = True
-    return has
-
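All three predicates share the len(text) < 3 guard: tokens shorter than three characters always report False, even when they do contain a digit or a non-alphanumeric character. A quick check of that behavior (made-up tokens, same functions as above):

    print(hasDigit("p53"))        # True  - length 3, contains a digit
    print(hasDigit("p5"))         # False - the guard fires before the regex
    print(hasNonAlphaNum("a-b"))  # True  - '-' matches \W
    print(hasNonAlphaNum("-"))    # False - the guard fires before the regex
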
-def word2features(sent, i):
-    # print "i: " + str(i)
-    # print "sent[i]: " + sent[i]
-    listElem = sent[i].split('|')
-    word = listElem[0]
-    lemma = listElem[1]
-    postag = listElem[2]
-
-    features = {
-        # TF and gene names vary in capitalization, so the word is not lowercased: 'word.lower()': word.lower(),
-        # Suffixes
-        'word[-3:]': word[-3:],
-        'word[-2:]': word[-2:],
-        'word[-1:]': word[-1:],
-        'word.isupper()': word.isupper(),
-        'word.istitle()': word.istitle(),
-        'word.hasDigit()': hasDigit(word),
-        'word.hasNonAlphaNum': hasNonAlphaNum(word),
-        # 'word.hasUpperLower': hasUpperLower(word),
-        # 'wordSize': wordSize(word),
-        # 'word.isdigit()': word.isdigit(),
-        'word': word,
-        'lemma': lemma,
-        'lemma[-3:]': lemma[-3:],
-        'lemma[-2:]': lemma[-2:],
-        'lemma[-1:]': lemma[-1:],
-        'postag': postag,
-        # Prefixes
-        'postag[:2]': postag[:2],
-        'postag[:1]': postag[:1],
-    }
-    if i > 0:
-        listElem = sent[i - 1].split('|')
-        word1 = listElem[0]
-        lemma1 = listElem[1]
-        postag1 = listElem[2]
-        features.update({
-            '-1:word.lower()': word1.lower(),
-            '-1:word.istitle()': word1.istitle(),
-            '-1:word.isupper()': word1.isupper(),
-            '-1:word.hasDigit()': hasDigit(word1),
-            '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
-            # '-1:word.hasUpperLower': hasUpperLower(word1),
-            '-1:word': word1,
-            '-1:lemma': lemma1,
-            '-1:postag': postag1,
-            '-1:postag[:2]': postag1[:2],
-            '-1:postag[:1]': postag1[:1],
-        })
-    # else:
-    #     features['BOS'] = True
-
-    if i < len(sent) - 1:
-        listElem = sent[i + 1].split('|')
-        word1 = listElem[0]
-        lemma1 = listElem[1]
-        postag1 = listElem[2]
-        features.update({
-            '+1:word.lower()': word1.lower(),
-            '+1:word.istitle()': word1.istitle(),
-            '+1:word.isupper()': word1.isupper(),
-            '+1:word.hasDigit()': hasDigit(word1),
-            '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
-            # '+1:word.hasUpperLower': hasUpperLower(word1),
-            '+1:word': word1,
-            '+1:lemma': lemma1,
-            '+1:postag': postag1,
-            '+1:postag[:2]': postag1[:2],
-            '+1:postag[:1]': postag1[:1],
-        })
-    # else:
-    #     features['EOS'] = True
-    if i > 1:
-        listElem = sent[i - 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '-2:word.lower()': word2.lower(),
-            '-2:word.istitle()': word2.istitle(),
-            '-2:word.isupper()': word2.isupper(),
-            '-2:word.hasDigit()': hasDigit(word2),
-            '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
-            # '-2:word.hasUpperLower': hasUpperLower(word2),
-            '-2:word': word2,
-            '-2:lemma': lemma2,
-            '-2:postag': postag2,
-            '-2:postag[:2]': postag2[:2],
-            '-2:postag[:1]': postag2[:1],
-        })
-
-    if i < len(sent) - 2:
-        listElem = sent[i + 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '+2:word.lower()': word2.lower(),
-            '+2:word.istitle()': word2.istitle(),
-            '+2:word.isupper()': word2.isupper(),
-            '+2:word.hasDigit()': hasDigit(word2),
-            '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
-            # '+2:word.hasUpperLower': hasUpperLower(word2),
-            '+2:word': word2,
-            '+2:lemma': lemma2,
-            '+2:postag': postag2,
-            '+2:postag[:2]': postag2[:2],
-            '+2:postag[:1]': postag2[:1],
-        })
-
-    trigrams = False
-    if trigrams:
-        if i > 2:
-            listElem = sent[i - 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '-3:word.lower()': word3.lower(),
-                '-3:word.istitle()': word3.istitle(),
-                '-3:word.isupper()': word3.isupper(),
-                '-3:word.hasDigit()': hasDigit(word3),
-                '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
-                # '-3:word.hasUpperLower': hasUpperLower(word3),
-                '-3:word': word3,
-                '-3:lemma': lemma3,
-                '-3:postag': postag3,
-                '-3:postag[:2]': postag3[:2],
-                '-3:postag[:1]': postag3[:1],
-            })
-
-        if i < len(sent) - 3:
-            listElem = sent[i + 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '+3:word.lower()': word3.lower(),
-                '+3:word.istitle()': word3.istitle(),
-                '+3:word.isupper()': word3.isupper(),
-                '+3:word.hasDigit()': hasDigit(word3),
-                '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
-                # '+3:word.hasUpperLower': hasUpperLower(word3),
-                '+3:word': word3,
-                '+3:lemma': lemma3,
-                '+3:postag': postag3,
-                '+3:postag[:2]': postag3[:2],
-                '+3:postag[:1]': postag3[:1],
-            })
-
-    return features
-
-
-def sent2features(sent):
-    return [word2features(sent, i) for i in range(len(sent))]
-
-
-def sent2labels(sent):
-    return [elem.split('|')[3] for elem in sent]
-    # return [label for token, postag, label in sent]
-
-
-def sent2tokens(sent):
-    # Tokens are pipe-delimited strings, so take the first field; the original
-    # tuple unpacking ([token for token, postag, label in sent]) would fail here.
-    return [elem.split('|')[0] for elem in sent]
-
-
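These helpers assume each token is encoded as word|lemma|postag|label and each sentence is a list of such strings. A minimal sketch with made-up tokens (hypothetical tags, not taken from the corpus):

    sent = ['Fis|fis|NNP|GENE', 'activates|activate|VBZ|O', 'transcription|transcription|NN|O']
    print(sent2features(sent)[0]['word'])  # 'Fis'
    print(sent2labels(sent))               # ['GENE', 'O', 'O']
    print(sent2tokens(sent))               # ['Fis', 'activates', 'transcription']
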
-def print_transitions(trans_features, f):
-    for (label_from, label_to), weight in trans_features:
-        # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
-        # f.write("label_from :" + label_from)
-        # f.write("label_to :" + label_to)
-        # f.write("label_weight :" + weight)
-        # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
-        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
-
-
-def print_state_features(state_features, f):
-    for (attr, label), weight in state_features:
-        # f.write("%0.6f %-8s %s\n" % (weight, label, attr))
-        # f.write(attr.encode("utf-8"))
-        # '{:06.2f}'.format(3.141592653589793)
-        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
-
-
-__author__ = 'CMendezC'
-
-##########################################
-#              MAIN PROGRAM              #
-##########################################
-
-if __name__ == "__main__":
-    # Defining parameters
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Path of training data set", metavar="PATH")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Output path to place output files",
-                      metavar="PATH")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File with training data set", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File with test data set", metavar="FILE")
-    parser.add_option("--filterStopWords", default=False,
-                      action="store_true", dest="filterStopWords",
-                      help="Filter stop words")
-    parser.add_option("--filterSymbols", default=False,
-                      action="store_true", dest="filterSymbols",
-                      help="Filter punctuation marks")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        # parser.error() prints the message and exits with status 2
-        parser.error("Unexpected positional arguments given.")
-
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path of training data set: " + options.inputPath)
-    print("File with training data set: " + str(options.trainingFile))
-    print("Path of test data set: " + options.inputPath)
-    print("File with test data set: " + str(options.testFile))
-    print("Filtering stop words: " + str(options.filterStopWords))
-    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
-               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
-    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
-
-    print('-------------------------------- PROCESSING --------------------------------')
-    print('Reading corpus...')
-    t0 = time()
-
-    sentencesTrainingData = []
-    sentencesTestData = []
-
-    # Rebind 'stopwords' from the NLTK corpus module to a plain list of
-    # unicode stop words (the module is not needed again afterwards).
-    stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
-
-    with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
-        # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
-        for line in iFile.readlines():
-            listLine = []
-            line = line.decode("utf-8")
-            for token in line.strip('\n').split():
-                if options.filterStopWords:
-                    listToken = token.split('|')
-                    lemma = listToken[1]
-                    # Original: if lemma in stopwords.words('english'):
-                    # That raised "trainingTesting_Sklearn_crfsuite.py:269: UnicodeWarning:
-                    # Unicode equal comparison failed to convert both arguments to
-                    # Unicode - interpreting them as being unequal"
-                    if lemma in stopwords:
-                        continue
-                if options.filterSymbols:
-                    listToken = token.split('|')
-                    lemma = listToken[1]
-                    if lemma in symbols:
-                        # if lemma == ',':
-                        #     print "Comma , identified"
-                        continue
-                listLine.append(token)
-            sentencesTrainingData.append(listLine)
-    print(" Sentences training data: " + str(len(sentencesTrainingData)))
-    # print sentencesTrainingData[0]
-
-    with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
-        # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
-        for line in iFile.readlines():
-            listLine = []
-            line = line.decode("utf-8")
-            for token in line.strip('\n').split():
-                if options.filterStopWords:
-                    listToken = token.split('|')
-                    lemma = listToken[1]
-                    # Original: if lemma in stopwords.words('english'):
-                    if lemma in stopwords:
-                        continue
-                if options.filterSymbols:
-                    listToken = token.split('|')
-                    lemma = listToken[1]
-                    if lemma in symbols:
-                        # if lemma == ',':
-                        #     print "Comma , identified"
-                        continue
-                listLine.append(token)
-            sentencesTestData.append(listLine)
-    print(" Sentences test data: " + str(len(sentencesTestData)))
-    # print sentencesTestData[0]
-
-    print("Reading corpus done in: %fs" % (time() - t0))
-
-    print(sent2features(sentencesTrainingData[0])[0])
-    print(sent2features(sentencesTestData[0])[0])
-    # print(sent2labels(sentencesTrainingData[0]))
-    # print(sent2labels(sentencesTestData[0]))
-    t0 = time()
-
-    X_train = [sent2features(s) for s in sentencesTrainingData]
-    y_train = [sent2labels(s) for s in sentencesTrainingData]
-
-    X_test = [sent2features(s) for s in sentencesTestData]
-    # print X_test
-    y_test = [sent2labels(s) for s in sentencesTestData]
-
-    # Fixed parameters
-    # crf = sklearn_crfsuite.CRF(
-    #     algorithm='lbfgs',
-    #     c1=0.1,
-    #     c2=0.1,
-    #     max_iterations=100,
-    #     all_possible_transitions=True
-    # )
-
-    # Hyperparameter optimization
-    crf = sklearn_crfsuite.CRF(
-        algorithm='lbfgs',
-        max_iterations=100,
-        all_possible_transitions=True
-    )
-    params_space = {
-        'c1': scipy.stats.expon(scale=0.5),
-        'c2': scipy.stats.expon(scale=0.05),
-    }
-
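The exponential priors concentrate the random search on small penalty values (scale 0.5 for the L1 term c1, 0.05 for the L2 term c2), matching the sklearn-crfsuite tutorial. The imports target the scikit-learn of the script's era; on scikit-learn >= 0.20 the search and serialization modules have moved, roughly as follows (a sketch; sklearn_crfsuite.CRF may also need patching to satisfy the stricter estimator checks of very recent releases):

    import joblib  # replaces sklearn.externals.joblib
    # replaces sklearn.grid_search and sklearn.cross_validation
    from sklearn.model_selection import RandomizedSearchCV, cross_val_score
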
-    # Original: labels = list(crf.classes_)
-    # Original: labels.remove('O')
-    # Evaluate only the GENE label; including the majority 'O' label
-    # would inflate the weighted F1 score.
-    labels = ['GENE']
-
-    # Use the same metric for evaluation
-    f1_scorer = make_scorer(metrics.flat_f1_score,
-                            average='weighted', labels=labels)
-
-    # Search
-    rs = RandomizedSearchCV(crf, params_space,
-                            cv=10,
-                            verbose=3,
-                            n_jobs=-1,
-                            n_iter=20,
-                            # n_iter=50,
-                            scoring=f1_scorer)
-    rs.fit(X_train, y_train)
-
-    # Fixed parameters
-    # crf.fit(X_train, y_train)
-
-    # Best hyperparameters
-    # crf = rs.best_estimator_
-    nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-        options.filterSymbols) + '.txt')
-    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
-        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
-        oFile.write("Training file: " + options.trainingFile + '\n')
-        oFile.write('\n')
-        oFile.write('best params:' + str(rs.best_params_) + '\n')
-        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
-        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
-
-    print("Training done in: %fs" % (time() - t0))
-    t0 = time()
-
-    # Update best crf
-    crf = rs.best_estimator_
-
-    # Saving model
-    print(" Saving training model...")
-    t1 = time()
-    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-        options.filterSymbols) + '.mod')
-    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
-    print(" Saving training model done in: %fs" % (time() - t1))
-
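For later tagging runs, the serialized model can be loaded back with the same joblib module used to save it. A minimal sketch, where newSentences is a hypothetical stand-in for freshly preprocessed word|lemma|postag input:

    crf = joblib.load(os.path.join(options.outputPath, "models", nameModel))
    # newSentences is hypothetical: a list of pipe-delimited token lists
    tags = crf.predict([sent2features(s) for s in newSentences])
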
-    # Evaluation against test data
-    y_pred = crf.predict(X_test)
-    print("*********************************")
-    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-        options.filterSymbols) + '.txt')
-    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
-        for y in y_pred:
-            oFile.write(str(y) + '\n')
-
-    print("*********************************")
-    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
-        options.filterSymbols) + '.txt')
-    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
-        for y in y_test:
-            oFile.write(str(y) + '\n')
-
-    print("Prediction done in: %fs" % (time() - t0))
-
-    # labels = list(crf.classes_)
-    # labels.remove('O')
-
-    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
-        oFile.write('\n')
-        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
-        oFile.write('\n')
-        # labels = list(crf.classes_)
-        sorted_labels = sorted(
-            labels,
-            key=lambda name: (name[1:], name[0])
-        )
-        oFile.write(metrics.flat_classification_report(
-            y_test, y_pred, labels=sorted_labels, digits=3
-        ))
-        oFile.write('\n')
-
-        oFile.write("\nTop likely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
-        oFile.write('\n')
-
-        oFile.write("\nTop unlikely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
-        oFile.write('\n')
-
-        oFile.write("\nTop positive:\n")
-        print_state_features(Counter(crf.state_features_).most_common(200), oFile)
-        oFile.write('\n')
-
-        oFile.write("\nTop negative:\n")
-        print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
-        oFile.write('\n')