Carlos-Francisco Méndez-Cruz

Training, validation and evaluation

1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from itertools import chain
5 +from optparse import OptionParser
6 +from time import time
7 +from collections import Counter
8 +
9 +import nltk
10 +import sklearn
11 +import scipy.stats
12 +import sys
13 +
14 +from sklearn.externals import joblib
15 +from sklearn.metrics import make_scorer
16 +from sklearn.cross_validation import cross_val_score
17 +from sklearn.grid_search import RandomizedSearchCV
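# NOTE (assumes an older scikit-learn, roughly 0.17-0.19): sklearn.externals.joblib,
# sklearn.cross_validation and sklearn.grid_search are legacy module paths. In newer
# releases, cross_val_score and RandomizedSearchCV live in sklearn.model_selection,
# and joblib is installed and imported as a standalone package.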
18 +
19 +import sklearn_crfsuite
20 +from sklearn_crfsuite import scorers
21 +from sklearn_crfsuite import metrics
22 +
23 +from nltk.corpus import stopwords
24 +
25 +
26 +# Objective
27 +# Training and evaluation of CRFs with sklearn-crfsuite.
28 +#
29 +# Input parameters
30 +# --inputPath=PATH Path of training and test data set
31 +# --trainingFile File with training data set
32 +# --testFile File with test data set
33 +# --outputPath=PATH Output path to place output files
34 +# --filterStopWords Filter stop words
35 +# --filterSymbols Filter punctuation marks
36 +
37 +# Output
38 +# 1) Best model, 2) evaluation report, 3) files with predicted and gold labels
39 +
40 +# Examples
41 +# Sentences
42 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
43 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
44 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
45 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt
46 +
47 +# Aspects
48 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
49 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
50 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
51 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt
52 +
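# Expected input format (inferred from the feature extraction below; the example
# values are hypothetical): one sentence per line, tokens separated by whitespace,
# each token encoded as word|lemma|postag|label, e.g.:
#   Fnr|fnr|NN|GENE activates|activate|VBZ|O transcription|transcription|NN|O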
53 +#################################
54 +# FUNCTIONS #
55 +#################################
56 +
57 +def wordSize(text):
58 + lWord = len(text)
59 + if lWord == 1:
60 + return '1'
61 + elif lWord == 2:
62 + return '2'
63 + elif lWord == 3:
64 + return '3'
65 + elif lWord == 4:
66 + return '4'
67 + elif lWord == 5:
68 + return '5'
69 + elif 6 <= lWord <= 10:
70 + return '6-10'
71 + elif 11 <= lWord <= 15:
72 + return '11-15'
73 + elif 16 <= lWord <= 20:
74 + return '16-20'
75 + elif 21 <= lWord <= 30:
76 + return '21-30'
77 + else:
78 + return '>30'
79 +
80 +def hasUpperLower(text):
81 + has = False
82 + if len(text) < 3:
83 + return False
84 + regexUp = nltk.re.compile('[A-Z]')
85 + regexLo = nltk.re.compile('[a-z]')
86 + if (regexUp.search(text) != None) and (regexLo.search(text) != None):
87 + has = True
88 + return has
89 +
90 +def hasDigit(text):
91 + has = False
92 + if len(text) < 3:
93 + return False
94 + myRegex = nltk.re.compile('[0-9]')
95 + if myRegex.search(text) != None:
96 + has = True
97 + return has
98 +
99 +
100 +def hasNonAlphaNum(text):
101 + has = False
102 + if len(text) < 3:
103 + return False
104 + myRegex = nltk.re.compile('\W')
105 + if myRegex.search(text) != None:
106 + has = True
107 + return has
108 +
109 +def word2features(sent, i):
110 + # print "i: " + str(i)
111 + # print "sent[i]" + sent[i]
112 + listElem = sent[i].split('|')
113 + word = listElem[0]
114 + lemma = listElem[1]
115 + postag = listElem[2]
116 +
117 + features = {
118 + # TF and gene names vary in case, so word.lower() is deliberately not used: 'word.lower()': word.lower(),
119 + # Suffixes
120 + 'word[-3:]': word[-3:],
121 + 'word[-2:]': word[-2:],
122 + 'word[-1:]': word[-1:],
123 + 'word.isupper()': word.isupper(),
124 + 'word.istitle()': word.istitle(),
125 + 'word.hasDigit()': hasDigit(word),
126 + 'word.hasNonAlphaNum': hasNonAlphaNum(word),
127 + # 'word.hasUpperLower': hasUpperLower(word),
128 + #'wordSize': wordSize(word),
129 + # 'word.isdigit()': word.isdigit(),
130 + 'word': word,
131 + 'lemma': lemma,
132 + 'lemma[-3:]': lemma[-3:],
133 + 'lemma[-2:]': lemma[-2:],
134 + 'lemma[-1:]': lemma[-1:],
135 + 'postag': postag,
136 + # Prefixes
137 + 'postag[:2]': postag[:2],
138 + 'postag[:1]': postag[:1],
139 + }
140 + if i > 0:
141 + listElem = sent[i - 1].split('|')
142 + word1 = listElem[0]
143 + lemma1 = listElem[1]
144 + postag1 = listElem[2]
145 + features.update({
146 + '-1:word.lower()': word1.lower(),
147 + '-1:word.istitle()': word1.istitle(),
148 + '-1:word.isupper()': word1.isupper(),
149 + '-1:word.hasDigit()': hasDigit(word1),
150 + '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
151 + # '-1:word.hasUpperLower': hasUpperLower(word1),
152 + '-1:word': word1,
153 + '-1:lemma': lemma1,
154 + '-1:postag': postag1,
155 + '-1:postag[:2]': postag1[:2],
156 + '-1:postag[:1]': postag1[:1],
157 + })
158 + # else:
159 + # features['BOS'] = True
160 +
161 + if i < len(sent) - 1:
162 + listElem = sent[i + 1].split('|')
163 + word1 = listElem[0]
164 + lemma1 = listElem[1]
165 + postag1 = listElem[2]
166 + features.update({
167 + '+1:word.lower()': word1.lower(),
168 + '+1:word.istitle()': word1.istitle(),
169 + '+1:word.isupper()': word1.isupper(),
170 + '+1:word.hasDigit()': hasDigit(word1),
171 + '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
172 + # '+1:word.hasUpperLower': hasUpperLower(word1),
173 + '+1:word': word1,
174 + '+1:lemma': lemma1,
175 + '+1:postag': postag1,
176 + '+1:postag[:2]': postag1[:2],
177 + '+1:postag[:1]': postag1[:1],
178 + })
179 + # else:
180 + # features['EOS'] = True
181 + if i > 1:
182 + listElem = sent[i - 2].split('|')
183 + word2 = listElem[0]
184 + lemma2 = listElem[1]
185 + postag2 = listElem[2]
186 + features.update({
187 + '-2:word.lower()': word2.lower(),
188 + '-2:word.istitle()': word2.istitle(),
189 + '-2:word.isupper()': word2.isupper(),
190 + '-2:word.hasDigit()': hasDigit(word2),
191 + '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
192 + # '-2:word.hasUpperLower': hasUpperLower(word2),
193 + '-2:word': word2,
194 + '-2:lemma': lemma2,
195 + '-2:postag': postag2,
196 + '-2:postag[:2]': postag2[:2],
197 + '-2:postag[:1]': postag2[:1],
198 + })
199 +
200 + if i < len(sent) - 2:
201 + listElem = sent[i + 2].split('|')
202 + word2 = listElem[0]
203 + lemma2 = listElem[1]
204 + postag2 = listElem[2]
205 + features.update({
206 + '+2:word.lower()': word2.lower(),
207 + '+2:word.istitle()': word2.istitle(),
208 + '+2:word.isupper()': word2.isupper(),
209 + '+2:word.hasDigit()': hasDigit(word2),
210 + '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
211 + # '+2:word.hasUpperLower': hasUpperLower(word2),
212 + '+2:word': word2,
213 + '+2:lemma': lemma2,
214 + '+2:postag': postag2,
215 + '+2:postag[:2]': postag2[:2],
216 + '+2:postag[:1]': postag2[:1],
217 + })
218 +
219 + trigrams = False
220 + if trigrams:
221 + if i > 2:
222 + listElem = sent[i - 3].split('|')
223 + word3 = listElem[0]
224 + lemma3 = listElem[1]
225 + postag3 = listElem[2]
226 + features.update({
227 + '-3:word.lower()': word3.lower(),
228 + '-3:word.istitle()': word3.istitle(),
229 + '-3:word.isupper()': word3.isupper(),
230 + '-3:word.hasDigit()': hasDigit(word3),
231 + '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
232 + # '-3:word.hasUpperLower': hasUpperLower(word3),
233 + '-3:word': word3,
234 + '-3:lemma': lemma3,
235 + '-3:postag': postag3,
236 + '-3:postag[:2]': postag3[:2],
237 + '-3:postag[:1]': postag3[:1],
238 + })
239 +
240 + if i < len(sent) - 3:
241 + listElem = sent[i + 3].split('|')
242 + word3 = listElem[0]
243 + lemma3 = listElem[1]
244 + postag3 = listElem[2]
245 + features.update({
246 + '+3:word.lower()': word3.lower(),
247 + '+3:word.istitle()': word3.istitle(),
248 + '+3:word.isupper()': word3.isupper(),
249 + '+3:word.hasDigit()': hasDigit(word3),
250 + '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
251 + # '+3:word.hasUpperLower': hasUpperLower(word3),
252 + '+3:word': word3,
253 + '+3:lemma': lemma3,
254 + '+3:postag': postag3,
255 + '+3:postag[:2]': postag3[:2],
256 + '+3:postag[:1]': postag3[:1],
257 + })
258 +
259 + return features
260 +
261 +
262 +def sent2features(sent):
263 + return [word2features(sent, i) for i in range(len(sent))]
264 +
265 +
266 +def sent2labels(sent):
267 + return [elem.split('|')[3] for elem in sent]
268 + # return [label for token, postag, label in sent]
269 +
270 +
271 +def sent2tokens(sent):
272 + return [elem.split('|')[0] for elem in sent]
273 +
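# Usage sketch for the helpers above (hypothetical token values, assuming the
# word|lemma|postag|label encoding):
#   sent = ['Fnr|fnr|NN|GENE', 'activates|activate|VBZ|O']
#   sent2features(sent)  # -> one feature dict per token
#   sent2labels(sent)    # -> ['GENE', 'O']
#   sent2tokens(sent)    # -> ['Fnr', 'activates']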
274 +
275 +def print_transitions(trans_features, f):
276 + for (label_from, label_to), weight in trans_features:
277 + # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
278 + # f.write("label_from :" + label_from)
279 + # f.write("label_to :" + label_to)
280 + # f.write("label_weight :" + weight)
281 + # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
282 + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
283 +
284 +
285 +def print_state_features(state_features, f):
286 + for (attr, label), weight in state_features:
287 + # f.write("%0.6f %-8s %s\n" % (weight, label, attr))
288 + # f.write(attr.encode("utf-8"))
289 + # '{:06.2f}'.format(3.141592653589793)
290 + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
291 +
292 +
293 +__author__ = 'CMendezC'
294 +
295 +##########################################
296 +# MAIN PROGRAM #
297 +##########################################
298 +
299 +if __name__ == "__main__":
300 + # Defining parameters
301 + parser = OptionParser()
302 + parser.add_option("--inputPath", dest="inputPath",
303 + help="Path of training data set", metavar="PATH")
304 + parser.add_option("--outputPath", dest="outputPath",
305 + help="Output path to place output files",
306 + metavar="PATH")
307 + parser.add_option("--trainingFile", dest="trainingFile",
308 + help="File with training data set", metavar="FILE")
309 + parser.add_option("--testFile", dest="testFile",
310 + help="File with test data set", metavar="FILE")
311 + parser.add_option("--filterStopWords", default=False,
312 + action="store_true", dest="filterStopWords",
313 + help="Filtering stop words")
314 + parser.add_option("--filterSymbols", default=False,
315 + action="store_true", dest="filterSymbols",
316 + help="Filtering punctuation marks")
317 +
318 + (options, args) = parser.parse_args()
319 + if len(args) > 0:
320 + parser.error("Any parameter given.")
321 + sys.exit(1)
322 +
323 + print('-------------------------------- PARAMETERS --------------------------------')
324 + print("Path of training data set: " + options.inputPath)
325 + print("File with training data set: " + str(options.trainingFile))
326 + print("Path of test data set: " + options.inputPath)
327 + print("File with test data set: " + str(options.testFile))
328 + print("Filtering stop words: " + str(options.filterStopWords))
329 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
330 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
331 + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
332 +
333 + print('-------------------------------- PROCESSING --------------------------------')
334 + print('Reading corpus...')
335 + t0 = time()
336 +
337 + sentencesTrainingData = []
338 + sentencesTestData = []
339 +
340 + stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
341 +
342 + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
343 + # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
344 + for line in iFile.readlines():
345 + listLine = []
346 + line = line.decode("utf-8")
347 + for token in line.strip('\n').split():
348 + if options.filterStopWords:
349 + listToken = token.split('|')
350 + lemma = listToken[1]
351 + # Original: if lemma in stopwords.words('english'):
352 + # trainingTesting_Sklearn_crfsuite.py:269:
353 + # UnicodeWarning: Unicode equal comparison failed to
354 + # convert both arguments to Unicode -
355 + # interpreting them as being unequal
356 + if lemma in stopwords:
357 + continue
358 + if options.filterSymbols:
359 + listToken = token.split('|')
360 + lemma = listToken[1]
361 + if lemma in symbols:
362 + # if lemma == ',':
363 + # print "Coma , identificada"
364 + continue
365 + listLine.append(token)
366 + sentencesTrainingData.append(listLine)
367 + print " Sentences training data: " + str(len(sentencesTrainingData))
368 + # print sentencesTrainingData[0]
369 +
370 + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
371 + # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
372 + for line in iFile.readlines():
373 + listLine = []
374 + line = line.decode("utf-8")
375 + for token in line.strip('\n').split():
376 + if options.filterStopWords:
377 + listToken = token.split('|')
378 + lemma = listToken[1]
379 + # Original if lemma in stopwords.words('english'):
380 + if lemma in stopwords:
381 + continue
382 + if options.filterSymbols:
383 + listToken = token.split('|')
384 + lemma = listToken[1]
385 + if lemma in symbols:
386 + # if lemma == ',':
387 + # print "Coma , identificada"
388 + continue
389 + listLine.append(token)
390 + sentencesTestData.append(listLine)
391 + print " Sentences test data: " + str(len(sentencesTestData))
392 + # print sentencesTestData[0]
393 +
394 + print("Reading corpus done in: %fs" % (time() - t0))
395 +
396 + print(sent2features(sentencesTrainingData[0])[0])
397 + print(sent2features(sentencesTestData[0])[0])
398 + # print(sent2labels(sentencesTrainingData[0]))
399 + # print(sent2labels(sentencesTestData[0]))
400 + t0 = time()
401 +
402 + X_train = [sent2features(s) for s in sentencesTrainingData]
403 + y_train = [sent2labels(s) for s in sentencesTrainingData]
404 +
405 + X_test = [sent2features(s) for s in sentencesTestData]
406 + # print X_test
407 + y_test = [sent2labels(s) for s in sentencesTestData]
408 +
409 + # Fixed parameters
410 + # crf = sklearn_crfsuite.CRF(
411 + # algorithm='lbfgs',
412 + # c1=0.1,
413 + # c2=0.1,
414 + # max_iterations=100,
415 + # all_possible_transitions=True
416 + # )
417 +
418 + # Hyperparameter Optimization
419 + crf = sklearn_crfsuite.CRF(
420 + algorithm='lbfgs',
421 + max_iterations=100,
422 + all_possible_transitions=True
423 + )
424 + params_space = {
425 + 'c1': scipy.stats.expon(scale=0.5),
426 + 'c2': scipy.stats.expon(scale=0.05),
427 + }
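 # c1 and c2 are the L1 and L2 regularization coefficients of the lbfgs trainer;
 # the exponential priors concentrate the random search on small values.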
428 +
429 + # Original: labels = list(crf.classes_)
430 + # Original: labels.remove('O')
431 + labels = ['GENE']
432 +
433 + # use the same metric for evaluation
434 + f1_scorer = make_scorer(metrics.flat_f1_score,
435 + average='weighted', labels=labels)
436 +
437 + # search
438 + rs = RandomizedSearchCV(crf, params_space,
439 + cv=10,
440 + verbose=3,
441 + n_jobs=-1,
442 + n_iter=20,
443 + # n_iter=50,
444 + scoring=f1_scorer)
445 + rs.fit(X_train, y_train)
446 +
447 + # Fixed parameters
448 + # crf.fit(X_train, y_train)
449 +
450 + # Best hyperparameters
451 + # crf = rs.best_estimator_
452 + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
453 + options.filterSymbols) + '.txt')
454 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
455 + oFile.write("********** TRAINING AND TESTING REPORT **********\n")
456 + oFile.write("Training file: " + options.trainingFile + '\n')
457 + oFile.write('\n')
458 + oFile.write('best params:' + str(rs.best_params_) + '\n')
459 + oFile.write('best CV score:' + str(rs.best_score_) + '\n')
460 + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
461 +
462 + print("Training done in: %fs" % (time() - t0))
463 + t0 = time()
464 +
465 + # Update best crf
466 + crf = rs.best_estimator_
467 +
468 + # Saving model
469 + print(" Saving training model...")
470 + t1 = time()
471 + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
472 + options.filterSymbols) + '.mod')
473 + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
474 + print(" Saving training model done in: %fs" % (time() - t1))
475 +
476 + # Evaluation against test data
477 + y_pred = crf.predict(X_test)
478 + print("*********************************")
479 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
480 + options.filterSymbols) + '.txt')
481 + with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
482 + for y in y_pred:
483 + oFile.write(str(y) + '\n')
484 +
485 + print("*********************************")
486 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
487 + options.filterSymbols) + '.txt')
488 + with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
489 + for y in y_test:
490 + oFile.write(str(y) + '\n')
491 +
492 + print("Prediction done in: %fs" % (time() - t0))
493 +
494 + # labels = list(crf.classes_)
495 + # labels.remove('O')
496 +
497 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
498 + oFile.write('\n')
499 + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
500 + oFile.write('\n')
501 + # labels = list(crf.classes_)
502 + sorted_labels = sorted(
503 + labels,
504 + key=lambda name: (name[1:], name[0])
505 + )
506 + oFile.write(metrics.flat_classification_report(
507 + y_test, y_pred, labels=sorted_labels, digits=3
508 + ))
509 + oFile.write('\n')
510 +
511 + oFile.write("\nTop likely transitions:\n")
512 + print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
513 + oFile.write('\n')
514 +
515 + oFile.write("\nTop unlikely transitions:\n")
516 + print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
517 + oFile.write('\n')
518 +
519 + oFile.write("\nTop positive:\n")
520 + print_state_features(Counter(crf.state_features_).most_common(200), oFile)
521 + oFile.write('\n')
522 +
523 + oFile.write("\nTop negative:\n")
524 + print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
525 + oFile.write('\n')