Showing
2 changed files
with
1 additions
and
526 deletions
... | @@ -76,7 +76,7 @@ def word2features(sent, i): | ... | @@ -76,7 +76,7 @@ def word2features(sent, i): |
76 | 'word[:3]': word[:3], | 76 | 'word[:3]': word[:3], |
77 | 'word[:2]': word[:2], | 77 | 'word[:2]': word[:2], |
78 | 'word[:1]': word[:1], | 78 | 'word[:1]': word[:1], |
79 | - 'endsConLow()': endsConLow(word), | 79 | + 'endsConLow()={}'.format(endsConLow(word)): endsConLow(word), |
80 | } | 80 | } |
81 | ''' | 81 | ''' |
82 | if i > 0: | 82 | if i > 0: | ... | ... |
training-validation.py
deleted
100644 → 0
1 | -# -*- coding: UTF-8 -*- | ||
2 | - | ||
3 | -import os | ||
4 | -from itertools import chain | ||
5 | -from optparse import OptionParser | ||
6 | -from time import time | ||
7 | -from collections import Counter | ||
8 | - | ||
9 | -import nltk | ||
10 | -import sklearn | ||
11 | -import scipy.stats | ||
12 | -import sys | ||
13 | - | ||
14 | -from sklearn.externals import joblib | ||
15 | -from sklearn.metrics import make_scorer | ||
16 | -from sklearn.cross_validation import cross_val_score | ||
17 | -from sklearn.grid_search import RandomizedSearchCV | ||
18 | - | ||
19 | -import sklearn_crfsuite | ||
20 | -from sklearn_crfsuite import scorers | ||
21 | -from sklearn_crfsuite import metrics | ||
22 | - | ||
23 | -from nltk.corpus import stopwords | ||
24 | - | ||
25 | - | ||
26 | -# Objective | ||
27 | -# Training and evaluation of CRFs with sklearn-crfsuite. | ||
28 | -# | ||
29 | -# Input parameters | ||
30 | -# --inputPath=PATH Path of training and test data set | ||
31 | -# --trainingFile File with training data set | ||
32 | -# --testFile File with test data set | ||
33 | -# --outputPath=PATH Output path to place output files | ||
34 | -# --filterStopWords Filtering stop words | ||
35 | -# --filterSymbols Filtering punctuation marks | ||
36 | - | ||
37 | -# Output | ||
38 | -# 1) Best model | ||
39 | - | ||
40 | -# Examples | ||
41 | -# Sentences | ||
42 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt | ||
43 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt | ||
44 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt | ||
45 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt | ||
46 | - | ||
47 | -# Aspects | ||
48 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt | ||
49 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt | ||
50 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt | ||
51 | -# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt | ||
52 | - | ||
53 | -################################# | ||
54 | -# FUNCTIONS # | ||
55 | -################################# | ||
56 | - | ||
def wordSize(text):
    """Return a categorical length bucket for ``text``.

    Lengths 0-5 are returned as their own decimal string; longer inputs are
    grouped into the ranges '6-10', '11-15', '16-20', '21-30' and '>30'.

    The previous implementation had no branch for an empty string, so a
    zero-length token fell through to the final ``else`` and was labelled
    '>30'.  It is now labelled '0'.
    """
    length = len(text)
    if length <= 5:
        return str(length)
    # (inclusive upper bound, label) pairs, checked in ascending order.
    for upper_bound, label in ((10, '6-10'), (15, '11-15'),
                               (20, '16-20'), (30, '21-30')):
        if length <= upper_bound:
            return label
    return '>30'
79 | - | ||
def hasUpperLower(text):
    """Return True when ``text`` mixes ASCII upper- and lower-case letters.

    Strings shorter than three characters always yield False.
    """
    # Local import of the standard-library module: the previous code reached
    # it through ``nltk.re``, which only works as long as nltk happens to
    # expose that attribute.
    import re

    if len(text) < 3:
        return False
    return (re.search(r'[A-Z]', text) is not None
            and re.search(r'[a-z]', text) is not None)
89 | - | ||
def hasDigit(text):
    """Return True when ``text`` contains at least one ASCII digit (0-9).

    Strings shorter than three characters always yield False.
    """
    # Local import of the standard-library module: the previous code reached
    # it through ``nltk.re``, which only works as long as nltk happens to
    # expose that attribute.
    import re

    if len(text) < 3:
        return False
    return re.search(r'[0-9]', text) is not None
98 | - | ||
99 | - | ||
def hasNonAlphaNum(text):
    """Return True when ``text`` contains a character matched by regex ``\\W``.

    ``\\W`` is any character that is not a word character, so letters, digits
    and the underscore do not count.  Strings shorter than three characters
    always yield False.
    """
    # Local import of the standard-library module: the previous code reached
    # it through ``nltk.re``, which only works as long as nltk happens to
    # expose that attribute.
    import re

    if len(text) < 3:
        return False
    # Raw string: the plain literal '\W' is an invalid escape sequence.
    return re.search(r'\W', text) is not None
108 | - | ||
def _context_features(token, prefix):
    """Return the feature dict for a neighbouring ``token``.

    ``token`` is a ``word|lemma|postag[|...]`` string and ``prefix`` is the
    signed offset label (for example '-1' or '+2') prepended to every key.
    """
    fields = token.split('|')
    word, lemma, postag = fields[0], fields[1], fields[2]
    return {
        prefix + ':word.lower()': word.lower(),
        prefix + ':word.istitle()': word.istitle(),
        prefix + ':word.isupper()': word.isupper(),
        prefix + ':word.hasDigit()': hasDigit(word),
        prefix + ':word.hasNonAlphaNum': hasNonAlphaNum(word),
        prefix + ':word': word,
        prefix + ':lemma': lemma,
        prefix + ':postag': postag,
        prefix + ':postag[:2]': postag[:2],
        prefix + ':postag[:1]': postag[:1],
    }


def word2features(sent, i, trigrams=False):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: sequence of ``word|lemma|postag[|label]`` strings.
        i: index of the token to describe.
        trigrams: when True the context window is widened from two to three
            tokens on each side.  It defaults to False, which matches the
            value that used to be hard-coded inside the function.

    Returns:
        A dict mapping feature names to values: suffix/shape features of the
        current word, lemma and POS tag, plus a fixed set of features for
        every neighbour inside the context window that exists in ``sent``.

    Raises:
        IndexError: if a token inside the window has fewer than three
            ``|``-separated fields.
    """
    fields = sent[i].split('|')
    word, lemma, postag = fields[0], fields[1], fields[2]

    features = {
        # 'word.lower()' is deliberately absent for the current token: names
        # of TFs and genes are distinguished by their letter case.
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.hasDigit()': hasDigit(word),
        'word.hasNonAlphaNum': hasNonAlphaNum(word),
        'word': word,
        'lemma': lemma,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'postag': postag,
        # Prefixes
        'postag[:2]': postag[:2],
        'postag[:1]': postag[:1],
    }

    window = 3 if trigrams else 2
    for distance in range(1, window + 1):
        # Left neighbour exists when at least ``distance`` tokens precede i.
        if i >= distance:
            features.update(
                _context_features(sent[i - distance], '-%d' % distance))
        # Right neighbour exists when at least ``distance`` tokens follow i.
        if i < len(sent) - distance:
            features.update(
                _context_features(sent[i + distance], '+%d' % distance))

    return features
260 | - | ||
261 | - | ||
def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in order."""
    return [word2features(sent, position) for position in range(len(sent))]
264 | - | ||
265 | - | ||
def sent2labels(sent):
    """Return the label of every token in ``sent``.

    Each token is a ``word|lemma|postag|label`` string; the label is the
    fourth ``|``-separated field.

    Raises:
        IndexError: if a token has fewer than four fields.
    """
    return [token.split('|')[3] for token in sent]
269 | - | ||
270 | - | ||
def sent2tokens(sent):
    """Return the surface word of every token in ``sent``.

    The previous implementation unpacked each element into three names,
    which raises ValueError for the ``word|lemma|postag|label`` strings that
    the rest of this script works with.  String tokens are now split on '|'
    and their first field is returned; tuple/list tokens still contribute
    their first item, so callers passing ``(token, postag, label)`` triples
    keep working.
    """
    tokens = []
    for elem in sent:
        # ``hasattr`` rather than ``isinstance(elem, str)`` so that Python 2
        # ``unicode`` tokens are recognised as strings too.
        if hasattr(elem, 'split'):
            tokens.append(elem.split('|')[0])
        else:
            tokens.append(elem[0])
    return tokens
273 | - | ||
274 | - | ||
def print_transitions(trans_features, f):
    """Write one line per label transition to the file-like object ``f``.

    Args:
        trans_features: iterable of ``((label_from, label_to), weight)``.
        f: object with a ``write`` method.

    Each line is the source label padded to 6 columns, ' -> ', the target
    label padded to 7 columns, and the weight with six decimals.
    """
    for (label_from, label_to), weight in trans_features:
        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
283 | - | ||
284 | - | ||
def _native_text(value):
    """Return ``value`` as the interpreter's native ``str`` type."""
    if isinstance(value, str):
        return value
    if isinstance(value, bytes):
        # Only reachable on Python 3, where bytes and str are distinct.
        return value.decode("utf-8")
    # Python 2 ``unicode``: encode so str.format cannot hit an implicit
    # ASCII conversion.
    return value.encode("utf-8")


def print_state_features(state_features, f):
    """Write one line per state feature to the file-like object ``f``.

    Args:
        state_features: iterable of ``((attr, label), weight)``.
        f: object with a ``write`` method.

    Each line is the weight with six decimals, the label padded to 8
    columns, and the attribute name.

    The attribute used to be passed through ``attr.encode("utf-8")``
    unconditionally, which on Python 3 renders as ``b'...'`` in the report.
    It is now converted only when it is not already a native string.
    """
    for (attr, label), weight in state_features:
        f.write("{:0.6f} {:8} {}\n".format(weight, label, _native_text(attr)))
291 | - | ||
292 | - | ||
293 | -__author__ = 'CMendezC' | ||
294 | - | ||
295 | -########################################## | ||
296 | -# MAIN PROGRAM # | ||
297 | -########################################## | ||
298 | - | ||
def _as_text(value):
    """Decode UTF-8 byte strings to text; return text values unchanged."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _load_sentences(path, stop_words, symbols):
    """Read a corpus file into a list of token lists.

    Every line of ``path`` is one sentence made of whitespace-separated
    ``word|lemma|postag|label`` tokens.  A token is dropped when its lemma
    (second field) is a member of ``stop_words`` or ``symbols``; pass empty
    collections to disable the corresponding filter.  Tokens are only split
    when at least one filter is active.
    """
    # Local import: io.open decodes UTF-8 on both Python 2 and Python 3,
    # replacing the Python 2-only ``line.decode("utf-8")``.
    import io

    filtering = bool(stop_words) or bool(symbols)
    sentences = []
    with io.open(path, "r", encoding="utf-8") as corpus:
        for line in corpus:
            kept = []
            for token in line.strip('\n').split():
                if filtering:
                    lemma = token.split('|')[1]
                    if lemma in stop_words or lemma in symbols:
                        continue
                kept.append(token)
            sentences.append(kept)
    return sentences


if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filtering stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filtering punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the process
        # itself, so no explicit sys.exit() is needed afterwards.
        parser.error("Unexpected positional arguments: " + ' '.join(args))
    # Fail with a usage message instead of a TypeError further down when a
    # path or file option was left out.
    for required in ("inputPath", "outputPath", "trainingFile", "testFile"):
        if getattr(options, required) is None:
            parser.error("Missing required option --" + required)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of training data set: " + options.inputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    # Lemmas are decoded text, so the filter collections must hold text as
    # well: on Python 2 the non-ASCII symbol literals above are byte strings
    # that would never compare equal to a decoded lemma.
    symbol_filter = frozenset()
    if options.filterSymbols:
        symbol_filter = frozenset(_as_text(symbol) for symbol in symbols)
    # The stop-word corpus is loaded only when it is needed, and the imported
    # ``stopwords`` module is no longer rebound to a list.
    stop_word_filter = frozenset()
    if options.filterStopWords:
        stop_word_filter = frozenset(
            _as_text(stop_word) for stop_word in stopwords.words('english'))

    sentencesTrainingData = _load_sentences(
        os.path.join(options.inputPath, options.trainingFile),
        stop_word_filter, symbol_filter)
    print(" Sentences training data: " + str(len(sentencesTrainingData)))

    sentencesTestData = _load_sentences(
        os.path.join(options.inputPath, options.testFile),
        stop_word_filter, symbol_filter)
    print(" Sentences test data: " + str(len(sentencesTestData)))

    print("Reading corpus done in: %fs" % (time() - t0))

    # Show the features of the first token of each data set as a sanity
    # check; skipped when the set or its first sentence is empty.
    for data_set in (sentencesTrainingData, sentencesTestData):
        if data_set and data_set[0]:
            print(sent2features(data_set[0])[0])
    t0 = time()

    X_train = [sent2features(s) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s) for s in sentencesTestData]
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Hyperparameter optimization: c1 and c2 are sampled by the randomized
    # search below instead of being fixed on the estimator.
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # Only the GENE label is scored.
    labels = ['GENE']

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    # NOTE(review): RandomizedSearchCV and joblib come from the
    # sklearn.grid_search / sklearn.externals imports at the top of the file;
    # newer scikit-learn releases no longer ship those modules - confirm the
    # pinned version.
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # Every output file name embeds the filter settings of this run.
    runTag = '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(options.filterSymbols)
    nameReport = options.trainingFile.replace('.txt', runTag + '.txt')
    nameModel = options.trainingFile.replace('.txt', runTag + '.mod')

    # Create the output sub-directories when they are missing instead of
    # failing on the first open().
    reportsDir = os.path.join(options.outputPath, "reports")
    modelsDir = os.path.join(options.outputPath, "models")
    for directory in (reportsDir, modelsDir):
        if not os.path.isdir(directory):
            os.makedirs(directory)

    reportPath = os.path.join(reportsDir, "report_" + nameReport)
    with open(reportPath, mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        # Float divisor: integer division on Python 2 truncated the size.
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000.0))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Saving model
    print(" Saving training model...")
    t1 = time()
    joblib.dump(crf, os.path.join(modelsDir, nameModel))
    print(" Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    with open(os.path.join(reportsDir, "y_pred_" + nameReport), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')

    print("*********************************")
    with open(os.path.join(reportsDir, "y_test_" + nameReport), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')

    print("Prediction done in: %fs" % (time() - t0))

    # Rank the learned weights once and reuse them for both ends of the list.
    rankedTransitions = Counter(crf.transition_features_).most_common()
    rankedStates = Counter(crf.state_features_).most_common()

    with open(reportPath, mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        sorted_labels = sorted(
            labels,
            key=lambda name: (name[1:], name[0])
        )
        oFile.write(metrics.flat_classification_report(
            y_test, y_pred, labels=sorted_labels, digits=3
        ))
        oFile.write('\n')

        oFile.write("\nTop likely transitions:\n")
        print_transitions(rankedTransitions[:50], oFile)
        oFile.write('\n')

        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(rankedTransitions[-50:], oFile)
        oFile.write('\n')

        oFile.write("\nTop positive:\n")
        print_state_features(rankedStates[:200], oFile)
        oFile.write('\n')

        oFile.write("\nTop negative:\n")
        print_state_features(rankedStates[-200:], oFile)
        oFile.write('\n')
-
Please register or login to post a comment