Showing 1 changed file with 30 additions and 174 deletions
... | @@ -32,7 +32,7 @@ from nltk.corpus import stopwords | ... | @@ -32,7 +32,7 @@ from nltk.corpus import stopwords |
32 | # --testFile File with test data set | 32 | # --testFile File with test data set |
33 | # --outputPath=PATH Output path to place output files | 33 | # --outputPath=PATH Output path to place output files |
34 | # --filteringStopWords Filtering stop words | 34 | # --filteringStopWords Filtering stop words |
35 | -# --filterSymbols Filtering punctuation marks | 35 | +# --excludeSymbols Filtering punctuation marks |
36 | 36 | ||
37 | # Output | 37 | # Output |
38 | # 1) Best model | 38 | # 1) Best model |
... | @@ -42,116 +42,44 @@ from nltk.corpus import stopwords | ... | @@ -42,116 +42,44 @@ from nltk.corpus import stopwords |
42 | # --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets | 42 | # --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets |
43 | # --trainingFile training-data-set-70.txt | 43 | # --trainingFile training-data-set-70.txt |
44 | # --testFile test-data-set-30.txt | 44 | # --testFile test-data-set-30.txt |
45 | -# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports | 45 | +# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields |
46 | -# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports | 46 | +# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields |
47 | 47 | ||
48 | ################################# | 48 | ################################# |
49 | # FUNCTIONS # | 49 | # FUNCTIONS # |
50 | ################################# | 50 | ################################# |
51 | 51 | ||
52 | -def wordSize(text): | ||
53 | - lWord = len(text) | ||
54 | - if lWord == 1: | ||
55 | - return '1' | ||
56 | - elif lWord == 2: | ||
57 | - return '2' | ||
58 | - elif lWord == 3: | ||
59 | - return '3' | ||
60 | - elif lWord == 4: | ||
61 | - return '4' | ||
62 | - elif lWord == 5: | ||
63 | - return '5' | ||
64 | - elif 6 <= lWord <= 10: | ||
65 | - return '6-10' | ||
66 | - elif 11 <= lWord <= 15: | ||
67 | - return '11-15' | ||
68 | - elif 16 <= lWord <= 20: | ||
69 | - return '16-20' | ||
70 | - elif 21 <= lWord <= 30: | ||
71 | - return '21-30' | ||
72 | - else: | ||
73 | - return '>30' | ||
74 | - | ||
75 | -def hasUpperLower(text): | ||
76 | - has = False | ||
77 | - if len(text) < 3: | ||
78 | - return False | ||
79 | - regexUp = nltk.re.compile('[A-Z]') | ||
80 | - regexLo = nltk.re.compile('[a-z]') | ||
81 | - if (regexUp.search(text) != None) and (regexLo.search(text) != None): | ||
82 | - has = True | ||
83 | - return has | ||
84 | - | ||
85 | -def hasDigit(text): | ||
86 | - has = False | ||
87 | - if len(text) < 3: | ||
88 | - return False | ||
89 | - myRegex = nltk.re.compile('[0-9]') | ||
90 | - if myRegex.search(text) != None: | ||
91 | - has = True | ||
92 | - return has | ||
93 | - | ||
94 | - | ||
95 | -def hasNonAlphaNum(text): | ||
96 | - has = False | ||
97 | - if len(text) < 3: | ||
98 | - return False | ||
99 | - myRegex = nltk.re.compile('\W') | ||
100 | - if myRegex.search(text) != None: | ||
101 | - has = True | ||
102 | - return has | ||
103 | - | ||
104 | def word2features(sent, i): | 52 | def word2features(sent, i): |
105 | - # print "i: " + str(i) | ||
106 | - # print "sent[i]" + sent[i] | ||
107 | listElem = sent[i].split('|') | 53 | listElem = sent[i].split('|') |
108 | word = listElem[0] | 54 | word = listElem[0] |
109 | lemma = listElem[1] | 55 | lemma = listElem[1] |
110 | postag = listElem[2] | 56 | postag = listElem[2] |
111 | 57 | ||
112 | features = { | 58 | features = { |
113 | - # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(), | ||
114 | # Suffixes | 59 | # Suffixes |
115 | 'word[-3:]': word[-3:], | 60 | 'word[-3:]': word[-3:], |
116 | 'word[-2:]': word[-2:], | 61 | 'word[-2:]': word[-2:], |
117 | 'word[-1:]': word[-1:], | 62 | 'word[-1:]': word[-1:], |
118 | - 'word.isupper()': word.isupper(), | 63 | + #'word.isupper()': word.isupper(), |
119 | - 'word.istitle()': word.istitle(), | ||
120 | - 'word.hasDigit()': hasDigit(word), | ||
121 | - 'word.hasNonAlphaNum': hasNonAlphaNum(word), | ||
122 | - # 'word.hasUpperLower': hasUpperLower(word), | ||
123 | - #'wordSize': wordSize(word), | ||
124 | - # 'word.isdigit()': word.isdigit(), | ||
125 | 'word': word, | 64 | 'word': word, |
126 | 'lemma': lemma, | 65 | 'lemma': lemma, |
127 | 'lemma[-3:]': lemma[-3:], | 66 | 'lemma[-3:]': lemma[-3:], |
128 | 'lemma[-2:]': lemma[-2:], | 67 | 'lemma[-2:]': lemma[-2:], |
129 | 'lemma[-1:]': lemma[-1:], | 68 | 'lemma[-1:]': lemma[-1:], |
130 | - 'postag': postag, | 69 | + 'word[:3]': word[:3], |
131 | - # Prefixes | 70 | + 'word[:2]': word[:2], |
132 | - 'postag[:2]': postag[:2], | 71 | + 'word[:1]': word[:1], |
133 | - 'postag[:1]': postag[:1], | ||
134 | } | 72 | } |
73 | + ''' | ||
135 | if i > 0: | 74 | if i > 0: |
136 | listElem = sent[i - 1].split('|') | 75 | listElem = sent[i - 1].split('|') |
137 | word1 = listElem[0] | 76 | word1 = listElem[0] |
138 | lemma1 = listElem[1] | 77 | lemma1 = listElem[1] |
139 | postag1 = listElem[2] | 78 | postag1 = listElem[2] |
140 | features.update({ | 79 | features.update({ |
141 | - '-1:word.lower()': word1.lower(), | ||
142 | - '-1:word.istitle()': word1.istitle(), | ||
143 | - '-1:word.isupper()': word1.isupper(), | ||
144 | - '-1:word.hasDigit()': hasDigit(word1), | ||
145 | - '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
146 | - # '-1:word.hasUpperLower': hasUpperLower(word1), | ||
147 | '-1:word': word1, | 80 | '-1:word': word1, |
148 | '-1:lemma': lemma1, | 81 | '-1:lemma': lemma1, |
149 | - '-1:postag': postag1, | ||
150 | - '-1:postag[:2]': postag1[:2], | ||
151 | - '-1:postag[:1]': postag1[:1], | ||
152 | }) | 82 | }) |
153 | - # else: | ||
154 | - # features['BOS'] = True | ||
155 | 83 | ||
156 | if i < len(sent) - 1: | 84 | if i < len(sent) - 1: |
157 | listElem = sent[i + 1].split('|') | 85 | listElem = sent[i + 1].split('|') |
... | @@ -159,37 +87,18 @@ def word2features(sent, i): | ... | @@ -159,37 +87,18 @@ def word2features(sent, i): |
159 | lemma1 = listElem[1] | 87 | lemma1 = listElem[1] |
160 | postag1 = listElem[2] | 88 | postag1 = listElem[2] |
161 | features.update({ | 89 | features.update({ |
162 | - '+1:word.lower()': word1.lower(), | ||
163 | - '+1:word.istitle()': word1.istitle(), | ||
164 | - '+1:word.isupper()': word1.isupper(), | ||
165 | - '+1:word.hasDigit()': hasDigit(word1), | ||
166 | - '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
167 | - # '+1:word.hasUpperLower': hasUpperLower(word1), | ||
168 | '+1:word': word1, | 90 | '+1:word': word1, |
169 | '+1:lemma': lemma1, | 91 | '+1:lemma': lemma1, |
170 | - '+1:postag': postag1, | ||
171 | - '+1:postag[:2]': postag1[:2], | ||
172 | - '+1:postag[:1]': postag1[:1], | ||
173 | }) | 92 | }) |
174 | - # else: | 93 | + |
175 | - # features['EOS'] = True | ||
176 | if i > 1: | 94 | if i > 1: |
177 | listElem = sent[i - 2].split('|') | 95 | listElem = sent[i - 2].split('|') |
178 | word2 = listElem[0] | 96 | word2 = listElem[0] |
179 | lemma2 = listElem[1] | 97 | lemma2 = listElem[1] |
180 | postag2 = listElem[2] | 98 | postag2 = listElem[2] |
181 | features.update({ | 99 | features.update({ |
182 | - '-2:word.lower()': word2.lower(), | ||
183 | - '-2:word.istitle()': word2.istitle(), | ||
184 | - '-2:word.isupper()': word2.isupper(), | ||
185 | - '-2:word.hasDigit()': hasDigit(word2), | ||
186 | - '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
187 | - # '-2:word.hasUpperLower': hasUpperLower(word2), | ||
188 | '-2:word': word2, | 100 | '-2:word': word2, |
189 | '-2:lemma': lemma2, | 101 | '-2:lemma': lemma2, |
190 | - '-2:postag': postag2, | ||
191 | - '-2:postag[:2]': postag2[:2], | ||
192 | - '-2:postag[:1]': postag2[:1], | ||
193 | }) | 102 | }) |
194 | 103 | ||
195 | if i < len(sent) - 2: | 104 | if i < len(sent) - 2: |
... | @@ -198,17 +107,8 @@ def word2features(sent, i): | ... | @@ -198,17 +107,8 @@ def word2features(sent, i): |
198 | lemma2 = listElem[1] | 107 | lemma2 = listElem[1] |
199 | postag2 = listElem[2] | 108 | postag2 = listElem[2] |
200 | features.update({ | 109 | features.update({ |
201 | - '+2:word.lower()': word2.lower(), | ||
202 | - '+2:word.istitle()': word2.istitle(), | ||
203 | - '+2:word.isupper()': word2.isupper(), | ||
204 | - '+2:word.hasDigit()': hasDigit(word2), | ||
205 | - '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
206 | - # '+2:word.hasUpperLower': hasUpperLower(word2), | ||
207 | '+2:word': word2, | 110 | '+2:word': word2, |
208 | '+2:lemma': lemma2, | 111 | '+2:lemma': lemma2, |
209 | - '+2:postag': postag2, | ||
210 | - '+2:postag[:2]': postag2[:2], | ||
211 | - '+2:postag[:1]': postag2[:1], | ||
212 | }) | 112 | }) |
213 | 113 | ||
214 | trigrams = False | 114 | trigrams = False |
... | @@ -219,17 +119,8 @@ def word2features(sent, i): | ... | @@ -219,17 +119,8 @@ def word2features(sent, i): |
219 | lemma3 = listElem[1] | 119 | lemma3 = listElem[1] |
220 | postag3 = listElem[2] | 120 | postag3 = listElem[2] |
221 | features.update({ | 121 | features.update({ |
222 | - '-3:word.lower()': word3.lower(), | ||
223 | - '-3:word.istitle()': word3.istitle(), | ||
224 | - '-3:word.isupper()': word3.isupper(), | ||
225 | - '-3:word.hasDigit()': hasDigit(word3), | ||
226 | - '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
227 | - # '-3:word.hasUpperLower': hasUpperLower(word3), | ||
228 | '-3:word': word3, | 122 | '-3:word': word3, |
229 | '-3:lemma': lemma3, | 123 | '-3:lemma': lemma3, |
230 | - '-3:postag': postag3, | ||
231 | - '-3:postag[:2]': postag3[:2], | ||
232 | - '-3:postag[:1]': postag3[:1], | ||
233 | }) | 124 | }) |
234 | 125 | ||
235 | if i < len(sent) - 3: | 126 | if i < len(sent) - 3: |
... | @@ -238,19 +129,10 @@ def word2features(sent, i): | ... | @@ -238,19 +129,10 @@ def word2features(sent, i): |
238 | lemma3 = listElem[1] | 129 | lemma3 = listElem[1] |
239 | postag3 = listElem[2] | 130 | postag3 = listElem[2] |
240 | features.update({ | 131 | features.update({ |
241 | - '+3:word.lower()': word3.lower(), | ||
242 | - '+3:word.istitle()': word3.istitle(), | ||
243 | - '+3:word.isupper()': word3.isupper(), | ||
244 | - '+3:word.hasDigit()': hasDigit(word3), | ||
245 | - '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
246 | - # '+3:word.hasUpperLower': hasUpperLower(word3), | ||
247 | '+3:word': word3, | 132 | '+3:word': word3, |
248 | '+3:lemma': lemma3, | 133 | '+3:lemma': lemma3, |
249 | - '+3:postag': postag3, | ||
250 | - '+3:postag[:2]': postag3[:2], | ||
251 | - '+3:postag[:1]': postag3[:1], | ||
252 | }) | 134 | }) |
253 | - | 135 | + ''' |
254 | return features | 136 | return features |
255 | 137 | ||
256 | 138 | ||
... | @@ -260,7 +142,6 @@ def sent2features(sent): | ... | @@ -260,7 +142,6 @@ def sent2features(sent): |
260 | 142 | ||
261 | def sent2labels(sent): | 143 | def sent2labels(sent): |
262 | return [elem.split('|')[3] for elem in sent] | 144 | return [elem.split('|')[3] for elem in sent] |
263 | - # return [label for token, postag, label in sent] | ||
264 | 145 | ||
265 | 146 | ||
266 | def sent2tokens(sent): | 147 | def sent2tokens(sent): |
... | @@ -269,19 +150,11 @@ def sent2tokens(sent): | ... | @@ -269,19 +150,11 @@ def sent2tokens(sent): |
269 | 150 | ||
270 | def print_transitions(trans_features, f): | 151 | def print_transitions(trans_features, f): |
271 | for (label_from, label_to), weight in trans_features: | 152 | for (label_from, label_to), weight in trans_features: |
272 | - # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight)) | ||
273 | - # f.write("label_from :" + label_from) | ||
274 | - # f.write("label_to :" + label_to) | ||
275 | - # f.write("label_weight :" + weight) | ||
276 | - # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight)) | ||
277 | f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight)) | 153 | f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight)) |
278 | 154 | ||
279 | 155 | ||
280 | def print_state_features(state_features, f): | 156 | def print_state_features(state_features, f): |
281 | for (attr, label), weight in state_features: | 157 | for (attr, label), weight in state_features: |
282 | - # f.write("%0.6f %-8s %s\n" % (weight, label, attr)) | ||
283 | - # f.write(attr.encode("utf-8")) | ||
284 | - # '{:06.2f}'.format(3.141592653589793) | ||
285 | f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8"))) | 158 | f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8"))) |
286 | 159 | ||
287 | 160 | ||
... | @@ -303,12 +176,12 @@ if __name__ == "__main__": | ... | @@ -303,12 +176,12 @@ if __name__ == "__main__": |
303 | help="File with training data set", metavar="FILE") | 176 | help="File with training data set", metavar="FILE") |
304 | parser.add_option("--testFile", dest="testFile", | 177 | parser.add_option("--testFile", dest="testFile", |
305 | help="File with test data set", metavar="FILE") | 178 | help="File with test data set", metavar="FILE") |
306 | - parser.add_option("--filterStopWords", default=False, | 179 | + parser.add_option("--excludeStopWords", default=False, |
307 | - action="store_true", dest="filterStopWords", | 180 | + action="store_true", dest="excludeStopWords", |
308 | - help="Filtering stop words") | 181 | + help="Exclude stop words") |
309 | - parser.add_option("--filterSymbols", default=False, | 182 | + parser.add_option("--excludeSymbols", default=False, |
310 | - action="store_true", dest="filterSymbols", | 183 | + action="store_true", dest="excludeSymbols", |
311 | - help="Filtering punctuation marks") | 184 | + help="Exclude punctuation marks") |
312 | 185 | ||
313 | (options, args) = parser.parse_args() | 186 | (options, args) = parser.parse_args() |
314 | if len(args) > 0: | 187 | if len(args) > 0: |
... | @@ -320,10 +193,10 @@ if __name__ == "__main__": | ... | @@ -320,10 +193,10 @@ if __name__ == "__main__": |
320 | print("File with training data set: " + str(options.trainingFile)) | 193 | print("File with training data set: " + str(options.trainingFile)) |
321 | print("Path of test data set: " + options.inputPath) | 194 | print("Path of test data set: " + options.inputPath) |
322 | print("File with test data set: " + str(options.testFile)) | 195 | print("File with test data set: " + str(options.testFile)) |
323 | - print("Filtering stop words: " + str(options.filterStopWords)) | 196 | + print("Exclude stop words: " + str(options.excludeStopWords)) |
324 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | 197 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', |
325 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | 198 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] |
326 | - print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols)) | 199 | + print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols)) |
327 | 200 | ||
328 | print('-------------------------------- PROCESSING --------------------------------') | 201 | print('-------------------------------- PROCESSING --------------------------------') |
329 | print('Reading corpus...') | 202 | print('Reading corpus...') |
... | @@ -332,67 +205,50 @@ if __name__ == "__main__": | ... | @@ -332,67 +205,50 @@ if __name__ == "__main__": |
332 | sentencesTrainingData = [] | 205 | sentencesTrainingData = [] |
333 | sentencesTestData = [] | 206 | sentencesTestData = [] |
334 | 207 | ||
335 | - # Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')] | ||
336 | stopwords = [word for word in stopwords.words('english')] | 208 | stopwords = [word for word in stopwords.words('english')] |
337 | 209 | ||
338 | with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: | 210 | with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: |
339 | - # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
340 | for line in iFile.readlines(): | 211 | for line in iFile.readlines(): |
341 | listLine = [] | 212 | listLine = [] |
342 | line = line.strip('\n') | 213 | line = line.strip('\n') |
343 | for token in line.split(): | 214 | for token in line.split(): |
344 | - if options.filterStopWords: | 215 | + if options.excludeStopWords: |
345 | listToken = token.split('|') | 216 | listToken = token.split('|') |
346 | lemma = listToken[1] | 217 | lemma = listToken[1] |
347 | - # Original: if lemma in stopwords.words('english'): | ||
348 | - # trainingTesting_Sklearn_crfsuite.py:269: | ||
349 | - # UnicodeWarning: Unicode equal comparison failed to | ||
350 | - # convert both arguments to Unicode - | ||
351 | - # interpreting them as being unequal | ||
352 | if lemma in stopwords: | 218 | if lemma in stopwords: |
353 | continue | 219 | continue |
354 | - if options.filterSymbols: | 220 | + if options.excludeSymbols: |
355 | listToken = token.split('|') | 221 | listToken = token.split('|') |
356 | lemma = listToken[1] | 222 | lemma = listToken[1] |
357 | if lemma in symbols: | 223 | if lemma in symbols: |
358 | - # if lemma == ',': | ||
359 | - # print "Coma , identificada" | ||
360 | continue | 224 | continue |
361 | listLine.append(token) | 225 | listLine.append(token) |
362 | sentencesTrainingData.append(listLine) | 226 | sentencesTrainingData.append(listLine) |
363 | print(" Sentences training data: " + str(len(sentencesTrainingData))) | 227 | print(" Sentences training data: " + str(len(sentencesTrainingData))) |
364 | - # print sentencesTrainingData[0] | ||
365 | 228 | ||
366 | with open(os.path.join(options.inputPath, options.testFile), "r") as iFile: | 229 | with open(os.path.join(options.inputPath, options.testFile), "r") as iFile: |
367 | - # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
368 | for line in iFile.readlines(): | 230 | for line in iFile.readlines(): |
369 | listLine = [] | 231 | listLine = [] |
370 | line = line.strip('\n') | 232 | line = line.strip('\n') |
371 | for token in line.split(): | 233 | for token in line.split(): |
372 | - if options.filterStopWords: | 234 | + if options.excludeStopWords: |
373 | listToken = token.split('|') | 235 | listToken = token.split('|') |
374 | lemma = listToken[1] | 236 | lemma = listToken[1] |
375 | - # Original if lemma in stopwords.words('english'): | ||
376 | if lemma in stopwords: | 237 | if lemma in stopwords: |
377 | continue | 238 | continue |
378 | - if options.filterSymbols: | 239 | + if options.excludeSymbols: |
379 | listToken = token.split('|') | 240 | listToken = token.split('|') |
380 | lemma = listToken[1] | 241 | lemma = listToken[1] |
381 | if lemma in symbols: | 242 | if lemma in symbols: |
382 | - # if lemma == ',': | ||
383 | - # print "Coma , identificada" | ||
384 | continue | 243 | continue |
385 | listLine.append(token) | 244 | listLine.append(token) |
386 | sentencesTestData.append(listLine) | 245 | sentencesTestData.append(listLine) |
387 | print(" Sentences test data: " + str(len(sentencesTestData))) | 246 | print(" Sentences test data: " + str(len(sentencesTestData))) |
388 | - # print sentencesTestData[0] | ||
389 | 247 | ||
390 | print("Reading corpus done in: %fs" % (time() - t0)) | 248 | print("Reading corpus done in: %fs" % (time() - t0)) |
391 | 249 | ||
392 | print(sent2features(sentencesTrainingData[0])[0]) | 250 | print(sent2features(sentencesTrainingData[0])[0]) |
393 | print(sent2features(sentencesTestData[0])[0]) | 251 | print(sent2features(sentencesTestData[0])[0]) |
394 | - # print(sent2labels(sentencesTrainingData[0])) | ||
395 | - # print(sent2labels(sentencesTestData[0])) | ||
396 | t0 = time() | 252 | t0 = time() |
397 | 253 | ||
398 | X_train = [sent2features(s) for s in sentencesTrainingData] | 254 | X_train = [sent2features(s) for s in sentencesTrainingData] |
... | @@ -445,8 +301,8 @@ if __name__ == "__main__": | ... | @@ -445,8 +301,8 @@ if __name__ == "__main__": |
445 | 301 | ||
446 | # Best hiperparameters | 302 | # Best hiperparameters |
447 | # crf = rs.best_estimator_ | 303 | # crf = rs.best_estimator_ |
448 | - nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 304 | + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
449 | - options.filterSymbols) + '.txt') | 305 | + options.excludeSymbols) + '.txt') |
450 | with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile: | 306 | with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile: |
451 | oFile.write("********** TRAINING AND TESTING REPORT **********\n") | 307 | oFile.write("********** TRAINING AND TESTING REPORT **********\n") |
452 | oFile.write("Training file: " + options.trainingFile + '\n') | 308 | oFile.write("Training file: " + options.trainingFile + '\n') |
... | @@ -464,23 +320,23 @@ if __name__ == "__main__": | ... | @@ -464,23 +320,23 @@ if __name__ == "__main__": |
464 | # Saving model | 320 | # Saving model |
465 | print(" Saving training model...") | 321 | print(" Saving training model...") |
466 | t1 = time() | 322 | t1 = time() |
467 | - nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 323 | + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
468 | - options.filterSymbols) + '.mod') | 324 | + options.excludeSymbols) + '.mod') |
469 | joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel)) | 325 | joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel)) |
470 | print(" Saving training model done in: %fs" % (time() - t1)) | 326 | print(" Saving training model done in: %fs" % (time() - t1)) |
471 | 327 | ||
472 | # Evaluation against test data | 328 | # Evaluation against test data |
473 | y_pred = crf.predict(X_test) | 329 | y_pred = crf.predict(X_test) |
474 | print("*********************************") | 330 | print("*********************************") |
475 | - name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 331 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
476 | - options.filterSymbols) + '.txt') | 332 | + options.excludeSymbols) + '.txt') |
477 | with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile: | 333 | with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile: |
478 | for y in y_pred: | 334 | for y in y_pred: |
479 | oFile.write(str(y) + '\n') | 335 | oFile.write(str(y) + '\n') |
480 | 336 | ||
481 | print("*********************************") | 337 | print("*********************************") |
482 | - name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 338 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
483 | - options.filterSymbols) + '.txt') | 339 | + options.excludeSymbols) + '.txt') |
484 | with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile: | 340 | with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile: |
485 | for y in y_test: | 341 | for y in y_test: |
486 | oFile.write(str(y) + '\n') | 342 | oFile.write(str(y) + '\n') | ... | ... |
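For reference, a minimal sketch of what the trimmed-down feature extractor amounts to after this change: only the word/lemma surface forms plus their suffix and prefix slices remain, and the contextual (±1..±3) block is disabled by wrapping it in a triple-quoted string rather than by deleting it. Names and the word|lemma|postag|tag token format follow the script; this is an illustration, not the file itself.

# Minimal sketch of the simplified feature extractor after this change.
# Tokens are assumed to be encoded as "word|lemma|postag|tag", as in the script.
def word2features(sent, i):
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    return {
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word': word,
        'lemma': lemma,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        # Prefixes
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[:1]': word[:1],
    }

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

Since the context-window code is only commented out with the two ''' markers, re-enabling it later is a matter of removing those markers rather than restoring the deleted feature lines.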
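The renamed options change names, not behaviour: when --excludeStopWords or --excludeSymbols is set, tokens whose lemma is an NLTK English stop word or one of the listed punctuation symbols are skipped while each corpus line is read. A minimal sketch of that filter follows; filter_tokens is a hypothetical helper name, since the script inlines this loop for both the training and the test file.

# Sketch of the token filter behind --excludeStopWords / --excludeSymbols.
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))
SYMBOLS = {'.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_',
           '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#',
           '&', '°', '`', '...'}

def filter_tokens(line, exclude_stopwords=False, exclude_symbols=False):
    kept = []
    for token in line.split():
        lemma = token.split('|')[1]  # second field of "word|lemma|postag|tag"
        if exclude_stopwords and lemma in STOPWORDS:
            continue
        if exclude_symbols and lemma in SYMBOLS:
            continue
        kept.append(token)
    return kept

On the command line the new flags would simply be appended to the example invocation in the header comment, e.g. --excludeStopWords --excludeSymbols.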
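The report, model, and prediction file names encode both flags; a worked example of the naming scheme, assuming the training file from the usage comment and both options left at their defaults:

# How the renamed options end up in the output file names (defaults: both False).
trainingFile = 'training-data-set-70.txt'
nameReport = trainingFile.replace('.txt', '.fStopWords_False.fSymbols_False.txt')
# -> 'training-data-set-70.fStopWords_False.fSymbols_False.txt'
# The saved model uses the same stem with a '.mod' extension instead of '.txt'.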