Showing 1 changed file with 30 additions and 174 deletions
| ... | @@ -32,7 +32,7 @@ from nltk.corpus import stopwords | ... | @@ -32,7 +32,7 @@ from nltk.corpus import stopwords |
| 32 | # --testFile File with test data set | 32 | # --testFile File with test data set |
| 33 | # --outputPath=PATH Output path to place output files | 33 | # --outputPath=PATH Output path to place output files |
| 34 | # --filteringStopWords Filtering stop words | 34 | # --filteringStopWords Filtering stop words |
| 35 | -# --filterSymbols Filtering punctuation marks | 35 | +# --excludeSymbols Filtering punctuation marks |
| 36 | 36 | ||
| 37 | # Output | 37 | # Output |
| 38 | # 1) Best model | 38 | # 1) Best model |
| ... | @@ -42,116 +42,44 @@ from nltk.corpus import stopwords | ... | @@ -42,116 +42,44 @@ from nltk.corpus import stopwords |
| 42 | # --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets | 42 | # --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets |
| 43 | # --trainingFile training-data-set-70.txt | 43 | # --trainingFile training-data-set-70.txt |
| 44 | # --testFile test-data-set-30.txt | 44 | # --testFile test-data-set-30.txt |
| 45 | -# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports | 45 | +# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields |
| 46 | -# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports | 46 | +# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields |
| 47 | 47 | ||
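For completeness, an invocation that exercises the renamed exclusion flags would look like the following (a hypothetical usage; the example above only documents the bare run, and both flags default to False):

```
python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields --excludeStopWords --excludeSymbols
```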
| 48 | ################################# | 48 | ################################# |
| 49 | # FUNCTIONS # | 49 | # FUNCTIONS # |
| 50 | ################################# | 50 | ################################# |
| 51 | 51 | ||
| 52 | -def wordSize(text): | ||
| 53 | - lWord = len(text) | ||
| 54 | - if lWord == 1: | ||
| 55 | - return '1' | ||
| 56 | - elif lWord == 2: | ||
| 57 | - return '2' | ||
| 58 | - elif lWord == 3: | ||
| 59 | - return '3' | ||
| 60 | - elif lWord == 4: | ||
| 61 | - return '4' | ||
| 62 | - elif lWord == 5: | ||
| 63 | - return '5' | ||
| 64 | - elif 6 <= lWord <= 10: | ||
| 65 | - return '6-10' | ||
| 66 | - elif 11 <= lWord <= 15: | ||
| 67 | - return '11-15' | ||
| 68 | - elif 16 <= lWord <= 20: | ||
| 69 | - return '16-20' | ||
| 70 | - elif 21 <= lWord <= 30: | ||
| 71 | - return '21-30' | ||
| 72 | - else: | ||
| 73 | - return '>30' | ||
| 74 | - | ||
| 75 | -def hasUpperLower(text): | ||
| 76 | - has = False | ||
| 77 | - if len(text) < 3: | ||
| 78 | - return False | ||
| 79 | - regexUp = nltk.re.compile('[A-Z]') | ||
| 80 | - regexLo = nltk.re.compile('[a-z]') | ||
| 81 | - if (regexUp.search(text) != None) and (regexLo.search(text) != None): | ||
| 82 | - has = True | ||
| 83 | - return has | ||
| 84 | - | ||
| 85 | -def hasDigit(text): | ||
| 86 | - has = False | ||
| 87 | - if len(text) < 3: | ||
| 88 | - return False | ||
| 89 | - myRegex = nltk.re.compile('[0-9]') | ||
| 90 | - if myRegex.search(text) != None: | ||
| 91 | - has = True | ||
| 92 | - return has | ||
| 93 | - | ||
| 94 | - | ||
| 95 | -def hasNonAlphaNum(text): | ||
| 96 | - has = False | ||
| 97 | - if len(text) < 3: | ||
| 98 | - return False | ||
| 99 | - myRegex = nltk.re.compile('\W') | ||
| 100 | - if myRegex.search(text) != None: | ||
| 101 | - has = True | ||
| 102 | - return has | ||
| 103 | - | ||
| 104 | def word2features(sent, i): | 52 | def word2features(sent, i): |
| 105 | - # print "i: " + str(i) | ||
| 106 | - # print "sent[i]" + sent[i] | ||
| 107 | listElem = sent[i].split('|') | 53 | listElem = sent[i].split('|') |
| 108 | word = listElem[0] | 54 | word = listElem[0] |
| 109 | lemma = listElem[1] | 55 | lemma = listElem[1] |
| 110 | postag = listElem[2] | 56 | postag = listElem[2] |
| 111 | 57 | ||
| 112 | features = { | 58 | features = { |
| 113 | - # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(), | ||
| 114 | # Suffixes | 59 | # Suffixes |
| 115 | 'word[-3:]': word[-3:], | 60 | 'word[-3:]': word[-3:], |
| 116 | 'word[-2:]': word[-2:], | 61 | 'word[-2:]': word[-2:], |
| 117 | 'word[-1:]': word[-1:], | 62 | 'word[-1:]': word[-1:], |
| 118 | - 'word.isupper()': word.isupper(), | 63 | + #'word.isupper()': word.isupper(), |
| 119 | - 'word.istitle()': word.istitle(), | ||
| 120 | - 'word.hasDigit()': hasDigit(word), | ||
| 121 | - 'word.hasNonAlphaNum': hasNonAlphaNum(word), | ||
| 122 | - # 'word.hasUpperLower': hasUpperLower(word), | ||
| 123 | - #'wordSize': wordSize(word), | ||
| 124 | - # 'word.isdigit()': word.isdigit(), | ||
| 125 | 'word': word, | 64 | 'word': word, |
| 126 | 'lemma': lemma, | 65 | 'lemma': lemma, |
| 127 | 'lemma[-3:]': lemma[-3:], | 66 | 'lemma[-3:]': lemma[-3:], |
| 128 | 'lemma[-2:]': lemma[-2:], | 67 | 'lemma[-2:]': lemma[-2:], |
| 129 | 'lemma[-1:]': lemma[-1:], | 68 | 'lemma[-1:]': lemma[-1:], |
| 130 | - 'postag': postag, | 69 | + 'word[:3]': word[:3], |
| 131 | - # Prefixes | 70 | + 'word[:2]': word[:2], |
| 132 | - 'postag[:2]': postag[:2], | 71 | + 'word[:1]': word[:1], |
| 133 | - 'postag[:1]': postag[:1], | ||
| 134 | } | 72 | } |
| 73 | + ''' | ||
| 135 | if i > 0: | 74 | if i > 0: |
| 136 | listElem = sent[i - 1].split('|') | 75 | listElem = sent[i - 1].split('|') |
| 137 | word1 = listElem[0] | 76 | word1 = listElem[0] |
| 138 | lemma1 = listElem[1] | 77 | lemma1 = listElem[1] |
| 139 | postag1 = listElem[2] | 78 | postag1 = listElem[2] |
| 140 | features.update({ | 79 | features.update({ |
| 141 | - '-1:word.lower()': word1.lower(), | ||
| 142 | - '-1:word.istitle()': word1.istitle(), | ||
| 143 | - '-1:word.isupper()': word1.isupper(), | ||
| 144 | - '-1:word.hasDigit()': hasDigit(word1), | ||
| 145 | - '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
| 146 | - # '-1:word.hasUpperLower': hasUpperLower(word1), | ||
| 147 | '-1:word': word1, | 80 | '-1:word': word1, |
| 148 | '-1:lemma': lemma1, | 81 | '-1:lemma': lemma1, |
| 149 | - '-1:postag': postag1, | ||
| 150 | - '-1:postag[:2]': postag1[:2], | ||
| 151 | - '-1:postag[:1]': postag1[:1], | ||
| 152 | }) | 82 | }) |
| 153 | - # else: | ||
| 154 | - # features['BOS'] = True | ||
| 155 | 83 | ||
| 156 | if i < len(sent) - 1: | 84 | if i < len(sent) - 1: |
| 157 | listElem = sent[i + 1].split('|') | 85 | listElem = sent[i + 1].split('|') |
| ... | @@ -159,37 +87,18 @@ def word2features(sent, i): | ... | @@ -159,37 +87,18 @@ def word2features(sent, i): |
| 159 | lemma1 = listElem[1] | 87 | lemma1 = listElem[1] |
| 160 | postag1 = listElem[2] | 88 | postag1 = listElem[2] |
| 161 | features.update({ | 89 | features.update({ |
| 162 | - '+1:word.lower()': word1.lower(), | ||
| 163 | - '+1:word.istitle()': word1.istitle(), | ||
| 164 | - '+1:word.isupper()': word1.isupper(), | ||
| 165 | - '+1:word.hasDigit()': hasDigit(word1), | ||
| 166 | - '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1), | ||
| 167 | - # '+1:word.hasUpperLower': hasUpperLower(word1), | ||
| 168 | '+1:word': word1, | 90 | '+1:word': word1, |
| 169 | '+1:lemma': lemma1, | 91 | '+1:lemma': lemma1, |
| 170 | - '+1:postag': postag1, | ||
| 171 | - '+1:postag[:2]': postag1[:2], | ||
| 172 | - '+1:postag[:1]': postag1[:1], | ||
| 173 | }) | 92 | }) |
| 174 | - # else: | 93 | + |
| 175 | - # features['EOS'] = True | ||
| 176 | if i > 1: | 94 | if i > 1: |
| 177 | listElem = sent[i - 2].split('|') | 95 | listElem = sent[i - 2].split('|') |
| 178 | word2 = listElem[0] | 96 | word2 = listElem[0] |
| 179 | lemma2 = listElem[1] | 97 | lemma2 = listElem[1] |
| 180 | postag2 = listElem[2] | 98 | postag2 = listElem[2] |
| 181 | features.update({ | 99 | features.update({ |
| 182 | - '-2:word.lower()': word2.lower(), | ||
| 183 | - '-2:word.istitle()': word2.istitle(), | ||
| 184 | - '-2:word.isupper()': word2.isupper(), | ||
| 185 | - '-2:word.hasDigit()': hasDigit(word2), | ||
| 186 | - '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
| 187 | - # '-2:word.hasUpperLower': hasUpperLower(word2), | ||
| 188 | '-2:word': word2, | 100 | '-2:word': word2, |
| 189 | '-2:lemma': lemma2, | 101 | '-2:lemma': lemma2, |
| 190 | - '-2:postag': postag2, | ||
| 191 | - '-2:postag[:2]': postag2[:2], | ||
| 192 | - '-2:postag[:1]': postag2[:1], | ||
| 193 | }) | 102 | }) |
| 194 | 103 | ||
| 195 | if i < len(sent) - 2: | 104 | if i < len(sent) - 2: |
| ... | @@ -198,17 +107,8 @@ def word2features(sent, i): | ... | @@ -198,17 +107,8 @@ def word2features(sent, i): |
| 198 | lemma2 = listElem[1] | 107 | lemma2 = listElem[1] |
| 199 | postag2 = listElem[2] | 108 | postag2 = listElem[2] |
| 200 | features.update({ | 109 | features.update({ |
| 201 | - '+2:word.lower()': word2.lower(), | ||
| 202 | - '+2:word.istitle()': word2.istitle(), | ||
| 203 | - '+2:word.isupper()': word2.isupper(), | ||
| 204 | - '+2:word.hasDigit()': hasDigit(word2), | ||
| 205 | - '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2), | ||
| 206 | - # '+2:word.hasUpperLower': hasUpperLower(word2), | ||
| 207 | '+2:word': word2, | 110 | '+2:word': word2, |
| 208 | '+2:lemma': lemma2, | 111 | '+2:lemma': lemma2, |
| 209 | - '+2:postag': postag2, | ||
| 210 | - '+2:postag[:2]': postag2[:2], | ||
| 211 | - '+2:postag[:1]': postag2[:1], | ||
| 212 | }) | 112 | }) |
| 213 | 113 | ||
| 214 | trigrams = False | 114 | trigrams = False |
| ... | @@ -219,17 +119,8 @@ def word2features(sent, i): | ... | @@ -219,17 +119,8 @@ def word2features(sent, i): |
| 219 | lemma3 = listElem[1] | 119 | lemma3 = listElem[1] |
| 220 | postag3 = listElem[2] | 120 | postag3 = listElem[2] |
| 221 | features.update({ | 121 | features.update({ |
| 222 | - '-3:word.lower()': word3.lower(), | ||
| 223 | - '-3:word.istitle()': word3.istitle(), | ||
| 224 | - '-3:word.isupper()': word3.isupper(), | ||
| 225 | - '-3:word.hasDigit()': hasDigit(word3), | ||
| 226 | - '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
| 227 | - # '-3:word.hasUpperLower': hasUpperLower(word3), | ||
| 228 | '-3:word': word3, | 122 | '-3:word': word3, |
| 229 | '-3:lemma': lemma3, | 123 | '-3:lemma': lemma3, |
| 230 | - '-3:postag': postag3, | ||
| 231 | - '-3:postag[:2]': postag3[:2], | ||
| 232 | - '-3:postag[:1]': postag3[:1], | ||
| 233 | }) | 124 | }) |
| 234 | 125 | ||
| 235 | if i < len(sent) - 3: | 126 | if i < len(sent) - 3: |
| ... | @@ -238,19 +129,10 @@ def word2features(sent, i): | ... | @@ -238,19 +129,10 @@ def word2features(sent, i): |
| 238 | lemma3 = listElem[1] | 129 | lemma3 = listElem[1] |
| 239 | postag3 = listElem[2] | 130 | postag3 = listElem[2] |
| 240 | features.update({ | 131 | features.update({ |
| 241 | - '+3:word.lower()': word3.lower(), | ||
| 242 | - '+3:word.istitle()': word3.istitle(), | ||
| 243 | - '+3:word.isupper()': word3.isupper(), | ||
| 244 | - '+3:word.hasDigit()': hasDigit(word3), | ||
| 245 | - '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3), | ||
| 246 | - # '+3:word.hasUpperLower': hasUpperLower(word3), | ||
| 247 | '+3:word': word3, | 132 | '+3:word': word3, |
| 248 | '+3:lemma': lemma3, | 133 | '+3:lemma': lemma3, |
| 249 | - '+3:postag': postag3, | ||
| 250 | - '+3:postag[:2]': postag3[:2], | ||
| 251 | - '+3:postag[:1]': postag3[:1], | ||
| 252 | }) | 134 | }) |
| 253 | - | 135 | + ''' |
| 254 | return features | 136 | return features |
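To make the token format concrete: each token is a pipe-delimited string `word|lemma|postag|label`, and with the context-window block disabled by the `'''` quotes, only the base dictionary is returned. A minimal sketch with an invented token (the `GENE` label and the tags are hypothetical):

```python
# Hypothetical two-token sentence in the word|lemma|postag|label
# format this script expects; tags and labels invented for illustration.
sent = ['Fis|fis|NN|GENE', 'regulates|regulate|VBZ|O']

# With the context features commented out, word2features returns only
# word/lemma identity plus character prefixes and suffixes:
assert word2features(sent, 0) == {
    'word[-3:]': 'Fis', 'word[-2:]': 'is', 'word[-1:]': 's',
    'word': 'Fis', 'lemma': 'fis',
    'lemma[-3:]': 'fis', 'lemma[-2:]': 'is', 'lemma[-1:]': 's',
    'word[:3]': 'Fis', 'word[:2]': 'Fi', 'word[:1]': 'F',
}
```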
| 255 | 137 | ||
| 256 | 138 | ||
| ... | @@ -260,7 +142,6 @@ def sent2features(sent): | ... | @@ -260,7 +142,6 @@ def sent2features(sent): |
| 260 | 142 | ||
| 261 | def sent2labels(sent): | 143 | def sent2labels(sent): |
| 262 | return [elem.split('|')[3] for elem in sent] | 144 | return [elem.split('|')[3] for elem in sent] |
| 263 | - # return [label for token, postag, label in sent] | ||
| 264 | 145 | ||
| 265 | 146 | ||
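Correspondingly, `sent2labels` keeps only the fourth pipe-delimited field (the label), so for the hypothetical sentence above:

```python
assert sent2labels(['Fis|fis|NN|GENE', 'regulates|regulate|VBZ|O']) == ['GENE', 'O']
```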
| 266 | def sent2tokens(sent): | 147 | def sent2tokens(sent): |
| ... | @@ -269,19 +150,11 @@ def sent2tokens(sent): | ... | @@ -269,19 +150,11 @@ def sent2tokens(sent): |
| 269 | 150 | ||
| 270 | def print_transitions(trans_features, f): | 151 | def print_transitions(trans_features, f): |
| 271 | for (label_from, label_to), weight in trans_features: | 152 | for (label_from, label_to), weight in trans_features: |
| 272 | - # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight)) | ||
| 273 | - # f.write("label_from :" + label_from) | ||
| 274 | - # f.write("label_to :" + label_to) | ||
| 275 | - # f.write("label_weight :" + weight) | ||
| 276 | - # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight)) | ||
| 277 | f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight)) | 153 | f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight)) |
| 278 | 154 | ||
| 279 | 155 | ||
| 280 | def print_state_features(state_features, f): | 156 | def print_state_features(state_features, f): |
| 281 | for (attr, label), weight in state_features: | 157 | for (attr, label), weight in state_features: |
| 282 | - # f.write("%0.6f %-8s %s\n" % (weight, label, attr)) | ||
| 283 | - # f.write(attr.encode("utf-8")) | ||
| 284 | - # '{:06.2f}'.format(3.141592653589793) | ||
| 285 | f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8"))) | 158 | f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8"))) |
| 286 | 159 | ||
| 287 | 160 | ||
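One caveat in the retained line above: under Python 3, `attr.encode("utf-8")` yields a `bytes` object, so the report renders each attribute as `b'...'`. If that is unintended, writing the string directly (with the file opened using an explicit UTF-8 encoding) avoids it; a minimal sketch:

```python
def print_state_features(state_features, f):
    # f should be opened with encoding="utf-8"; writing attr directly
    # avoids the b'...' repr that str.encode() produces under Python 3.
    for (attr, label), weight in state_features:
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr))
```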
| ... | @@ -303,12 +176,12 @@ if __name__ == "__main__": | ... | @@ -303,12 +176,12 @@ if __name__ == "__main__": |
| 303 | help="File with training data set", metavar="FILE") | 176 | help="File with training data set", metavar="FILE") |
| 304 | parser.add_option("--testFile", dest="testFile", | 177 | parser.add_option("--testFile", dest="testFile", |
| 305 | help="File with test data set", metavar="FILE") | 178 | help="File with test data set", metavar="FILE") |
| 306 | - parser.add_option("--filterStopWords", default=False, | 179 | + parser.add_option("--excludeStopWords", default=False, |
| 307 | - action="store_true", dest="filterStopWords", | 180 | + action="store_true", dest="excludeStopWords", |
| 308 | - help="Filtering stop words") | 181 | + help="Exclude stop words") |
| 309 | - parser.add_option("--filterSymbols", default=False, | 182 | + parser.add_option("--excludeSymbols", default=False, |
| 310 | - action="store_true", dest="filterSymbols", | 183 | + action="store_true", dest="excludeSymbols", |
| 311 | - help="Filtering punctuation marks") | 184 | + help="Exclude punctuation marks") |
| 312 | 185 | ||
| 313 | (options, args) = parser.parse_args() | 186 | (options, args) = parser.parse_args() |
| 314 | if len(args) > 0: | 187 | if len(args) > 0: |
| ... | @@ -320,10 +193,10 @@ if __name__ == "__main__": | ... | @@ -320,10 +193,10 @@ if __name__ == "__main__": |
| 320 | print("File with training data set: " + str(options.trainingFile)) | 193 | print("File with training data set: " + str(options.trainingFile)) |
| 321 | print("Path of test data set: " + options.inputPath) | 194 | print("Path of test data set: " + options.inputPath) |
| 322 | print("File with test data set: " + str(options.testFile)) | 195 | print("File with test data set: " + str(options.testFile)) |
| 323 | - print("Filtering stop words: " + str(options.filterStopWords)) | 196 | + print("Exclude stop words: " + str(options.excludeStopWords)) |
| 324 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | 197 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', |
| 325 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | 198 | '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] |
| 326 | - print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols)) | 199 | + print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols)) |
| 327 | 200 | ||
| 328 | print('-------------------------------- PROCESSING --------------------------------') | 201 | print('-------------------------------- PROCESSING --------------------------------') |
| 329 | print('Reading corpus...') | 202 | print('Reading corpus...') |
| ... | @@ -332,67 +205,50 @@ if __name__ == "__main__": | ... | @@ -332,67 +205,50 @@ if __name__ == "__main__": |
| 332 | sentencesTrainingData = [] | 205 | sentencesTrainingData = [] |
| 333 | sentencesTestData = [] | 206 | sentencesTestData = [] |
| 334 | 207 | ||
| 335 | - # Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')] | ||
| 336 | stopwords = [word for word in stopwords.words('english')] | 208 | stopwords = [word for word in stopwords.words('english')] |
| 337 | 209 | ||
| 338 | with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: | 210 | with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: |
| 339 | - # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
| 340 | for line in iFile.readlines(): | 211 | for line in iFile.readlines(): |
| 341 | listLine = [] | 212 | listLine = [] |
| 342 | line = line.strip('\n') | 213 | line = line.strip('\n') |
| 343 | for token in line.split(): | 214 | for token in line.split(): |
| 344 | - if options.filterStopWords: | 215 | + if options.excludeStopWords: |
| 345 | listToken = token.split('|') | 216 | listToken = token.split('|') |
| 346 | lemma = listToken[1] | 217 | lemma = listToken[1] |
| 347 | - # Original: if lemma in stopwords.words('english'): | ||
| 348 | - # trainingTesting_Sklearn_crfsuite.py:269: | ||
| 349 | - # UnicodeWarning: Unicode equal comparison failed to | ||
| 350 | - # convert both arguments to Unicode - | ||
| 351 | - # interpreting them as being unequal | ||
| 352 | if lemma in stopwords: | 218 | if lemma in stopwords: |
| 353 | continue | 219 | continue |
| 354 | - if options.filterSymbols: | 220 | + if options.excludeSymbols: |
| 355 | listToken = token.split('|') | 221 | listToken = token.split('|') |
| 356 | lemma = listToken[1] | 222 | lemma = listToken[1] |
| 357 | if lemma in symbols: | 223 | if lemma in symbols: |
| 358 | - # if lemma == ',': | ||
| 359 | - # print "Coma , identificada" | ||
| 360 | continue | 224 | continue |
| 361 | listLine.append(token) | 225 | listLine.append(token) |
| 362 | sentencesTrainingData.append(listLine) | 226 | sentencesTrainingData.append(listLine) |
| 363 | print(" Sentences training data: " + str(len(sentencesTrainingData))) | 227 | print(" Sentences training data: " + str(len(sentencesTrainingData))) |
| 364 | - # print sentencesTrainingData[0] | ||
| 365 | 228 | ||
| 366 | with open(os.path.join(options.inputPath, options.testFile), "r") as iFile: | 229 | with open(os.path.join(options.inputPath, options.testFile), "r") as iFile: |
| 367 | - # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile: | ||
| 368 | for line in iFile.readlines(): | 230 | for line in iFile.readlines(): |
| 369 | listLine = [] | 231 | listLine = [] |
| 370 | line = line.strip('\n') | 232 | line = line.strip('\n') |
| 371 | for token in line.split(): | 233 | for token in line.split(): |
| 372 | - if options.filterStopWords: | 234 | + if options.excludeStopWords: |
| 373 | listToken = token.split('|') | 235 | listToken = token.split('|') |
| 374 | lemma = listToken[1] | 236 | lemma = listToken[1] |
| 375 | - # Original if lemma in stopwords.words('english'): | ||
| 376 | if lemma in stopwords: | 237 | if lemma in stopwords: |
| 377 | continue | 238 | continue |
| 378 | - if options.filterSymbols: | 239 | + if options.excludeSymbols: |
| 379 | listToken = token.split('|') | 240 | listToken = token.split('|') |
| 380 | lemma = listToken[1] | 241 | lemma = listToken[1] |
| 381 | if lemma in symbols: | 242 | if lemma in symbols: |
| 382 | - # if lemma == ',': | ||
| 383 | - # print "Coma , identificada" | ||
| 384 | continue | 243 | continue |
| 385 | listLine.append(token) | 244 | listLine.append(token) |
| 386 | sentencesTestData.append(listLine) | 245 | sentencesTestData.append(listLine) |
| 387 | print(" Sentences test data: " + str(len(sentencesTestData))) | 246 | print(" Sentences test data: " + str(len(sentencesTestData))) |
| 388 | - # print sentencesTestData[0] | ||
| 389 | 247 | ||
| 390 | print("Reading corpus done in: %fs" % (time() - t0)) | 248 | print("Reading corpus done in: %fs" % (time() - t0)) |
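The training and test reading loops above are identical apart from the input file; a small helper would remove the duplication (a sketch only, preserving the same lemma-based exclusion logic):

```python
def read_corpus(path, exclude_stopwords, exclude_symbols, stopwords, symbols):
    # One sentence per line; drop tokens whose lemma (field 1) is a
    # stop word or punctuation symbol when the corresponding flag is set.
    sentences = []
    with open(path, "r") as iFile:
        for line in iFile:
            listLine = []
            for token in line.strip('\n').split():
                lemma = token.split('|')[1]
                if exclude_stopwords and lemma in stopwords:
                    continue
                if exclude_symbols and lemma in symbols:
                    continue
                listLine.append(token)
            sentences.append(listLine)
    return sentences
```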
| 391 | 249 | ||
| 392 | print(sent2features(sentencesTrainingData[0])[0]) | 250 | print(sent2features(sentencesTrainingData[0])[0]) |
| 393 | print(sent2features(sentencesTestData[0])[0]) | 251 | print(sent2features(sentencesTestData[0])[0]) |
| 394 | - # print(sent2labels(sentencesTrainingData[0])) | ||
| 395 | - # print(sent2labels(sentencesTestData[0])) | ||
| 396 | t0 = time() | 252 | t0 = time() |
| 397 | 253 | ||
| 398 | X_train = [sent2features(s) for s in sentencesTrainingData] | 254 | X_train = [sent2features(s) for s in sentencesTrainingData] |
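The hunk skips past the estimator setup, so the actual hyperparameters are not visible here. For orientation, a typical sklearn-crfsuite configuration for this kind of pipeline looks like the following (values assumed, not taken from this diff):

```python
import sklearn_crfsuite

# Assumed hyperparameters for illustration; the real values used by
# training-validation-v1.py live outside this hunk.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',             # L-BFGS optimization
    c1=0.1,                        # L1 regularization coefficient
    c2=0.1,                        # L2 regularization coefficient
    max_iterations=100,
    all_possible_transitions=True,
)
crf.fit(X_train, y_train)
```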
| ... | @@ -445,8 +301,8 @@ if __name__ == "__main__": | ... | @@ -445,8 +301,8 @@ if __name__ == "__main__": |
| 445 | 301 | ||
| 446 | # Best hyperparameters | 302 | # Best hyperparameters |
| 447 | # crf = rs.best_estimator_ | 303 | # crf = rs.best_estimator_ |
| 448 | - nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 304 | + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
| 449 | - options.filterSymbols) + '.txt') | 305 | + options.excludeSymbols) + '.txt') |
| 450 | with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile: | 306 | with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile: |
| 451 | oFile.write("********** TRAINING AND TESTING REPORT **********\n") | 307 | oFile.write("********** TRAINING AND TESTING REPORT **********\n") |
| 452 | oFile.write("Training file: " + options.trainingFile + '\n') | 308 | oFile.write("Training file: " + options.trainingFile + '\n') |
| ... | @@ -464,23 +320,23 @@ if __name__ == "__main__": | ... | @@ -464,23 +320,23 @@ if __name__ == "__main__": |
| 464 | # Saving model | 320 | # Saving model |
| 465 | print(" Saving training model...") | 321 | print(" Saving training model...") |
| 466 | t1 = time() | 322 | t1 = time() |
| 467 | - nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 323 | + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
| 468 | - options.filterSymbols) + '.mod') | 324 | + options.excludeSymbols) + '.mod') |
| 469 | joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel)) | 325 | joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel)) |
| 470 | print(" Saving training model done in: %fs" % (time() - t1)) | 326 | print(" Saving training model done in: %fs" % (time() - t1)) |
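Because the model is persisted with joblib, it can be reloaded later for tagging without retraining; a minimal sketch (the import path depends on the scikit-learn version in use):

```python
from sklearn.externals import joblib  # on newer scikit-learn: import joblib

# Reload the persisted CRF and tag feature sequences built the same way.
crf = joblib.load(os.path.join(options.outputPath, "models", nameModel))
y_pred = crf.predict(X_test)
```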
| 471 | 327 | ||
| 472 | # Evaluation against test data | 328 | # Evaluation against test data |
| 473 | y_pred = crf.predict(X_test) | 329 | y_pred = crf.predict(X_test) |
| 474 | print("*********************************") | 330 | print("*********************************") |
| 475 | - name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 331 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
| 476 | - options.filterSymbols) + '.txt') | 332 | + options.excludeSymbols) + '.txt') |
| 477 | with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile: | 333 | with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile: |
| 478 | for y in y_pred: | 334 | for y in y_pred: |
| 479 | oFile.write(str(y) + '\n') | 335 | oFile.write(str(y) + '\n') |
| 480 | 336 | ||
| 481 | print("*********************************") | 337 | print("*********************************") |
| 482 | - name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str( | 338 | + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str( |
| 483 | - options.filterSymbols) + '.txt') | 339 | + options.excludeSymbols) + '.txt') |
| 484 | with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile: | 340 | with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile: |
| 485 | for y in y_test: | 341 | for y in y_test: |
| 486 | oFile.write(str(y) + '\n') | 342 | oFile.write(str(y) + '\n') | ... | ... |
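Beyond dumping the raw label sequences, a per-label summary is easy to derive from the same arrays; a sketch using sklearn-crfsuite's metrics helper (the 'O' outside-label name is an assumption, and the elided report-writing code may already do this):

```python
from sklearn_crfsuite import metrics

# Per-label precision/recall/F1 over the flattened sequences,
# excluding the majority 'O' class.
labels = [l for l in crf.classes_ if l != 'O']
print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))
```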