Showing 1 changed file with 6 additions and 5 deletions
| ... | @@ -332,14 +332,15 @@ if __name__ == "__main__": | ... | @@ -332,14 +332,15 @@ if __name__ == "__main__": |
| 332 | sentencesTrainingData = [] | 332 | sentencesTrainingData = [] |
| 333 | sentencesTestData = [] | 333 | sentencesTestData = [] |
| 334 | 334 | ||
| 335 | - stopwords = [word.decode('utf-8') for word in stopwords.words('english')] | 335 | + # Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')] |
| 336 | + stopwords = [word for word in stopwords.words('english')] | ||
| 336 | 337 | ||
| 337 | with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: | 338 | with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: |
| 338 | # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile: | 339 | # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile: |
| 339 | for line in iFile.readlines(): | 340 | for line in iFile.readlines(): |
| 340 | listLine = [] | 341 | listLine = [] |
| 341 | - line = line.decode("utf-8") | 342 | + line = line.strip('\n') |
| 342 | - for token in line.strip('\n').split(): | 343 | + for token in line.split(): |
| 343 | if options.filterStopWords: | 344 | if options.filterStopWords: |
| 344 | listToken = token.split('|') | 345 | listToken = token.split('|') |
| 345 | lemma = listToken[1] | 346 | lemma = listToken[1] |
| ... | @@ -366,8 +367,8 @@ if __name__ == "__main__": | ... | @@ -366,8 +367,8 @@ if __name__ == "__main__": |
| 366 | # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile: | 367 | # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile: |
| 367 | for line in iFile.readlines(): | 368 | for line in iFile.readlines(): |
| 368 | listLine = [] | 369 | listLine = [] |
| 369 | - line = line.decode("utf-8") | 370 | + line = line.strip('\n') |
| 370 | - for token in line.strip('\n').split(): | 371 | + for token in line.split(): |
| 371 | if options.filterStopWords: | 372 | if options.filterStopWords: |
| 372 | listToken = token.split('|') | 373 | listToken = token.split('|') |
| 373 | lemma = listToken[1] | 374 | lemma = listToken[1] | ... | ... |
-
Please register or login to post a comment