Carlos-Francisco Méndez-Cruz

Obtaining training and test data sets

...@@ -332,14 +332,15 @@ if __name__ == "__main__": ...@@ -332,14 +332,15 @@ if __name__ == "__main__":
332 sentencesTrainingData = [] 332 sentencesTrainingData = []
333 sentencesTestData = [] 333 sentencesTestData = []
334 334
335 - stopwords = [word.decode('utf-8') for word in stopwords.words('english')] 335 + # Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
336 + stopwords = [word for word in stopwords.words('english')]
336 337
337 with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: 338 with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
338 # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile: 339 # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
339 for line in iFile.readlines(): 340 for line in iFile.readlines():
340 listLine = [] 341 listLine = []
341 - line = line.decode("utf-8") 342 + line = line.strip('\n')
342 - for token in line.strip('\n').split(): 343 + for token in line.split():
343 if options.filterStopWords: 344 if options.filterStopWords:
344 listToken = token.split('|') 345 listToken = token.split('|')
345 lemma = listToken[1] 346 lemma = listToken[1]
...@@ -366,8 +367,8 @@ if __name__ == "__main__": ...@@ -366,8 +367,8 @@ if __name__ == "__main__":
366 # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile: 367 # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
367 for line in iFile.readlines(): 368 for line in iFile.readlines():
368 listLine = [] 369 listLine = []
369 - line = line.decode("utf-8") 370 + line = line.strip('\n')
370 - for token in line.strip('\n').split(): 371 + for token in line.split():
371 if options.filterStopWords: 372 if options.filterStopWords:
372 listToken = token.split('|') 373 listToken = token.split('|')
373 lemma = listToken[1] 374 lemma = listToken[1]
......