Carlos-Francisco Méndez-Cruz

Obtaining the training and test data sets (Python 3 migration: remove bytes.decode calls, since files are read as str).

......@@ -332,14 +332,15 @@ if __name__ == "__main__":
sentencesTrainingData = []
sentencesTestData = []
stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
# Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
stopwords = [word for word in stopwords.words('english')]
with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
# with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.decode("utf-8")
for token in line.strip('\n').split():
line = line.strip('\n')
for token in line.split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
......@@ -366,8 +367,8 @@ if __name__ == "__main__":
# with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.decode("utf-8")
for token in line.strip('\n').split():
line = line.strip('\n')
for token in line.split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
......