Carlos-Francisco Méndez-Cruz

Feature extraction and vectorizer three sentences

......@@ -58,15 +58,16 @@ if __name__ == "__main__":
print("Reading documents...")
documents = []
# Read documents from input path
for path, dirs, files in os.walk(args.outputPath):
for path, dirs, files in os.walk(args.inputPath):
for file in files:
with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
print("...{}".format(file))
# Add file to document list
documents.append(iFile.read())
print(" Documents: {}".format(len(documents)))
# Create vectorizer
print('Vectorizer: {}'.format(args.vectorizer))
print(' Vectorizer: {}'.format(args.vectorizer))
if args.vectorizer == "b":
# Binary vectorizer
vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
......@@ -78,7 +79,7 @@ if __name__ == "__main__":
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
print(' matrix.shape: ', matrix.shape)
print(' matrix.shape: ', matrix.shape)
with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
oFile.write("Vectorizer: {}".format(args.vectorizer))
......