Carlos-Francisco Méndez-Cruz

Feature extraction and vectorizer for three sentences

...@@ -58,15 +58,16 @@ if __name__ == "__main__": ...@@ -58,15 +58,16 @@ if __name__ == "__main__":
58 print("Reading documents...") 58 print("Reading documents...")
59 documents = [] 59 documents = []
60 # Read documents from input path 60 # Read documents from input path
61 - for path, dirs, files in os.walk(args.outputPath): 61 + for path, dirs, files in os.walk(args.inputPath):
62 for file in files: 62 for file in files:
63 with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: 63 with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
64 print("...{}".format(file)) 64 print("...{}".format(file))
65 # Add file to document list 65 # Add file to document list
66 documents.append(iFile.read()) 66 documents.append(iFile.read())
67 + print(" Documents: {}".format(len(documents)))
67 68
68 # Create vectorizer 69 # Create vectorizer
69 - print('Vectorizer: {}'.format(args.vectorizer)) 70 + print(' Vectorizer: {}'.format(args.vectorizer))
70 if args.vectorizer == "b": 71 if args.vectorizer == "b":
71 # Binary vectorizer 72 # Binary vectorizer
72 vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True) 73 vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
...@@ -78,7 +79,7 @@ if __name__ == "__main__": ...@@ -78,7 +79,7 @@ if __name__ == "__main__":
78 vectorizer = TfidfVectorizer(ngram_range=(1, 1)) 79 vectorizer = TfidfVectorizer(ngram_range=(1, 1))
79 80
80 matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double') 81 matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
81 - print(' matrix.shape: ', matrix.shape) 82 + print(' matrix.shape: ', matrix.shape)
82 83
83 with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile: 84 with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
84 oFile.write("Vectorizer: {}".format(args.vectorizer)) 85 oFile.write("Vectorizer: {}".format(args.vectorizer))
......