Carlos-Francisco Méndez-Cruz

Feature extraction and vectorizer three sentences

......@@ -7,6 +7,7 @@ import argparse
import sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
__author__ = 'CMendezC'
......@@ -81,10 +82,15 @@ if __name__ == "__main__":
# Vectorize the documents and keep the result as a double-precision CSR matrix.
matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
print(' matrix.shape: ', matrix.shape)

# Pairwise cosine similarity between all document vectors.
# NOTE: with the default dense_output=True this returns a dense numpy
# ndarray even for sparse input, so it must not be treated as a sparse matrix.
similarityMatrix = cosine_similarity(matrix)
print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape))

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# when the installed version provides it and fall back otherwise.
featureNamesGetter = getattr(vectorizer, "get_feature_names_out", None)
if featureNamesGetter is None:
    featureNamesGetter = vectorizer.get_feature_names
# Normalize to a plain list of str so the report keeps the same list format
# regardless of which accessor was used.
featureNames = [str(name) for name in featureNamesGetter()]

# Write a plain-text report: vectorizer name, vocabulary, document-term
# matrix, and the cosine similarity matrix.
reportPath = os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer))
with open(reportPath, encoding="utf-8", mode="w") as oFile:
    oFile.write("Vectorizer: {}\n".format(args.vectorizer))
    oFile.write(str(featureNames))
    oFile.write("\n")
    oFile.write(str(matrix.toarray()))
    oFile.write("\n")
    # similarityMatrix is already a dense ndarray; calling .toarray() on it
    # raised AttributeError.
    oFile.write(str(similarityMatrix))

print("Feature extraction and vectorizer in: %fs" % (time() - t0))
......