Carlos-Francisco Méndez-Cruz

Feature extraction and vectorizer three sentences

...@@ -7,6 +7,7 @@ import argparse ...@@ -7,6 +7,7 @@ import argparse
7 import sys 7 import sys
8 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 8 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
9 from scipy.sparse import csr_matrix 9 from scipy.sparse import csr_matrix
10 +from sklearn.metrics.pairwise import cosine_similarity
10 11
11 __author__ = 'CMendezC' 12 __author__ = 'CMendezC'
12 13
...@@ -81,10 +82,15 @@ if __name__ == "__main__": ...@@ -81,10 +82,15 @@ if __name__ == "__main__":
81 matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double') 82 matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
82 print(' matrix.shape: ', matrix.shape) 83 print(' matrix.shape: ', matrix.shape)
83 84
85 + similarityMatrix = cosine_similarity(matrix)
86 + print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape))
87 +
84 with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile: 88 with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
85 oFile.write("Vectorizer: {}\n".format(args.vectorizer)) 89 oFile.write("Vectorizer: {}\n".format(args.vectorizer))
86 oFile.write(str(vectorizer.get_feature_names())) 90 oFile.write(str(vectorizer.get_feature_names()))
87 oFile.write("\n") 91 oFile.write("\n")
88 oFile.write(str(matrix.toarray())) 92 oFile.write(str(matrix.toarray()))
93 + oFile.write("\n")
94 + oFile.write(str(similarityMatrix))
89 95
90 print("Feature extraction and vectorizer in: %fs" % (time() - t0)) 96 print("Feature extraction and vectorizer in: %fs" % (time() - t0))
......