Feature extraction and vectorization of three sentences

Carlos-Francisco Méndez-Cruz
Commit d3712e47bb291e1546c1d2ebc2101916559a0e04 d3712e47 1 parent 27174a6e
Showing 1 changed file with 6 additions and 0 deletions
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
--- a/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @d3712e4
+++ b/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @d3712e4
@@ -7,6 +7,7 @@ import argparse
 import sys
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from scipy.sparse import csr_matrix
+ from sklearn.metrics.pairwise import cosine_similarity
 
 __author__ = 'CMendezC'
 
@@ -81,10 +82,15 @@ if __name__ == "__main__":
     matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
     print('   matrix.shape: ', matrix.shape)
 
+     similarityMatrix = cosine_similarity(matrix)
+     print("   Cosine similarity matrix shape: {}".format(similarityMatrix.shape))
+ 
     with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
         oFile.write("Vectorizer: {}\n".format(args.vectorizer))
         oFile.write(str(vectorizer.get_feature_names()))
         oFile.write("\n")
         oFile.write(str(matrix.toarray()))
+         oFile.write("\n")
+         oFile.write(str(similarityMatrix.toarray()))
 
     print("Feature extraction and vectorizer in: %fs" % (time() - t0))