Feature extraction and vectorizer three sentences

Carlos-Francisco Méndez-Cruz
Commit c7fdb2f7d276977906f3714e97b6fdb8dc97c0cc c7fdb2f7 0 parents
Showing 2 changed files with 95 additions and 0 deletions
.idea/vcs.xml
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
--- a/.idea/vcs.xml 0 → 100644
View file @c7fdb2f
+++ b/.idea/vcs.xml 0 → 100644
View file @c7fdb2f
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="$PROJECT_DIR$" vcs="Git" />
+   </component>
+ </project>
\ No newline at end of file
--- a/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py 0 → 100644
View file @c7fdb2f
+++ b/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py 0 → 100644
View file @c7fdb2f
+ # -*- encoding: utf-8 -*-
+ 
+ import os
+ from time import time
+ from optparse import OptionParser
+ import sys
+ from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+ from scipy.sparse import csr_matrix
+ 
+ __author__ = 'CMendezC'
+ 
+ # Goal: Feature extraction, vectorizer and TF-IDF
+ 
+ # Parameters:
+ # 1) --inputPath Path to read input files.
+ # 2) --outputPath Path to save output files.
+ # 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
+ 
+ # Output:
+ # 1) Files with vectors.
+ 
+ # Execution:
+ 
+ # C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
+ # --inputPath
+ # --outputPath
+ # --vectorizer
+ 
+ ###########################################################
+ #                       MAIN PROGRAM                      #
+ ###########################################################
+ 
+ if __name__ == "__main__":
+     # Parameter definition
+     parser = OptionParser()
+     parser.add_option("--inputPath", dest="inputPath",
+                       help="Path to read input files", metavar="PATH")
+     parser.add_option("--outputPath", dest="outputPath",
+                           help="Path to place output files", metavar="PATH")
+     parser.add_option("--vectorizer", dest="vectorizer",
+                       help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
+                       choices=('b', 'f', 't'), default='b')
+ 
+     (options, args) = parser.parse_args()
+     print(len(args))
+     if len(args) != 3:
+         parser.error("Some parameters missed.")
+         sys.exit(1)
+ 
+     # Printing parameter values
+     print('-------------------------------- PARAMETERS --------------------------------')
+     print("Path to read input files: " + str(options.inputPath))
+     print("Path to place output files: " + str(options.outputPath))
+     print("Vectorizer: " + str(options.vectorizer))
+ 
+     # Start time
+     t0 = time()
+ 
+     print("Reading documents...")
+     documents = []
+     # Read documents from input path
+     for path, dirs, files in os.walk(options.outputPath):
+         for file in files:
+             with open(os.path.join(options.inputPath, file), mode="r", encoding="utf-8") as iFile:
+                 print("...{}".format(file))
+                 # Add file to document list
+                 documents.append(iFile.read())
+ 
+     # Create vectorizer
+     print('Vectorizer: {}'.format(options.vectorizer))
+     if options.vectorizer == "b":
+         # Binary vectorizer
+         vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
+     elif options.vectorizer == "f":
+         # Frequency vectorizer
+         vectorizer = CountVectorizer(ngram_range=(1, 1))
+     else:
+         # Binary vectorizer
+         vectorizer = TfidfVectorizer(ngram_range=(1, 1))
+ 
+     matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
+     print('     matrix.shape: ', matrix.shape)
+ 
+     with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile:
+         oFile.write("Vectorizer: {}".format(options.vectorizer))
+         oFile.write(vectorizer.get_feature_names())
+         oFile.write(matrix)
+ 
+     print("Feature extraction and vectorizer in: %fs" % (time() - t0))