Feature extraction and vectorizer three sentences

Carlos-Francisco Méndez-Cruz
Commit c7fdb2f7d276977906f3714e97b6fdb8dc97c0cc c7fdb2f7 0 parents
Showing 2 changed files with 95 additions and 0 deletions
.idea/vcs.xml
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
--- a/.idea/vcs.xml 0 → 100644
View file @c7fdb2f
+++ b/.idea/vcs.xml 0 → 100644
View file @c7fdb2f
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
--- a/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py 0 → 100644
View file @c7fdb2f
+++ b/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py 0 → 100644
View file @c7fdb2f
+# -*- encoding: utf-8 -*-
+
+import os
+from time import time
+from optparse import OptionParser
+import sys
+from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
+from scipy.sparse import csr_matrix
+
+__author__ = 'CMendezC'
+
+# Goal: Feature extraction, vectorizer and TF-IDF
+
+# Parameters:
+# 1) --inputPath Path to read input files.
+# 2) --outputPath Path to save output files.
+# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
+
+# Output:
+# 1) Files with vectors.
+
+# Execution:
+
+# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
+# --inputPath
+# --outputPath
+# --vectorizer
+
+###########################################################
+#                       MAIN PROGRAM                      #
+###########################################################
+
+if __name__ == "__main__":
+    # Script entry point: parse the command-line options, read every file
+    # found under --inputPath as one document, vectorize the collection
+    # (binary, frequency or TF-IDF) and write a plain-text report with the
+    # vocabulary and the document-term matrix into --outputPath.
+
+    # Parameter definition
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath",
+                      help="Path to read input files", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath",
+                      help="Path to place output files", metavar="PATH")
+    parser.add_option("--vectorizer", dest="vectorizer",
+                      help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
+                      choices=('b', 'f', 't'), default='b')
+
+    (options, args) = parser.parse_args()
+    # optparse stores only *positional* arguments in ``args``; the values of
+    # --inputPath/--outputPath live in ``options``, so those are what must be
+    # validated (--vectorizer has a default). parser.error() prints the usage
+    # message and terminates the process itself, so no explicit exit follows.
+    if not options.inputPath or not options.outputPath:
+        parser.error("Some parameters missed.")
+
+    # Printing parameter values
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read input files: " + str(options.inputPath))
+    print("Path to place output files: " + str(options.outputPath))
+    print("Vectorizer: " + str(options.vectorizer))
+
+    # Start time
+    t0 = time()
+
+    print("Reading documents...")
+    documents = []
+    # Read documents from input path (including subdirectories)
+    for path, dirs, files in os.walk(options.inputPath):
+        # Sorting makes the traversal order, and therefore the row order of
+        # the resulting matrix, reproducible between runs.
+        dirs.sort()
+        for fileName in sorted(files):
+            # Join with the directory currently being walked so that files
+            # inside subdirectories are opened from the right location.
+            with open(os.path.join(path, fileName), mode="r", encoding="utf-8") as iFile:
+                print("...{}".format(fileName))
+                # Add file to document list
+                documents.append(iFile.read())
+
+    # fit_transform() fails with an obscure "empty vocabulary" error when
+    # there is nothing to vectorize; stop early with a clear message instead.
+    if not documents:
+        sys.exit("No documents found in: " + str(options.inputPath))
+
+    # Create vectorizer
+    print('Vectorizer: {}'.format(options.vectorizer))
+    if options.vectorizer == "b":
+        # Binary vectorizer
+        vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
+    elif options.vectorizer == "f":
+        # Frequency vectorizer
+        vectorizer = CountVectorizer(ngram_range=(1, 1))
+    else:
+        # TF-IDF vectorizer
+        vectorizer = TfidfVectorizer(ngram_range=(1, 1))
+
+    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
+    print('     matrix.shape: ', matrix.shape)
+
+    # get_feature_names() was replaced by get_feature_names_out() in newer
+    # scikit-learn releases; use whichever one the installed version offers.
+    getFeatureNames = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
+    featureNames = [str(name) for name in getFeatureNames()]
+
+    with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile:
+        oFile.write("Vectorizer: {}\n".format(options.vectorizer))
+        # file.write() accepts only strings, so the vocabulary is written
+        # one feature per line.
+        oFile.write("Features:\n")
+        oFile.write("\n".join(featureNames))
+        oFile.write("\n")
+        # Write every non-zero entry explicitly as "(row, column)<TAB>value";
+        # str(matrix) would truncate large sparse matrices.
+        oFile.write("Matrix {}:\n".format(matrix.shape))
+        cooMatrix = matrix.tocoo()
+        for row, col, value in zip(cooMatrix.row, cooMatrix.col, cooMatrix.data):
+            oFile.write("({}, {})\t{}\n".format(row, col, value))
+
+    print("Feature extraction and vectorizer in: %fs" % (time() - t0))