# File: extraccion-caracteristicas-vectorizacion.py

# -*- encoding: utf-8 -*-

import os
from time import time
from optparse import OptionParser
import sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

__author__ = 'CMendezC'

# Goal: Feature extraction, vectorizer and TF-IDF

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to save output files.
# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.

# Output:
# 1) Files with vectors.

# Execution:

# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
# --inputPath
# --outputPath
# --vectorizer

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def read_documents(input_path):
    """Return the text of every file found under *input_path*.

    The directory tree is walked recursively. Directories and files are
    visited in sorted order so that the order of the returned documents
    (and therefore the row order of the resulting matrix) is reproducible.

    Args:
        input_path: Root directory containing the UTF-8 encoded documents.

    Returns:
        A list with the full text of each file, one entry per file.
    """
    documents = []
    for path, dirs, files in os.walk(input_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for name in sorted(files):
            # Join with the directory being walked (not the root) so that
            # files in sub-directories are opened correctly.
            with open(os.path.join(path, name), mode="r", encoding="utf-8") as i_file:
                print("...{}".format(name))
                documents.append(i_file.read())
    return documents


def build_vectorizer(kind):
    """Create the vectorizer selected by *kind*.

    Args:
        kind: 'b' for binary, 'f' for frequency, 't' for tf-idf.

    Returns:
        An unfitted unigram vectorizer.

    Raises:
        ValueError: If *kind* is not one of 'b', 'f' or 't'.
    """
    if kind == "b":
        # Binary vectorizer
        return CountVectorizer(ngram_range=(1, 1), binary=True)
    if kind == "f":
        # Frequency vectorizer
        return CountVectorizer(ngram_range=(1, 1))
    if kind == "t":
        # TF-IDF vectorizer
        return TfidfVectorizer(ngram_range=(1, 1))
    raise ValueError("Unknown vectorizer: {!r}".format(kind))


def write_report(report_path, kind, feature_names, matrix):
    """Write the vectorizer kind, feature names and matrix to a text file.

    Args:
        report_path: Path of the report file to create (overwritten).
        kind: Vectorizer code used ('b', 'f' or 't').
        feature_names: Iterable of feature names, in column order.
        matrix: Sparse document-term matrix (must provide ``tocoo()``).
    """
    # Coordinate format exposes every non-zero entry as (row, col, value),
    # which avoids the truncation applied by str() on large sparse matrices.
    coo = matrix.tocoo()
    with open(report_path, mode="w", encoding="utf-8") as o_file:
        o_file.write("Vectorizer: {}\n".format(kind))
        o_file.write("Matrix shape: {} x {}\n".format(matrix.shape[0], matrix.shape[1]))
        o_file.write("Features:\n")
        o_file.writelines("{}\n".format(name) for name in feature_names)
        o_file.write("Matrix (row, column, value):\n")
        o_file.writelines(
            "{}\t{}\t{}\n".format(row, col, value)
            for row, col, value in zip(coo.row, coo.col, coo.data)
        )


def main():
    """Parse the command line, vectorize the documents and write the report."""
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    parser.add_option("--vectorizer", dest="vectorizer",
                      help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                      choices=('b', 'f', 't'), default='b')

    (options, args) = parser.parse_args()
    # `args` only holds leftover positional arguments, so the required
    # parameters must be validated on `options` (--vectorizer has a default).
    missing = [flag for flag, value in (("--inputPath", options.inputPath),
                                        ("--outputPath", options.outputPath))
               if not value]
    if missing:
        # parser.error() prints the usage message and exits with status 2.
        parser.error("Some parameters missed: " + ", ".join(missing))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Vectorizer: " + str(options.vectorizer))

    # Start time
    t0 = time()

    print("Reading documents...")
    documents = read_documents(options.inputPath)
    if not documents:
        # fit_transform() cannot build a vocabulary from zero documents.
        sys.exit("No documents found in: " + str(options.inputPath))

    # Create vectorizer
    print('Vectorizer: {}'.format(options.vectorizer))
    vectorizer = build_vectorizer(options.vectorizer)

    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
    print('     matrix.shape: ', matrix.shape)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old versions that lack it.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    report_path = os.path.join(options.outputPath,
                               "report-vectorizer.{}.txt".format(options.vectorizer))
    write_report(report_path, options.vectorizer, get_names(), matrix)

    print("Feature extraction and vectorizer in: %fs" % (time() - t0))


if __name__ == "__main__":
    main()