# extraccion-caracteristicas-vectorizacion.py (3.64 KB)
# -*- encoding: utf-8 -*-

import os
from time import time
# from optparse import OptionParser
import argparse
import sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix

__author__ = 'CMendezC'

# Goal: Feature extraction, vectorizer and TF-IDF

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to save output files.
# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.

# Output:
# 1) Files with vectors.

# Execution:
# python extraccion-caracteristicas-vectorizacion.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
# --vectorizer b

# source activate python3
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def read_documents(input_path):
    """Return the text of every file found under *input_path*.

    The directory tree is walked recursively and each file is decoded as
    UTF-8. Directories and files are visited in sorted order so that the
    order of the returned documents (and therefore the row order of the
    resulting matrix) is reproducible across runs and file systems.

    Args:
        input_path: Root directory to read documents from.

    Returns:
        A list with one string per file, in visiting order.
    """
    documents = []
    for path, dirs, files in os.walk(input_path):
        # Sorting in place tells os.walk which order to descend in.
        dirs.sort()
        for file_name in sorted(files):
            # Join with the directory currently being walked (``path``), not
            # with the root: otherwise files in sub-directories are not found.
            with open(os.path.join(path, file_name), mode="r", encoding="utf-8") as iFile:
                print("...{}".format(file_name))
                # Add file to document list
                documents.append(iFile.read())
    return documents


def build_vectorizer(kind):
    """Create the scikit-learn vectorizer selected on the command line.

    Args:
        kind: "b" for binary occurrence, "f" for raw term frequency; any
            other value (the parser only lets "t" through) selects TF-IDF.

    Returns:
        An unfitted ``CountVectorizer`` or ``TfidfVectorizer`` over unigrams.
    """
    if kind == "b":
        # Binary vectorizer
        return CountVectorizer(ngram_range=(1, 1), binary=True)
    if kind == "f":
        # Frequency vectorizer
        return CountVectorizer(ngram_range=(1, 1))
    # TF-IDF vectorizer
    return TfidfVectorizer(ngram_range=(1, 1))


def feature_names(vectorizer):
    """Return the fitted vocabulary of *vectorizer* as a list of ``str``.

    ``get_feature_names()`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out()``. The new method is preferred when present
    and the old one is kept as a fallback for older installations. The
    names are converted to plain ``str`` so the written report looks the
    same with either API.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return [str(name) for name in vectorizer.get_feature_names_out()]
    return vectorizer.get_feature_names()


def main(argv=None):
    """Vectorize the documents in ``--inputPath`` and write a text report.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    # Parameter definition
    parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.')
    parser.add_argument("--inputPath", dest="inputPath", required=True,
                        help="Path to read input files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath", required=True,
                        help="Path to place output files", metavar="PATH")
    # The option is required, so a ``default`` would never be used; it is
    # omitted to avoid suggesting otherwise.
    parser.add_argument("--vectorizer", dest="vectorizer", required=True,
                        help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                        choices=('b', 'f', 't'))

    args = parser.parse_args(argv)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("Path to place output files: " + str(args.outputPath))
    print("Vectorizer: " + str(args.vectorizer))

    # Start time
    t0 = time()

    print("Reading documents...")
    # Read documents from input path
    documents = read_documents(args.inputPath)
    print("  Documents: {}".format(len(documents)))
    if not documents:
        # Fitting on an empty corpus would fail inside scikit-learn with a
        # much less helpful "empty vocabulary" error.
        sys.exit("No documents found in: {}".format(args.inputPath))

    # Create vectorizer
    print('  Vectorizer: {}'.format(args.vectorizer))
    vectorizer = build_vectorizer(args.vectorizer)

    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
    print('   matrix.shape: ', matrix.shape)

    # Make sure the report directory exists before trying to write into it.
    os.makedirs(args.outputPath, exist_ok=True)
    report_path = os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer))
    with open(report_path, encoding="utf-8", mode="w") as oFile:
        oFile.write("Vectorizer: {}\n".format(args.vectorizer))
        oFile.write(str(feature_names(vectorizer)))
        oFile.write("\n")
        # NOTE(review): str() of a NumPy array abbreviates large arrays with
        # "..." (default print threshold is 1000 elements), so for big
        # corpora this report shows a summary rather than the full matrix.
        oFile.write(str(matrix.toarray()))

    print("Feature extraction and vectorizer in: %fs" % (time() - t0))


if __name__ == "__main__":
    main()