Showing
2 changed files
with
95 additions
and
0 deletions
.idea/vcs.xml
0 → 100644
| 1 | +# -*- encoding: utf-8 -*- | ||
| 2 | + | ||
| 3 | +import os | ||
| 4 | +from time import time | ||
| 5 | +from optparse import OptionParser | ||
| 6 | +import sys | ||
| 7 | +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | ||
| 8 | +from scipy.sparse import csr_matrix | ||
| 9 | + | ||
| 10 | +__author__ = 'CMendezC' | ||
| 11 | + | ||
| 12 | +# Goal: Feature extraction, vectorizer and TF-IDF | ||
| 13 | + | ||
| 14 | +# Parameters: | ||
| 15 | +# 1) --inputPath Path to read input files. | ||
| 16 | +# 2) --outputPath Path to save output files. | ||
| 17 | +# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. | ||
| 18 | + | ||
| 19 | +# Output: | ||
| 20 | +# 1) Files with vectors. | ||
| 21 | + | ||
| 22 | +# Execution: | ||
| 23 | + | ||
| 24 | +# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py | ||
| 25 | +# --inputPath | ||
| 26 | +# --outputPath | ||
| 27 | +# --vectorizer | ||
| 28 | + | ||
| 29 | +########################################################### | ||
| 30 | +# MAIN PROGRAM # | ||
| 31 | +########################################################### | ||
| 32 | + | ||
if __name__ == "__main__":
    # Script entry point: read every text file under --inputPath, vectorize
    # the documents (binary, frequency or TF-IDF) and write a plain-text
    # report with the vocabulary and the non-zero matrix entries to
    # --outputPath.

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    parser.add_option("--vectorizer", dest="vectorizer",
                      help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                      choices=('b', 'f', 't'), default='b')

    (options, args) = parser.parse_args()
    # optparse stores named options on ``options``; ``args`` only holds the
    # leftover positional arguments, so the required paths must be checked
    # on ``options``. parser.error() prints the usage and exits by itself.
    if not options.inputPath or not options.outputPath:
        parser.error("Some parameters missed.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Vectorizer: " + str(options.vectorizer))

    # Start time
    t0 = time()

    print("Reading documents...")
    documents = []
    # Read documents from the INPUT path (sub-directories included).
    for path, dirs, files in os.walk(options.inputPath):
        # Sorting in place makes the traversal, and therefore the row order
        # of the resulting matrix, deterministic across runs.
        dirs.sort()
        for file_name in sorted(files):
            # Join with the directory currently being walked so that files
            # found in sub-directories are opened from the right location.
            with open(os.path.join(path, file_name), mode="r", encoding="utf-8") as iFile:
                print("...{}".format(file_name))
                # Add file content to document list
                documents.append(iFile.read())

    if not documents:
        # fit_transform() on an empty corpus fails with an opaque error;
        # stop early with a message that names the actual problem.
        sys.exit("No documents found in: {}".format(options.inputPath))

    # Create vectorizer
    print('Vectorizer: {}'.format(options.vectorizer))
    if options.vectorizer == "b":
        # Binary vectorizer
        vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
    elif options.vectorizer == "f":
        # Frequency vectorizer
        vectorizer = CountVectorizer(ngram_range=(1, 1))
    else:
        # TF-IDF vectorizer
        vectorizer = TfidfVectorizer(ngram_range=(1, 1))

    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
    print('   matrix.shape: ', matrix.shape)

    # get_feature_names() was replaced by get_feature_names_out() in newer
    # scikit-learn releases; support whichever one is available.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    os.makedirs(options.outputPath, exist_ok=True)
    report_name = "report-vectorizer.{}.txt".format(options.vectorizer)
    with open(os.path.join(options.outputPath, report_name), encoding="utf-8", mode="w") as oFile:
        # file.write() only accepts strings, so every value is rendered
        # explicitly instead of passing the list / sparse matrix objects.
        oFile.write("Vectorizer: {}\n".format(options.vectorizer))
        oFile.write("Features:\n")
        oFile.write("\n".join(str(name) for name in feature_names))
        oFile.write("\n")
        oFile.write("Matrix shape: {}\n".format(matrix.shape))
        # Write every non-zero entry as "(row, col)<TAB>value"; str(matrix)
        # would truncate large matrices.
        entries = matrix.tocoo()
        for row, col, value in zip(entries.row, entries.col, entries.data):
            oFile.write("({}, {})\t{}\n".format(row, col, value))

    print("Feature extraction and vectorizer in: %fs" % (time() - t0))
-
Please register or login to post a comment