Showing
2 changed files
with
95 additions
and
0 deletions
.idea/vcs.xml
0 → 100644
1 | +# -*- encoding: utf-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from time import time | ||
5 | +from optparse import OptionParser | ||
6 | +import sys | ||
7 | +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | ||
8 | +from scipy.sparse import csr_matrix | ||
9 | + | ||
10 | +__author__ = 'CMendezC' | ||
11 | + | ||
12 | +# Goal: Feature extraction, vectorizer and TF-IDF | ||
13 | + | ||
14 | +# Parameters: | ||
15 | +# 1) --inputPath Path to read input files. | ||
16 | +# 2) --outputPath Path to save output files. | ||
17 | +# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. | ||
18 | + | ||
19 | +# Output: | ||
20 | +# 1) Files with vectors. | ||
21 | + | ||
22 | +# Execution: | ||
23 | + | ||
24 | +# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py | ||
25 | +# --inputPath | ||
26 | +# --outputPath | ||
27 | +# --vectorizer | ||
28 | + | ||
29 | +########################################################### | ||
30 | +# MAIN PROGRAM # | ||
31 | +########################################################### | ||
32 | + | ||
def read_documents(input_path):
    """Return the text of every file found under *input_path*.

    The tree is walked top-down and both directories and files are visited in
    sorted order, so the order of the returned documents (and therefore the
    row order of the matrix built from them) is the same on every run.

    Files are decoded as UTF-8; a file that cannot be opened or decoded raises
    the usual OSError / UnicodeDecodeError.
    """
    documents = []
    for path, dirs, files in os.walk(input_path):
        # Sorting in place also fixes the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            # Join with the directory currently being visited (not the root),
            # otherwise files inside sub-directories cannot be opened.
            with open(os.path.join(path, file_name), mode="r", encoding="utf-8") as iFile:
                print("...{}".format(file_name))
                # Add file to document list
                documents.append(iFile.read())
    return documents


def build_vectorizer(kind):
    """Return an unfitted unigram vectorizer for the one-letter code *kind*.

    'b' gives a binary (presence/absence) vectorizer, 'f' a term-frequency
    vectorizer, and any other value a TF-IDF vectorizer.
    """
    if kind == "b":
        # Binary vectorizer
        return CountVectorizer(ngram_range=(1, 1), binary=True)
    if kind == "f":
        # Frequency vectorizer
        return CountVectorizer(ngram_range=(1, 1))
    # TF-IDF vectorizer
    return TfidfVectorizer(ngram_range=(1, 1))


def write_report(output_path, kind, vectorizer, matrix):
    """Write the vectorizer report to *output_path* and return the file path.

    The report holds the vectorizer code, the feature names, the matrix shape
    and one "(row, column)<TAB>value" line per stored matrix entry.
    """
    # Newer scikit-learn releases expose get_feature_names_out(); fall back to
    # the older get_feature_names() when the newer method is not available.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    report_path = os.path.join(output_path, "report-vectorizer.{}.txt".format(kind))
    # Iterate the stored entries explicitly: file.write() only accepts text,
    # and str() of a large sparse matrix abbreviates its contents.
    entries = matrix.tocoo()
    with open(report_path, encoding="utf-8", mode="w") as oFile:
        oFile.write("Vectorizer: {}\n".format(kind))
        oFile.write("Features: {}\n".format(" ".join(str(name) for name in feature_names)))
        oFile.write("Matrix shape: {}\n".format(matrix.shape))
        for row, col, value in zip(entries.row, entries.col, entries.data):
            oFile.write("({}, {})\t{}\n".format(int(row), int(col), float(value)))
    return report_path


def main(argv=None):
    """Parse the command line, vectorize the input documents and write the report.

    *argv* is the list of command-line arguments; None means sys.argv[1:].
    Exits through OptionParser.error() (status 2) when a required option is
    missing, and through sys.exit() when no input document is found.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    parser.add_option("--vectorizer", dest="vectorizer",
                      help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                      choices=('b', 'f', 't'), default='b')

    (options, args) = parser.parse_args(argv)
    # `args` only holds leftover positional arguments, so the option values
    # themselves must be checked. --vectorizer has a default and is optional.
    if options.inputPath is None or options.outputPath is None:
        parser.error("Some parameters missed: --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Vectorizer: " + str(options.vectorizer))

    # Start time
    t0 = time()

    # Read documents from input path
    print("Reading documents...")
    documents = read_documents(options.inputPath)
    if not documents:
        # Fail with a clear message instead of an error raised while fitting.
        sys.exit("No input files found under: {}".format(options.inputPath))

    # Create vectorizer
    print('Vectorizer: {}'.format(options.vectorizer))
    vectorizer = build_vectorizer(options.vectorizer)

    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
    print('   matrix.shape: ', matrix.shape)

    # Make sure the destination exists before writing the report into it.
    os.makedirs(options.outputPath, exist_ok=True)
    write_report(options.outputPath, options.vectorizer, vectorizer, matrix)

    print("Feature extraction and vectorizer in: %fs" % (time() - t0))


###########################################################
#                       MAIN PROGRAM                      #
###########################################################

if __name__ == "__main__":
    main()
-
Please register or login to post a comment