Carlos-Francisco Méndez-Cruz

Feature extraction and vectorizer three sentences

1 +<?xml version="1.0" encoding="UTF-8"?>
2 +<project version="4">
3 + <component name="VcsDirectoryMappings">
4 + <mapping directory="$PROJECT_DIR$" vcs="Git" />
5 + </component>
6 +</project>
...\ No newline at end of file ...\ No newline at end of file
1 +# -*- encoding: utf-8 -*-
2 +
3 +import os
4 +from time import time
5 +from optparse import OptionParser
6 +import sys
7 +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
8 +from scipy.sparse import csr_matrix
9 +
10 +__author__ = 'CMendezC'
11 +
12 +# Goal: Feature extraction, vectorizer and TF-IDF
13 +
14 +# Parameters:
15 +# 1) --inputPath Path to read input files.
16 +# 2) --outputPath Path to save output files.
17 +# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
18 +
19 +# Output:
20 +# 1) Files with vectors.
21 +
22 +# Execution:
23 +
24 +# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
25 +# --inputPath
26 +# --outputPath
27 +# --vectorizer
28 +
29 +###########################################################
30 +# MAIN PROGRAM #
31 +###########################################################
32 +
def read_documents(input_path):
    """Read every file found under *input_path* as UTF-8 text.

    The directory tree is walked recursively. Directories and files are
    visited in sorted order so that the document order (and therefore the
    row order of the resulting matrix) is reproducible across runs.

    Args:
        input_path: Root directory that holds the input files.

    Returns:
        A list with the full text of each file, one entry per file. The
        list is empty when no file is found.
    """
    documents = []
    for path, dirs, files in os.walk(input_path):
        # Sorting in place steers the order in which os.walk descends.
        dirs.sort()
        for file_name in sorted(files):
            # Join with the directory currently being walked (not the root)
            # so that files inside sub-directories can be opened.
            with open(os.path.join(path, file_name), mode="r", encoding="utf-8") as iFile:
                print("...{}".format(file_name))
                # Add file to document list
                documents.append(iFile.read())
    return documents


def build_vectorizer(kind):
    """Return the unigram vectorizer selected by *kind*.

    Args:
        kind: 'b' for binary occurrence, 'f' for raw frequency; any other
            value (the parser only lets 't' through) selects TF-IDF.

    Returns:
        An unfitted CountVectorizer or TfidfVectorizer.
    """
    if kind == "b":
        # Binary vectorizer
        return CountVectorizer(ngram_range=(1, 1), binary=True)
    if kind == "f":
        # Frequency vectorizer
        return CountVectorizer(ngram_range=(1, 1))
    # TF-IDF vectorizer
    return TfidfVectorizer(ngram_range=(1, 1))


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    parser.add_option("--vectorizer", dest="vectorizer",
                      help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                      choices=('b', 'f', 't'), default='b')

    (options, args) = parser.parse_args()
    # `args` only holds leftover positional arguments, so the options
    # themselves must be validated. --vectorizer has a default and needs no
    # check. parser.error() prints the usage message and exits.
    if not options.inputPath or not options.outputPath:
        parser.error("Some parameters missed.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Vectorizer: " + str(options.vectorizer))

    # Start time
    t0 = time()

    print("Reading documents...")
    # Read documents from input path
    documents = read_documents(options.inputPath)
    if not documents:
        # Fitting a vectorizer on an empty corpus fails with an obscure
        # error; stop early with a clear message instead.
        sys.exit("No documents found in: {}".format(options.inputPath))

    # Create vectorizer
    print('Vectorizer: {}'.format(options.vectorizer))
    vectorizer = build_vectorizer(options.vectorizer)

    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
    print(' matrix.shape: ', matrix.shape)

    # get_feature_names() was removed from recent scikit-learn releases in
    # favour of get_feature_names_out(); support whichever is available.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    os.makedirs(options.outputPath, exist_ok=True)
    report_name = "report-vectorizer.{}.txt".format(options.vectorizer)
    with open(os.path.join(options.outputPath, report_name), encoding="utf-8", mode="w") as oFile:
        # file.write() accepts only str, so everything is rendered as text.
        oFile.write("Vectorizer: {}\n".format(options.vectorizer))
        oFile.write("Features: {}\n".format(" ".join(feature_names)))
        oFile.write("Matrix shape: {}\n".format(matrix.shape))
        # Dump every non-zero entry as "(row, col)<TAB>value"; str(matrix)
        # would abbreviate large matrices.
        entries = matrix.tocoo()
        for row, col, value in zip(entries.row, entries.col, entries.data):
            oFile.write("({}, {})\t{}\n".format(row, col, value))

    print("Feature extraction and vectorizer in: %fs" % (time() - t0))