Showing 1 changed file with 23 additions and 20 deletions
... | @@ -2,7 +2,8 @@ | ... | @@ -2,7 +2,8 @@ |
2 | 2 | ||
3 | import os | 3 | import os |
4 | from time import time | 4 | from time import time |
5 | -from optparse import OptionParser | 5 | +# from optparse import OptionParser |
6 | +import argparse | ||
6 | import sys | 7 | import sys |
7 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | 8 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
8 | from scipy.sparse import csr_matrix | 9 | from scipy.sparse import csr_matrix |
... | @@ -20,11 +21,13 @@ __author__ = 'CMendezC' | ... | @@ -20,11 +21,13 @@ __author__ = 'CMendezC' |
20 | # 1) Files with vectors. | 21 | # 1) Files with vectors. |
21 | 22 | ||
22 | # Execution: | 23 | # Execution: |
24 | +# python extraccion-caracteristicas-vectorizacion.py | ||
25 | +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences | ||
26 | +# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences | ||
27 | +# --vectorizer b | ||
23 | 28 | ||
24 | -# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py | 29 | +# source activate python3 |
25 | -# --inputPath | 30 | +# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b |
26 | -# --outputPath | ||
27 | -# --vectorizer | ||
28 | 31 | ||
29 | ########################################################### | 32 | ########################################################### |
30 | # MAIN PROGRAM # | 33 | # MAIN PROGRAM # |
... | @@ -32,16 +35,16 @@ __author__ = 'CMendezC' | ... | @@ -32,16 +35,16 @@ __author__ = 'CMendezC' |
32 | 35 | ||
33 | if __name__ == "__main__": | 36 | if __name__ == "__main__": |
34 | # Parameter definition | 37 | # Parameter definition |
35 | - parser = OptionParser() | 38 | + parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.') |
36 | - parser.add_option("--inputPath", dest="inputPath", | 39 | + parser.add_argument("--inputPath", dest="inputPath", required=True, |
37 | help="Path to read input files", metavar="PATH") | 40 | help="Path to read input files", metavar="PATH") |
38 | - parser.add_option("--outputPath", dest="outputPath", | 41 | + parser.add_argument("--outputPath", dest="outputPath", required=True, |
39 | help="Path to place output files", metavar="PATH") | 42 | help="Path to place output files", metavar="PATH") |
40 | - parser.add_option("--vectorizer", dest="vectorizer", | 43 | + parser.add_argument("--vectorizer", dest="vectorizer", required=True, |
41 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", | 44 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", |
42 | choices=('b', 'f', 't'), default='b') | 45 | choices=('b', 'f', 't'), default='b') |
43 | 46 | ||
44 | - (options, args) = parser.parse_args() | 47 | + args = parser.parse_args() |
45 | print(len(args)) | 48 | print(len(args)) |
46 | if len(args) != 3: | 49 | if len(args) != 3: |
47 | parser.error("Some parameters missed.") | 50 | parser.error("Some parameters missed.") |
... | @@ -49,9 +52,9 @@ if __name__ == "__main__": | ... | @@ -49,9 +52,9 @@ if __name__ == "__main__": |
49 | 52 | ||
50 | # Printing parameter values | 53 | # Printing parameter values |
51 | print('-------------------------------- PARAMETERS --------------------------------') | 54 | print('-------------------------------- PARAMETERS --------------------------------') |
52 | - print("Path to read input files: " + str(options.inputPath)) | 55 | + print("Path to read input files: " + str(args.inputPath)) |
53 | - print("Path to place output files: " + str(options.outputPath)) | 56 | + print("Path to place output files: " + str(args.outputPath)) |
54 | - print("Vectorizer: " + str(options.vectorizer)) | 57 | + print("Vectorizer: " + str(args.vectorizer)) |
55 | 58 | ||
56 | # Start time | 59 | # Start time |
57 | t0 = time() | 60 | t0 = time() |
... | @@ -59,19 +62,19 @@ if __name__ == "__main__": | ... | @@ -59,19 +62,19 @@ if __name__ == "__main__": |
59 | print("Reading documents...") | 62 | print("Reading documents...") |
60 | documents = [] | 63 | documents = [] |
61 | # Read documents from input path | 64 | # Read documents from input path |
62 | - for path, dirs, files in os.walk(options.outputPath): | 65 | + for path, dirs, files in os.walk(args.outputPath): |
63 | for file in files: | 66 | for file in files: |
64 | - with open(os.path.join(options.inputPath, file), mode="r", encoding="utf-8") as iFile: | 67 | + with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: |
65 | print("...{}".format(file)) | 68 | print("...{}".format(file)) |
66 | # Add file to document list | 69 | # Add file to document list |
67 | documents.append(iFile.read()) | 70 | documents.append(iFile.read()) |
68 | 71 | ||
69 | # Create vectorizer | 72 | # Create vectorizer |
70 | - print('Vectorizer: {}'.format(options.vectorizer)) | 73 | + print('Vectorizer: {}'.format(args.vectorizer)) |
71 | - if options.vectorizer == "b": | 74 | + if args.vectorizer == "b": |
72 | # Binary vectorizer | 75 | # Binary vectorizer |
73 | vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True) | 76 | vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True) |
74 | - elif options.vectorizer == "f": | 77 | + elif args.vectorizer == "f": |
75 | # Frequency vectorizer | 78 | # Frequency vectorizer |
76 | vectorizer = CountVectorizer(ngram_range=(1, 1)) | 79 | vectorizer = CountVectorizer(ngram_range=(1, 1)) |
77 | else: | 80 | else: |
... | @@ -81,8 +84,8 @@ if __name__ == "__main__": | ... | @@ -81,8 +84,8 @@ if __name__ == "__main__": |
81 | matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double') | 84 | matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double') |
82 | print(' matrix.shape: ', matrix.shape) | 85 | print(' matrix.shape: ', matrix.shape) |
83 | 86 | ||
84 | - with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile: | 87 | + with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile: |
85 | - oFile.write("Vectorizer: {}".format(options.vectorizer)) | 88 | + oFile.write("Vectorizer: {}".format(args.vectorizer)) |
86 | oFile.write(vectorizer.get_feature_names()) | 89 | oFile.write(vectorizer.get_feature_names()) |
87 | oFile.write(matrix) | 90 | oFile.write(matrix) |
88 | 91 | ... | ... |
-
Please register or login to post a comment