Showing 1 changed file with 23 additions and 20 deletions
| ... | @@ -2,7 +2,8 @@ | ... | @@ -2,7 +2,8 @@ |
| 2 | 2 | ||
| 3 | import os | 3 | import os |
| 4 | from time import time | 4 | from time import time |
| 5 | -from optparse import OptionParser | 5 | +# from optparse import OptionParser |
| 6 | +import argparse | ||
| 6 | import sys | 7 | import sys |
| 7 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | 8 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
| 8 | from scipy.sparse import csr_matrix | 9 | from scipy.sparse import csr_matrix |
| ... | @@ -20,11 +21,13 @@ __author__ = 'CMendezC' | ... | @@ -20,11 +21,13 @@ __author__ = 'CMendezC' |
| 20 | # 1) Files with vectors. | 21 | # 1) Files with vectors. |
| 21 | 22 | ||
| 22 | # Execution: | 23 | # Execution: |
| 24 | +# python extraccion-caracteristicas-vectorizacion.py | ||
| 25 | +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences | ||
| 26 | +# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences | ||
| 27 | +# --vectorizer b | ||
| 23 | 28 | ||
| 24 | -# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py | 29 | +# source activate python3 |
| 25 | -# --inputPath | 30 | +# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b |
| 26 | -# --outputPath | ||
| 27 | -# --vectorizer | ||
| 28 | 31 | ||
| 29 | ########################################################### | 32 | ########################################################### |
| 30 | # MAIN PROGRAM # | 33 | # MAIN PROGRAM # |
| ... | @@ -32,16 +35,16 @@ __author__ = 'CMendezC' | ... | @@ -32,16 +35,16 @@ __author__ = 'CMendezC' |
| 32 | 35 | ||
| 33 | if __name__ == "__main__": | 36 | if __name__ == "__main__": |
| 34 | # Parameter definition | 37 | # Parameter definition |
| 35 | - parser = OptionParser() | 38 | + parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.') |
| 36 | - parser.add_option("--inputPath", dest="inputPath", | 39 | + parser.add_argument("--inputPath", dest="inputPath", required=True, |
| 37 | help="Path to read input files", metavar="PATH") | 40 | help="Path to read input files", metavar="PATH") |
| 38 | - parser.add_option("--outputPath", dest="outputPath", | 41 | + parser.add_argument("--outputPath", dest="outputPath", required=True, |
| 39 | help="Path to place output files", metavar="PATH") | 42 | help="Path to place output files", metavar="PATH") |
| 40 | - parser.add_option("--vectorizer", dest="vectorizer", | 43 | + parser.add_argument("--vectorizer", dest="vectorizer", required=True, |
| 41 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", | 44 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", |
| 42 | choices=('b', 'f', 't'), default='b') | 45 | choices=('b', 'f', 't'), default='b') |
| 43 | 46 | ||
| 44 | - (options, args) = parser.parse_args() | 47 | + args = parser.parse_args() |
| 45 | print(len(args)) | 48 | print(len(args)) |
| 46 | if len(args) != 3: | 49 | if len(args) != 3: |
| 47 | parser.error("Some parameters missed.") | 50 | parser.error("Some parameters missed.") |
| ... | @@ -49,9 +52,9 @@ if __name__ == "__main__": | ... | @@ -49,9 +52,9 @@ if __name__ == "__main__": |
| 49 | 52 | ||
| 50 | # Printing parameter values | 53 | # Printing parameter values |
| 51 | print('-------------------------------- PARAMETERS --------------------------------') | 54 | print('-------------------------------- PARAMETERS --------------------------------') |
| 52 | - print("Path to read input files: " + str(options.inputPath)) | 55 | + print("Path to read input files: " + str(args.inputPath)) |
| 53 | - print("Path to place output files: " + str(options.outputPath)) | 56 | + print("Path to place output files: " + str(args.outputPath)) |
| 54 | - print("Vectorizer: " + str(options.vectorizer)) | 57 | + print("Vectorizer: " + str(args.vectorizer)) |
| 55 | 58 | ||
| 56 | # Start time | 59 | # Start time |
| 57 | t0 = time() | 60 | t0 = time() |
| ... | @@ -59,19 +62,19 @@ if __name__ == "__main__": | ... | @@ -59,19 +62,19 @@ if __name__ == "__main__": |
| 59 | print("Reading documents...") | 62 | print("Reading documents...") |
| 60 | documents = [] | 63 | documents = [] |
| 61 | # Read documents from input path | 64 | # Read documents from input path |
| 62 | - for path, dirs, files in os.walk(options.outputPath): | 65 | + for path, dirs, files in os.walk(args.outputPath): |
| 63 | for file in files: | 66 | for file in files: |
| 64 | - with open(os.path.join(options.inputPath, file), mode="r", encoding="utf-8") as iFile: | 67 | + with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: |
| 65 | print("...{}".format(file)) | 68 | print("...{}".format(file)) |
| 66 | # Add file to document list | 69 | # Add file to document list |
| 67 | documents.append(iFile.read()) | 70 | documents.append(iFile.read()) |
| 68 | 71 | ||
| 69 | # Create vectorizer | 72 | # Create vectorizer |
| 70 | - print('Vectorizer: {}'.format(options.vectorizer)) | 73 | + print('Vectorizer: {}'.format(args.vectorizer)) |
| 71 | - if options.vectorizer == "b": | 74 | + if args.vectorizer == "b": |
| 72 | # Binary vectorizer | 75 | # Binary vectorizer |
| 73 | vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True) | 76 | vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True) |
| 74 | - elif options.vectorizer == "f": | 77 | + elif args.vectorizer == "f": |
| 75 | # Frequency vectorizer | 78 | # Frequency vectorizer |
| 76 | vectorizer = CountVectorizer(ngram_range=(1, 1)) | 79 | vectorizer = CountVectorizer(ngram_range=(1, 1)) |
| 77 | else: | 80 | else: |
| ... | @@ -81,8 +84,8 @@ if __name__ == "__main__": | ... | @@ -81,8 +84,8 @@ if __name__ == "__main__": |
| 81 | matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double') | 84 | matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double') |
| 82 | print(' matrix.shape: ', matrix.shape) | 85 | print(' matrix.shape: ', matrix.shape) |
| 83 | 86 | ||
| 84 | - with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile: | 87 | + with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile: |
| 85 | - oFile.write("Vectorizer: {}".format(options.vectorizer)) | 88 | + oFile.write("Vectorizer: {}".format(args.vectorizer)) |
| 86 | oFile.write(vectorizer.get_feature_names()) | 89 | oFile.write(vectorizer.get_feature_names()) |
| 87 | oFile.write(matrix) | 90 | oFile.write(matrix) |
| 88 | 91 | ... | ... |
-
Please register or log in to post a comment