Carlos-Francisco Méndez-Cruz

Feature extraction and vectorization of three sentences

...@@ -2,7 +2,8 @@ ...@@ -2,7 +2,8 @@
2 2
3 import os 3 import os
4 from time import time 4 from time import time
5 -from optparse import OptionParser 5 +# from optparse import OptionParser
6 +import argparse
6 import sys 7 import sys
7 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 8 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
8 from scipy.sparse import csr_matrix 9 from scipy.sparse import csr_matrix
...@@ -20,11 +21,13 @@ __author__ = 'CMendezC' ...@@ -20,11 +21,13 @@ __author__ = 'CMendezC'
20 # 1) Files with vectors. 21 # 1) Files with vectors.
21 22
22 # Execution: 23 # Execution:
24 +# python extraccion-caracteristicas-vectorizacion.py
25 +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
26 +# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
27 +# --vectorizer b
23 28
24 -# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py 29 +# source activate python3
25 -# --inputPath 30 +# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
26 -# --outputPath
27 -# --vectorizer
28 31
29 ########################################################### 32 ###########################################################
30 # MAIN PROGRAM # 33 # MAIN PROGRAM #
...@@ -32,16 +35,16 @@ __author__ = 'CMendezC' ...@@ -32,16 +35,16 @@ __author__ = 'CMendezC'
32 35
33 if __name__ == "__main__": 36 if __name__ == "__main__":
34 # Parameter definition 37 # Parameter definition
35 - parser = OptionParser() 38 + parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.')
36 - parser.add_option("--inputPath", dest="inputPath", 39 + parser.add_argument("--inputPath", dest="inputPath", required=True,
37 help="Path to read input files", metavar="PATH") 40 help="Path to read input files", metavar="PATH")
38 - parser.add_option("--outputPath", dest="outputPath", 41 + parser.add_argument("--outputPath", dest="outputPath", required=True,
39 help="Path to place output files", metavar="PATH") 42 help="Path to place output files", metavar="PATH")
40 - parser.add_option("--vectorizer", dest="vectorizer", 43 + parser.add_argument("--vectorizer", dest="vectorizer", required=True,
41 help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", 44 help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
42 choices=('b', 'f', 't'), default='b') 45 choices=('b', 'f', 't'), default='b')
43 46
44 - (options, args) = parser.parse_args() 47 + args = parser.parse_args()
45 print(len(args)) 48 print(len(args))
46 if len(args) != 3: 49 if len(args) != 3:
47 parser.error("Some parameters missed.") 50 parser.error("Some parameters missed.")
...@@ -49,9 +52,9 @@ if __name__ == "__main__": ...@@ -49,9 +52,9 @@ if __name__ == "__main__":
49 52
50 # Printing parameter values 53 # Printing parameter values
51 print('-------------------------------- PARAMETERS --------------------------------') 54 print('-------------------------------- PARAMETERS --------------------------------')
52 - print("Path to read input files: " + str(options.inputPath)) 55 + print("Path to read input files: " + str(args.inputPath))
53 - print("Path to place output files: " + str(options.outputPath)) 56 + print("Path to place output files: " + str(args.outputPath))
54 - print("Vectorizer: " + str(options.vectorizer)) 57 + print("Vectorizer: " + str(args.vectorizer))
55 58
56 # Start time 59 # Start time
57 t0 = time() 60 t0 = time()
...@@ -59,19 +62,19 @@ if __name__ == "__main__": ...@@ -59,19 +62,19 @@ if __name__ == "__main__":
59 print("Reading documents...") 62 print("Reading documents...")
60 documents = [] 63 documents = []
61 # Read documents from input path 64 # Read documents from input path
62 - for path, dirs, files in os.walk(options.outputPath): 65 + for path, dirs, files in os.walk(args.outputPath):
63 for file in files: 66 for file in files:
64 - with open(os.path.join(options.inputPath, file), mode="r", encoding="utf-8") as iFile: 67 + with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
65 print("...{}".format(file)) 68 print("...{}".format(file))
66 # Add file to document list 69 # Add file to document list
67 documents.append(iFile.read()) 70 documents.append(iFile.read())
68 71
69 # Create vectorizer 72 # Create vectorizer
70 - print('Vectorizer: {}'.format(options.vectorizer)) 73 + print('Vectorizer: {}'.format(args.vectorizer))
71 - if options.vectorizer == "b": 74 + if args.vectorizer == "b":
72 # Binary vectorizer 75 # Binary vectorizer
73 vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True) 76 vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
74 - elif options.vectorizer == "f": 77 + elif args.vectorizer == "f":
75 # Frequency vectorizer 78 # Frequency vectorizer
76 vectorizer = CountVectorizer(ngram_range=(1, 1)) 79 vectorizer = CountVectorizer(ngram_range=(1, 1))
77 else: 80 else:
...@@ -81,8 +84,8 @@ if __name__ == "__main__": ...@@ -81,8 +84,8 @@ if __name__ == "__main__":
81 matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double') 84 matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
82 print(' matrix.shape: ', matrix.shape) 85 print(' matrix.shape: ', matrix.shape)
83 86
84 - with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile: 87 + with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
85 - oFile.write("Vectorizer: {}".format(options.vectorizer)) 88 + oFile.write("Vectorizer: {}".format(args.vectorizer))
86 oFile.write(vectorizer.get_feature_names()) 89 oFile.write(vectorizer.get_feature_names())
87 oFile.write(matrix) 90 oFile.write(matrix)
88 91
......