Feature extraction and vectorization for the three-sentence data set

Carlos-Francisco Méndez-Cruz
Commit cf9f770f8606fa016d74988a46ed77ee77431556 cf9f770f 1 parent c7fdb2f7
Showing 1 changed file with 23 additions and 20 deletions
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
--- a/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @cf9f770
+++ b/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @cf9f770
@@ -2,7 +2,8 @@
 
 import os
 from time import time
- from optparse import OptionParser
+ # from optparse import OptionParser
+ import argparse
 import sys
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from scipy.sparse import csr_matrix
@@ -20,11 +21,13 @@ __author__ = 'CMendezC'
 # 1) Files with vectors.
 
 # Execution:
+ # python extraccion-caracteristicas-vectorizacion.py
+ # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
+ # --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
+ # --vectorizer b
 
- # C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
- # --inputPath
- # --outputPath
- # --vectorizer
+ # source activate python3
+ # python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
 
 ###########################################################
 #                       MAIN PROGRAM                      #
@@ -32,16 +35,16 @@ __author__ = 'CMendezC'
 
 if __name__ == "__main__":
     # Parameter definition
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
+     parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.')
+     parser.add_argument("--inputPath", dest="inputPath", required=True,
                       help="Path to read input files", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
+     parser.add_argument("--outputPath", dest="outputPath", required=True,
                           help="Path to place output files", metavar="PATH")
-     parser.add_option("--vectorizer", dest="vectorizer",
+     parser.add_argument("--vectorizer", dest="vectorizer", required=True,
                       help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                       choices=('b', 'f', 't'), default='b')
 
-     (options, args) = parser.parse_args()
+     args = parser.parse_args()
     print(len(args))
     if len(args) != 3:
         parser.error("Some parameters missed.")
@@ -49,9 +52,9 @@ if __name__ == "__main__":
 
     # Printing parameter values
     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path to read input files: " + str(options.inputPath))
-     print("Path to place output files: " + str(options.outputPath))
-     print("Vectorizer: " + str(options.vectorizer))
+     print("Path to read input files: " + str(args.inputPath))
+     print("Path to place output files: " + str(args.outputPath))
+     print("Vectorizer: " + str(args.vectorizer))
 
     # Start time
     t0 = time()
@@ -59,19 +62,19 @@ if __name__ == "__main__":
     print("Reading documents...")
     documents = []
     # Read documents from input path
-     for path, dirs, files in os.walk(options.outputPath):
+     for path, dirs, files in os.walk(args.outputPath):
         for file in files:
-             with open(os.path.join(options.inputPath, file), mode="r", encoding="utf-8") as iFile:
+             with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
                 print("...{}".format(file))
                 # Add file to document list
                 documents.append(iFile.read())
 
     # Create vectorizer
-     print('Vectorizer: {}'.format(options.vectorizer))
-     if options.vectorizer == "b":
+     print('Vectorizer: {}'.format(args.vectorizer))
+     if args.vectorizer == "b":
         # Binary vectorizer
         vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
-     elif options.vectorizer == "f":
+     elif args.vectorizer == "f":
         # Frequency vectorizer
         vectorizer = CountVectorizer(ngram_range=(1, 1))
     else:
@@ -81,8 +84,8 @@ if __name__ == "__main__":
     matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
     print('     matrix.shape: ', matrix.shape)
 
-     with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile:
-         oFile.write("Vectorizer: {}".format(options.vectorizer))
+     with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
+         oFile.write("Vectorizer: {}".format(args.vectorizer))
         oFile.write(vectorizer.get_feature_names())
         oFile.write(matrix)