Feature extraction and vectorizer three sentences

Carlos-Francisco Méndez-Cruz
Commit cf9f770f8606fa016d74988a46ed77ee77431556 cf9f770f 1 parent c7fdb2f7
Showing 1 changed file with 23 additions and 20 deletions
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
--- a/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @cf9f770
+++ b/representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @cf9f770
@@ -2,7 +2,8 @@
 import os
 from time import time
-from optparse import OptionParser
+# from optparse import OptionParser
+import argparse
 import sys
 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
 from scipy.sparse import csr_matrix
@@ -20,11 +21,13 @@ __author__ = 'CMendezC'
 # 1) Files with vectors.
 # Execution:
+# python extraccion-caracteristicas-vectorizacion.py
+# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
+# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
+# --vectorizer b
-# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
+# source activate python3
-# --inputPath
+# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
-# --outputPath
-# --vectorizer
 ###########################################################
 #                       MAIN PROGRAM                      #
@@ -32,16 +35,16 @@ __author__ = 'CMendezC'
 if __name__ == "__main__":
     # Parameter definition
-    parser = OptionParser()
+    parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.')
-    parser.add_option("--inputPath", dest="inputPath",
+    parser.add_argument("--inputPath", dest="inputPath", required=True,
                       help="Path to read input files", metavar="PATH")
-    parser.add_option("--outputPath", dest="outputPath",
+    parser.add_argument("--outputPath", dest="outputPath", required=True,
                           help="Path to place output files", metavar="PATH")
-    parser.add_option("--vectorizer", dest="vectorizer",
+    parser.add_argument("--vectorizer", dest="vectorizer", required=True,
                       help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                       choices=('b', 'f', 't'), default='b')
-    (options, args) = parser.parse_args()
+    # parse_args() returns a Namespace, which has no len(); argparse already
+    # exits with a usage error when a required option is missing, so the old
+    # positional-count check is dropped.
+    args = parser.parse_args()
-    print(len(args))
-    if len(args) != 3:
-        parser.error("Some parameters missed.")
@@ -49,9 +52,9 @@ if __name__ == "__main__":
     # Printing parameter values
     print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path to read input files: " + str(options.inputPath))
+    print("Path to read input files: " + str(args.inputPath))
-    print("Path to place output files: " + str(options.outputPath))
+    print("Path to place output files: " + str(args.outputPath))
-    print("Vectorizer: " + str(options.vectorizer))
+    print("Vectorizer: " + str(args.vectorizer))
     # Start time
     t0 = time()
@@ -59,19 +62,19 @@ if __name__ == "__main__":
     print("Reading documents...")
     documents = []
     # Read documents from input path
-    for path, dirs, files in os.walk(options.outputPath):
+    # Walk the input directory (not the output one) and join each name with
+    # the directory os.walk is currently visiting, so nested files resolve.
+    for path, dirs, files in os.walk(args.inputPath):
         for file in files:
-            with open(os.path.join(options.inputPath, file), mode="r", encoding="utf-8") as iFile:
+            with open(os.path.join(path, file), mode="r", encoding="utf-8") as iFile:
                 print("...{}".format(file))
                 # Add file to document list
                 documents.append(iFile.read())
     # Create vectorizer
-    print('Vectorizer: {}'.format(options.vectorizer))
+    print('Vectorizer: {}'.format(args.vectorizer))
-    if options.vectorizer == "b":
+    if args.vectorizer == "b":
         # Binary vectorizer
         vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
-    elif options.vectorizer == "f":
+    elif args.vectorizer == "f":
         # Frequency vectorizer
         vectorizer = CountVectorizer(ngram_range=(1, 1))
     else:
@@ -81,8 +84,8 @@ if __name__ == "__main__":
     matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
     print('     matrix.shape: ', matrix.shape)
-    with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile:
+    with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
-        oFile.write("Vectorizer: {}".format(options.vectorizer))
+        # A text-mode file only accepts str: format the feature list and the
+        # sparse matrix as text, one report section per line.
+        oFile.write("Vectorizer: {}\n".format(args.vectorizer))
-        oFile.write(vectorizer.get_feature_names())
+        oFile.write("{}\n".format(vectorizer.get_feature_names()))
-        oFile.write(matrix)
+        oFile.write("{}\n".format(matrix))