Carlos-Francisco Méndez-Cruz

Feature extraction and vectorization for the three-sentences data set

......@@ -2,7 +2,8 @@
import os
from time import time
from optparse import OptionParser
# from optparse import OptionParser
import argparse
import sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
......@@ -20,11 +21,13 @@ __author__ = 'CMendezC'
# 1) Files with vectors.
# Execution:
# python extraccion-caracteristicas-vectorizacion.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
# --vectorizer b
# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
# --inputPath
# --outputPath
# --vectorizer
# source activate python3
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
###########################################################
# MAIN PROGRAM #
......@@ -32,16 +35,16 @@ __author__ = 'CMendezC'
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.')
parser.add_argument("--inputPath", dest="inputPath", required=True,
help="Path to read input files", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
parser.add_argument("--outputPath", dest="outputPath", required=True,
help="Path to place output files", metavar="PATH")
parser.add_option("--vectorizer", dest="vectorizer",
parser.add_argument("--vectorizer", dest="vectorizer", required=True,
help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
choices=('b', 'f', 't'), default='b')
(options, args) = parser.parse_args()
args = parser.parse_args()
print(len(args))
if len(args) != 3:
parser.error("Some parameters missed.")
......@@ -49,9 +52,9 @@ if __name__ == "__main__":
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + str(options.inputPath))
print("Path to place output files: " + str(options.outputPath))
print("Vectorizer: " + str(options.vectorizer))
print("Path to read input files: " + str(args.inputPath))
print("Path to place output files: " + str(args.outputPath))
print("Vectorizer: " + str(args.vectorizer))
# Start time
t0 = time()
......@@ -59,19 +62,19 @@ if __name__ == "__main__":
print("Reading documents...")
documents = []
# Read documents from input path
for path, dirs, files in os.walk(options.outputPath):
for path, dirs, files in os.walk(args.outputPath):
for file in files:
with open(os.path.join(options.inputPath, file), mode="r", encoding="utf-8") as iFile:
with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
print("...{}".format(file))
# Add file to document list
documents.append(iFile.read())
# Create vectorizer
print('Vectorizer: {}'.format(options.vectorizer))
if options.vectorizer == "b":
print('Vectorizer: {}'.format(args.vectorizer))
if args.vectorizer == "b":
# Binary vectorizer
vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
elif options.vectorizer == "f":
elif args.vectorizer == "f":
# Frequency vectorizer
vectorizer = CountVectorizer(ngram_range=(1, 1))
else:
......@@ -81,8 +84,8 @@ if __name__ == "__main__":
matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
print(' matrix.shape: ', matrix.shape)
with open(os.path.join(options.outputPath, "report-vectorizer.{}.txt".format(options.vectorizer)), encoding="utf-8", mode="w") as oFile:
oFile.write("Vectorizer: {}".format(options.vectorizer))
with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
oFile.write("Vectorizer: {}".format(args.vectorizer))
oFile.write(vectorizer.get_feature_names())
oFile.write(matrix)
......