Carlos-Francisco Méndez-Cruz

Feature extraction and vectorizer three sentences

......@@ -19,16 +19,17 @@ __author__ = 'CMendezC'
# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
# 4) --feature Feature: word, lemma, pos.
# Output:
# 1) Files with vectors.
# 2) Report with dictionary, vectors, cosine similarity matrix.
# Execution:
# python extraccion-caracteristicas-vectorizacion.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
# --vectorizer b
# --feature word
# source activate python3
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b --feature word
###########################################################
# MAIN PROGRAM #
......@@ -44,6 +45,9 @@ if __name__ == "__main__":
parser.add_argument("--vectorizer", dest="vectorizer", required=True,
help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
choices=('b', 'f', 't'), default='b')
# Feature to vectorize: word, lemma or pos.
# The option is required, so the default is never applied; it is still kept
# as one of the declared choices (the previous 'b' was a vectorizer code,
# not a valid feature).
parser.add_argument("--feature", dest="feature", required=True,
                    help="Feature: word, lemma, pos", metavar="TEXT",
                    choices=('word', 'lemma', 'pos'), default='word')
args = parser.parse_args()
......@@ -61,10 +65,11 @@ if __name__ == "__main__":
# Read documents from input path
for path, dirs, files in os.walk(args.inputPath):
for file in files:
with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
print("...{}".format(file))
# Add file to document list
documents.append(iFile.read())
if file.endswith(args.feature):
with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
print("...{}".format(file))
# Add file to document list
documents.append(iFile.read())
print(" Documents: {}".format(len(documents)))
# Create vectorizer
......@@ -85,7 +90,7 @@ if __name__ == "__main__":
similarityMatrix = cosine_similarity(matrix)
print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape))
with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile:
with open(os.path.join(args.outputPath, "report-vectorizer.{}.{}.txt".format(args.feature, args.vectorizer)), encoding="utf-8", mode="w") as oFile:
oFile.write("Vectorizer: {}\n".format(args.vectorizer))
oFile.write(str(vectorizer.get_feature_names()))
oFile.write("\n")
......