extraccion-caracteristicas-vectorizacion.py
# -*- encoding: utf-8 -*-
import os
from time import time
import argparse
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
__author__ = 'CMendezC'
# Goal: Feature extraction, vectorizer and TF-IDF
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to save output files.
# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
# 4) --feature Extracted feature from documents: word, lemma, pos, ner
# Output:
# 1) Report with dictionary, vectors, cosine similarity matrix.
# Execution:
# python extraccion-caracteristicas-vectorizacion.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
# --vectorizer b
# --feature word
# source activate python3
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b --feature word
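# Illustrative sketch with a hypothetical toy corpus (values are not taken from the data set):
# for two documents, d1 = "gene activates protein" and d2 = "protein binds protein",
# the sorted vocabulary is ['activates', 'binds', 'gene', 'protein'] and the
# three vectorizer options produce:
#   b (binary):    d1 = [1, 0, 1, 1]   d2 = [0, 1, 0, 1]   (term present or absent)
#   f (frequency): d1 = [1, 0, 1, 1]   d2 = [0, 1, 0, 2]   (raw term counts)
#   t (tf-idf):    counts reweighted so that 'protein', which occurs in both
#                  documents, contributes less than the document-specific terms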
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
    # Parameter definition
    parser = argparse.ArgumentParser(description='Feature extraction and vectorizer.')
    parser.add_argument("--inputPath", dest="inputPath", required=True,
                        help="Path to read input files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath", required=True,
                        help="Path to place output files", metavar="PATH")
    parser.add_argument("--vectorizer", dest="vectorizer", required=True,
                        help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                        choices=('b', 'f', 't'), default='b')
    parser.add_argument("--feature", dest="feature", required=True,
                        help="Feature: word, lemma, pos, ner", metavar="TEXT",
                        choices=('word', 'lemma', 'pos', 'ner'), default='word')
    args = parser.parse_args()
    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("Path to place output files: " + str(args.outputPath))
    print("Vectorizer: " + str(args.vectorizer))
    print("Feature: " + str(args.feature))
    # Start time
    t0 = time()
    print("Reading documents...")
    documents = []
    # Read documents from input path
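    # Only files whose names end with the selected feature are loaded, e.g.
    # *.word, *.lemma, *.pos or *.ner (assumed naming convention of the data set).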
    for path, dirs, files in os.walk(args.inputPath):
        for file in files:
            if file.endswith(args.feature):
                # Join with the walked path so files in subdirectories are also found
                with open(os.path.join(path, file), mode="r", encoding="utf-8") as iFile:
                    print("...{}".format(file))
                    # Add file to document list
                    documents.append(iFile.read())
    print(" Documents: {}".format(len(documents)))
    # Create vectorizer
    print(' Vectorizer: {}'.format(args.vectorizer))
    if args.vectorizer == "b":
        # Binary vectorizer
        vectorizer = CountVectorizer(ngram_range=(1, 1), binary=True)
    elif args.vectorizer == "f":
        # Frequency vectorizer
        vectorizer = CountVectorizer(ngram_range=(1, 1))
    else:
        # TF-IDF vectorizer
        vectorizer = TfidfVectorizer(ngram_range=(1, 1))
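    # All three vectorizers expose the same fit_transform interface, so the rest of
    # the script is identical for every mode; TfidfVectorizer is equivalent to
    # CountVectorizer followed by a TfidfTransformer (raw counts reweighted by
    # inverse document frequency).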
    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
    print(' matrix.shape: ', matrix.shape)
    similarityMatrix = cosine_similarity(matrix)
    print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape))
    with open(os.path.join(args.outputPath, "report-vectorizer.{}.{}.txt".format(args.feature, args.vectorizer)), encoding="utf-8", mode="w") as oFile:
        oFile.write("Vectorizer: {}\n".format(args.vectorizer))
        # get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2
        oFile.write(str(vectorizer.get_feature_names_out()))
        oFile.write("\n")
        oFile.write(str(matrix.toarray()))
        oFile.write("\n")
        oFile.write(str(similarityMatrix))
    print("Feature extraction and vectorizer in: %fs" % (time() - t0))