Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
lcg-bioinfoI-bionlp
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-09-12 22:09:19 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
d3712e47bb291e1546c1d2ebc2101916559a0e04
d3712e47
1 parent
27174a6e
Feature extraction and vectorizer three sentences
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
6 additions
and
0 deletions
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @
d3712e4
...
...
@@ -7,6 +7,7 @@ import argparse
import
sys
from
sklearn.feature_extraction.text
import
TfidfVectorizer
,
CountVectorizer
from
scipy.sparse
import
csr_matrix
from
sklearn.metrics.pairwise
import
cosine_similarity
__author__
=
'CMendezC'
...
...
@@ -81,10 +82,15 @@ if __name__ == "__main__":
# Vectorize the documents and keep the result as a CSR matrix of doubles.
matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
print(' matrix.shape: ', matrix.shape)

# Pairwise cosine similarity between the document vectors.
similarityMatrix = cosine_similarity(matrix)
print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape))

# Write a plain-text report: vectorizer name, feature names, the
# document-term matrix, and the similarity matrix.
reportPath = os.path.join(
    args.outputPath,
    "report-vectorizer.{}.txt".format(args.vectorizer),
)
with open(reportPath, encoding="utf-8", mode="w") as oFile:
    oFile.write("Vectorizer: {}\n".format(args.vectorizer))
    # NOTE(review): get_feature_names() was removed in scikit-learn 1.2;
    # switch to get_feature_names_out() when the environment is upgraded.
    oFile.write(str(vectorizer.get_feature_names()))
    oFile.write("\n")
    oFile.write(str(matrix.toarray()))
    oFile.write("\n")
    # cosine_similarity() returns a dense numpy array by default
    # (dense_output=True), which has no .toarray() method; calling it raised
    # AttributeError, so the array is written directly.
    oFile.write(str(similarityMatrix))

print("Feature extraction and vectorizer in: %fs" % (time() - t0))
...
...
Please
register
or
login
to post a comment