Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
lcg-bioinfoI-bionlp
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-09-12 21:33:07 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
83155741c5e4b10ae22ad23cb336e714c87a1bc5
83155741
1 parent
b7c3abcd
Feature extraction and vectorizer three sentences
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
3 deletions
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @
8315574
...
...
@@ -58,15 +58,16 @@ if __name__ == "__main__":
print
(
"Reading documents..."
)
documents
=
[]
# Read documents from input path
for
path
,
dirs
,
files
in
os
.
walk
(
args
.
out
putPath
):
for
path
,
dirs
,
files
in
os
.
walk
(
args
.
in
putPath
):
for
file
in
files
:
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
file
),
mode
=
"r"
,
encoding
=
"utf-8"
)
as
iFile
:
print
(
"...{}"
.
format
(
file
))
# Add file to document list
documents
.
append
(
iFile
.
read
())
print
(
" Documents: {}"
.
format
(
len
(
documents
)))
# Create vectorizer
print
(
'Vectorizer: {}'
.
format
(
args
.
vectorizer
))
print
(
'
Vectorizer: {}'
.
format
(
args
.
vectorizer
))
if
args
.
vectorizer
==
"b"
:
# Binary vectorizer
vectorizer
=
CountVectorizer
(
ngram_range
=
(
1
,
1
),
binary
=
True
)
...
...
@@ -78,7 +79,7 @@ if __name__ == "__main__":
vectorizer
=
TfidfVectorizer
(
ngram_range
=
(
1
,
1
))
matrix
=
csr_matrix
(
vectorizer
.
fit_transform
(
documents
),
dtype
=
'double'
)
print
(
'
matrix.shape: '
,
matrix
.
shape
)
print
(
' matrix.shape: '
,
matrix
.
shape
)
with
open
(
os
.
path
.
join
(
args
.
outputPath
,
"report-vectorizer.{}.txt"
.
format
(
args
.
vectorizer
)),
encoding
=
"utf-8"
,
mode
=
"w"
)
as
oFile
:
oFile
.
write
(
"Vectorizer: {}"
.
format
(
args
.
vectorizer
))
...
...
Please
register
or
login
to post a comment