Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
lcg-bioinfoI-bionlp
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-09-12 22:18:28 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
d7ae81dbef72b0c6d83a2ef2da3d4cdbcde10c85
d7ae81db
1 parent
f8091a1b
Feature extraction and vectorizer three sentences
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
12 additions
and
7 deletions
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @
d7ae81d
...
...
@@ -19,16 +19,17 @@ __author__ = 'CMendezC'
# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
# Output:
# 1)
Files with vectors
.
# 1)
Report with dictionary, vectors, cosine similarity matrix
.
# Execution:
# python extraccion-caracteristicas-vectorizacion.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
# --vectorizer b
# --feature word
# source activate python3
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
--feature word
###########################################################
# MAIN PROGRAM #
...
...
@@ -44,6 +45,9 @@ if __name__ == "__main__":
# Command-line options. Both options are mandatory (required=True), so
# argparse never falls back to the declared defaults; they are kept only as
# a hint of the conventional value and therefore must be one of `choices`.
parser.add_argument(
    "--vectorizer",
    dest="vectorizer",
    required=True,
    help="Vectorizer: b=binary, f=frequency, t=tf-idf",
    metavar="CHAR",
    choices=('b', 'f', 't'),
    default='b',
)
# Feature selecting which input files are vectorized: the script keeps only
# files whose name ends with this value (see the file.endswith(args.feature)
# filter in the document-reading loop).
# The default was previously 'b' (copied from --vectorizer), which is not a
# valid feature; it is now a member of `choices`.
parser.add_argument(
    "--feature",
    dest="feature",
    required=True,
    help="Feature: word, lemma, pos",
    metavar="TEXT",
    choices=('word', 'lemma', 'pos'),
    default='word',
)

# Parse sys.argv; argparse exits with a usage message on invalid options.
args = parser.parse_args()
...
...
@@ -61,10 +65,11 @@ if __name__ == "__main__":
# Read documents from input path
for
path
,
dirs
,
files
in
os
.
walk
(
args
.
inputPath
):
for
file
in
files
:
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
file
),
mode
=
"r"
,
encoding
=
"utf-8"
)
as
iFile
:
print
(
"...{}"
.
format
(
file
))
# Add file to document list
documents
.
append
(
iFile
.
read
())
if
file
.
endswith
(
args
.
feature
):
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
file
),
mode
=
"r"
,
encoding
=
"utf-8"
)
as
iFile
:
print
(
"...{}"
.
format
(
file
))
# Add file to document list
documents
.
append
(
iFile
.
read
())
print
(
" Documents: {}"
.
format
(
len
(
documents
)))
# Create vectorizer
...
...
@@ -85,7 +90,7 @@ if __name__ == "__main__":
similarityMatrix
=
cosine_similarity
(
matrix
)
print
(
" Cosine similarity matrix shape: {}"
.
format
(
similarityMatrix
.
shape
))
with
open
(
os
.
path
.
join
(
args
.
outputPath
,
"report-vectorizer.{}.
txt"
.
format
(
args
.
vectorizer
)),
encoding
=
"utf-8"
,
mode
=
"w"
)
as
oFile
:
with
open
(
os
.
path
.
join
(
args
.
outputPath
,
"report-vectorizer.{}.
{}.txt"
.
format
(
args
.
feature
,
args
.
vectorizer
)),
encoding
=
"utf-8"
,
mode
=
"w"
)
as
oFile
:
oFile
.
write
(
"Vectorizer: {}
\n
"
.
format
(
args
.
vectorizer
))
oFile
.
write
(
str
(
vectorizer
.
get_feature_names
()))
oFile
.
write
(
"
\n
"
)
...
...
Please
register
or
login
to post a comment