Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
lcg-bioinfoI-bionlp
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-09-12 21:30:09 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
cf9f770f8606fa016d74988a46ed77ee77431556
cf9f770f
1 parent
c7fdb2f7
Feature extraction and vectorizer three sentences
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
23 additions
and
20 deletions
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @
cf9f770
...
...
@@ -2,7 +2,8 @@
import
os
from
time
import
time
from
optparse
import
OptionParser
# from optparse import OptionParser
import
argparse
import
sys
from
sklearn.feature_extraction.text
import
TfidfVectorizer
,
CountVectorizer
from
scipy.sparse
import
csr_matrix
...
...
@@ -20,11 +21,13 @@ __author__ = 'CMendezC'
# 1) Files with vectors.
# Execution:
# python extraccion-caracteristicas-vectorizacion.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
# --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
# --vectorizer b
# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
# --inputPath
# --outputPath
# --vectorizer
# source activate python3
# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b
###########################################################
# MAIN PROGRAM #
...
...
@@ -32,16 +35,16 @@ __author__ = 'CMendezC'
if
__name__
==
"__main__"
:
# Parameter definition
parser
=
OptionParser
(
)
parser
.
add_
option
(
"--inputPath"
,
dest
=
"inputPath"
,
parser
=
argparse
.
ArgumentParser
(
description
=
'Feature extraction and vectorizer.'
)
parser
.
add_
argument
(
"--inputPath"
,
dest
=
"inputPath"
,
required
=
True
,
help
=
"Path to read input files"
,
metavar
=
"PATH"
)
parser
.
add_
option
(
"--outputPath"
,
dest
=
"outputPath"
,
parser
.
add_
argument
(
"--outputPath"
,
dest
=
"outputPath"
,
required
=
True
,
help
=
"Path to place output files"
,
metavar
=
"PATH"
)
parser
.
add_
option
(
"--vectorizer"
,
dest
=
"vectorizer"
,
parser
.
add_
argument
(
"--vectorizer"
,
dest
=
"vectorizer"
,
required
=
True
,
help
=
"Vectorizer: b=binary, f=frequency, t=tf-idf"
,
metavar
=
"CHAR"
,
choices
=
(
'b'
,
'f'
,
't'
),
default
=
'b'
)
(
options
,
args
)
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
print
(
len
(
args
))
if
len
(
args
)
!=
3
:
parser
.
error
(
"Some parameters missed."
)
...
...
@@ -49,9 +52,9 @@ if __name__ == "__main__":
# Printing parameter values
print
(
'-------------------------------- PARAMETERS --------------------------------'
)
print
(
"Path to read input files: "
+
str
(
option
s
.
inputPath
))
print
(
"Path to place output files: "
+
str
(
option
s
.
outputPath
))
print
(
"Vectorizer: "
+
str
(
option
s
.
vectorizer
))
print
(
"Path to read input files: "
+
str
(
arg
s
.
inputPath
))
print
(
"Path to place output files: "
+
str
(
arg
s
.
outputPath
))
print
(
"Vectorizer: "
+
str
(
arg
s
.
vectorizer
))
# Start time
t0
=
time
()
...
...
@@ -59,19 +62,19 @@ if __name__ == "__main__":
print
(
"Reading documents..."
)
documents
=
[]
# Read documents from input path
for
path
,
dirs
,
files
in
os
.
walk
(
option
s
.
outputPath
):
for
path
,
dirs
,
files
in
os
.
walk
(
arg
s
.
outputPath
):
for
file
in
files
:
with
open
(
os
.
path
.
join
(
option
s
.
inputPath
,
file
),
mode
=
"r"
,
encoding
=
"utf-8"
)
as
iFile
:
with
open
(
os
.
path
.
join
(
arg
s
.
inputPath
,
file
),
mode
=
"r"
,
encoding
=
"utf-8"
)
as
iFile
:
print
(
"...{}"
.
format
(
file
))
# Add file to document list
documents
.
append
(
iFile
.
read
())
# Create vectorizer
print
(
'Vectorizer: {}'
.
format
(
option
s
.
vectorizer
))
if
option
s
.
vectorizer
==
"b"
:
print
(
'Vectorizer: {}'
.
format
(
arg
s
.
vectorizer
))
if
arg
s
.
vectorizer
==
"b"
:
# Binary vectorizer
vectorizer
=
CountVectorizer
(
ngram_range
=
(
1
,
1
),
binary
=
True
)
elif
option
s
.
vectorizer
==
"f"
:
elif
arg
s
.
vectorizer
==
"f"
:
# Frequency vectorizer
vectorizer
=
CountVectorizer
(
ngram_range
=
(
1
,
1
))
else
:
...
...
@@ -81,8 +84,8 @@ if __name__ == "__main__":
matrix
=
csr_matrix
(
vectorizer
.
fit_transform
(
documents
),
dtype
=
'double'
)
print
(
' matrix.shape: '
,
matrix
.
shape
)
with
open
(
os
.
path
.
join
(
options
.
outputPath
,
"report-vectorizer.{}.txt"
.
format
(
option
s
.
vectorizer
)),
encoding
=
"utf-8"
,
mode
=
"w"
)
as
oFile
:
oFile
.
write
(
"Vectorizer: {}"
.
format
(
option
s
.
vectorizer
))
with
open
(
os
.
path
.
join
(
args
.
outputPath
,
"report-vectorizer.{}.txt"
.
format
(
arg
s
.
vectorizer
)),
encoding
=
"utf-8"
,
mode
=
"w"
)
as
oFile
:
oFile
.
write
(
"Vectorizer: {}"
.
format
(
arg
s
.
vectorizer
))
oFile
.
write
(
vectorizer
.
get_feature_names
())
oFile
.
write
(
matrix
)
...
...
Please
register
or
login
to post a comment