Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
lcg-bioinfoI-bionlp
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-09-12 21:08:51 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
c7fdb2f7d276977906f3714e97b6fdb8dc97c0cc
c7fdb2f7
0 parents
Feature extraction and vectorizer three sentences
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
95 additions
and
0 deletions
.idea/vcs.xml
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
.idea/vcs.xml
0 → 100644
View file @
c7fdb2f
<?xml version="1.0" encoding="UTF-8"?>
<project
version=
"4"
>
<component
name=
"VcsDirectoryMappings"
>
<mapping
directory=
"$PROJECT_DIR$"
vcs=
"Git"
/>
</component>
</project>
\ No newline at end of file
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
0 → 100644
View file @
c7fdb2f
# -*- encoding: utf-8 -*-
import
os
from
time
import
time
from
optparse
import
OptionParser
import
sys
from
sklearn.feature_extraction.text
import
TfidfVectorizer
,
CountVectorizer
from
scipy.sparse
import
csr_matrix
__author__
=
'CMendezC'
# Goal: Feature extraction, vectorizer and TF-IDF
# Parameters:
# 1) --inputPath Path to read input files.
# 2) --outputPath Path to save output files.
# 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
# Output:
# 1) Files with vectors.
# Execution:
# C:\Anaconda3\python extraccion-caracteristicas-vectorizacion.py
# --inputPath
# --outputPath
# --vectorizer
###########################################################
# MAIN PROGRAM #
###########################################################
def read_documents(input_path):
    """Return the text of every file found under *input_path*.

    The tree is walked recursively in sorted order so that the row order of
    the resulting matrix is reproducible between runs and platforms.

    Parameters:
        input_path: Directory that holds the UTF-8 encoded input documents.

    Returns:
        A list with one string per file; empty when no file is found.
    """
    documents = []
    for current_dir, subdirs, file_names in os.walk(input_path):
        # Sorting in place also fixes the order in which os.walk descends.
        subdirs.sort()
        for file_name in sorted(file_names):
            # Join with the directory being visited (not the root) so files
            # in nested folders are opened from where they really are.
            file_path = os.path.join(current_dir, file_name)
            with open(file_path, mode="r", encoding="utf-8") as iFile:
                print("...{}".format(file_name))
                # Add file to document list
                documents.append(iFile.read())
    return documents


def build_vectorizer(kind):
    """Create the scikit-learn vectorizer selected by *kind*.

    Parameters:
        kind: 'b' for binary, 'f' for frequency; anything else gives TF-IDF.

    Returns:
        An unfitted CountVectorizer or TfidfVectorizer over unigrams.
    """
    if kind == "b":
        # Binary vectorizer
        return CountVectorizer(ngram_range=(1, 1), binary=True)
    if kind == "f":
        # Frequency vectorizer
        return CountVectorizer(ngram_range=(1, 1))
    # TF-IDF vectorizer
    return TfidfVectorizer(ngram_range=(1, 1))


def write_report(output_path, kind, feature_names, matrix):
    """Write the vocabulary and the document vectors to a text report.

    The report holds a "Vectorizer: <kind>" line, one tab-separated line with
    the feature names, and then one tab-separated line of values per document.

    Parameters:
        output_path: Directory for the report; created when it is missing.
        kind: Vectorizer code, used in the file name and in the header line.
        feature_names: Iterable with the feature (column) names.
        matrix: Sparse document-term matrix, one row per document.

    Returns:
        The path of the report file that was written.
    """
    os.makedirs(output_path, exist_ok=True)
    report_path = os.path.join(
        output_path, "report-vectorizer.{}.txt".format(kind))
    with open(report_path, encoding="utf-8", mode="w") as oFile:
        oFile.write("Vectorizer: {}\n".format(kind))
        # file.write() only accepts str, so the names are joined explicitly.
        oFile.write("\t".join(str(name) for name in feature_names) + "\n")
        # Densify one row at a time to avoid materialising the whole matrix.
        for row_index in range(matrix.shape[0]):
            values = matrix[row_index].toarray().ravel().tolist()
            oFile.write("\t".join("{}".format(value) for value in values))
            oFile.write("\n")
    return report_path


def main():
    """Parse the command line, vectorize the documents and save the report."""
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    parser.add_option("--vectorizer", dest="vectorizer",
                      help="Vectorizer: b=binary, f=frequency, t=tf-idf",
                      metavar="CHAR", choices=('b', 'f', 't'), default='b')
    (options, args) = parser.parse_args()
    # Option values never show up in `args` (those are leftover positional
    # arguments), so the required options are validated directly.
    # parser.error() prints the usage message and exits with status 2.
    if options.inputPath is None or options.outputPath is None:
        parser.error("Some parameters missed.")
    if not os.path.isdir(options.inputPath):
        parser.error("Input path is not a directory: {}".format(
            options.inputPath))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))
    print("Vectorizer: " + str(options.vectorizer))

    # Start time
    t0 = time()

    print("Reading documents...")
    # Read documents from input path
    documents = read_documents(options.inputPath)
    if not documents:
        # fit_transform() cannot build a vocabulary from an empty corpus.
        sys.exit("No documents found in: {}".format(options.inputPath))

    # Create vectorizer
    print('Vectorizer: {}'.format(options.vectorizer))
    vectorizer = build_vectorizer(options.vectorizer)

    matrix = csr_matrix(vectorizer.fit_transform(documents), dtype='double')
    print('   matrix.shape: ', matrix.shape)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    write_report(options.outputPath, options.vectorizer, feature_names, matrix)

    print("Feature extraction and vectorizer in: %fs" % (time() - t0))


if __name__ == "__main__":
    main()
Please
register
or
login
to post a comment