Authored by Carlos-Francisco Méndez-Cruz, 2018-10-04 22:04:43 -0500
Commit 5b5b84828ae18e9dc8c3e32799593f1319bc837a (1 parent: f475f0ed)

LSA soft clustering
Showing 1 changed file with 52 additions and 46 deletions
agrupamiento-datos-categoricos/lsa-soft-clustering.py (view file @ 5b5b848)
@@ -11,23 +11,28 @@ from six import iteritems
from gensim import corpora
import argparse
from pdb import set_trace as st  # Debug the program step by step calling st()
                                 # anywhere.


class corpus_streamer(object):
    """ This Object streams the input raw text file row by row.
    """
    def __init__(self, file_name, dictionary=None, strings=None):
        self.file_name = file_name
        self.dictionary = dictionary
        self.strings = strings

    def __iter__(self):
        for line in open(self.file_name):
            # assume there's one document per line, tokens separated by whitespace
            if self.dictionary and not self.strings:
                yield self.dictionary.doc2bow(line.lower().split())
            elif not self.dictionary and self.strings:
                yield line.strip().lower()


# Logging all our program
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
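As a reference for how corpus_streamer is meant to be consumed, here is a minimal sketch (not part of the commit; it assumes the corpora import and the class above, and the file toy.txt with two made-up documents): with strings=True it yields the raw lowercased lines, and with a dictionary it yields gensim bag-of-words vectors.

# Hypothetical two-document input, one document per line (assumption for the sketch).
with open("toy.txt", "w") as tmp:
    tmp.write("Human machine interface\nMachine learning for text\n")

for doc in corpus_streamer("toy.txt", strings=True):
    print(doc)          # "human machine interface", then "machine learning for text"

toy_dict = corpora.Dictionary(line.split() for line in corpus_streamer("toy.txt", strings=True))
for bow in corpus_streamer("toy.txt", dictionary=toy_dict):
    print(bow)          # sparse (token_id, count) pairs, e.g. [(0, 1), (1, 1), (2, 1)]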
@@ -40,23 +45,23 @@ parser.add_argument("--input", help="Input file to perform LSA.",
args = parser.parse_args()
n_topics = args.n_topics
n_docs = 0
input_file = args.input
# input_file='lsa_example.csv'
# input_file='wiki_sample/wiki_75_AA.txt.cln'
# input_file='wiki_sample/wiki_77_AA.txt'

# A little stopwords list
stoplist = set('for a of the and to in _ [ ]'.split())
# Do not load the text corpus into memory, but stream it!
fille = corpus_streamer(input_file, strings=True)
dictionary = corpora.Dictionary(line.lower().split()
                                for line in fille)  # open(input_file))
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
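A toy illustration (not from the repository) of what the filtering above does: tokens that are stopwords or appear in only one document are dropped, and compactify() reassigns contiguous ids. The documents and the printed mapping are assumed values.

from gensim import corpora

toy_docs = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "cat", "ran"]]
toy_dict = corpora.Dictionary(toy_docs)
# "dog" and "ran" each occur in a single document, so they are hapax tokens
hapax_ids = [tid for tid, df in toy_dict.dfs.items() if df == 1]
toy_dict.filter_tokens(hapax_ids)   # drop them, like stop_ids + once_ids above
toy_dict.compactify()               # reassign ids so they are contiguous again
print(toy_dict.token2id)            # e.g. {'cat': 0, 'sat': 1, 'the': 2}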
@@ -66,52 +71,53 @@ dictionary.save('lsa_mini.dict')
# Use instead streaming objects:
# Load stored word-id map (dictionary)
stream_it = corpus_streamer(input_file, dictionary=dictionary)
# for vector in stream_it:  # load one vector into memory at a time
#     print vector
# Convert to sparse matrix
sparse_corpus = [text for text in stream_it]
# Store to disk, for later use collect statistics about all tokens
corpora.MmCorpus.serialize('lsa_mini.mm',
                           sparse_corpus)

## LSA zone
# load the dictionary saved before
id2word = dictionary.load('lsa_mini.dict')
# Now load the sparse matrix corpus from file into a (memory friendly) streaming
# object.
corpus = corpora.MmCorpus('lsa_mini.mm')

## IF TfidfModel
tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus = tfidf[corpus]
## FI TfidfModel
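Aside, not part of the committed file: the TfidfModel wrapper above re-weights each bag-of-words vector before it is folded into the LSI model. A toy sketch with assumed counts:

import gensim

toy_bow = [[(0, 2), (1, 1)], [(0, 1), (2, 3)]]
toy_tfidf = gensim.models.TfidfModel(toy_bow)
# token 0 occurs in every toy document, so its IDF (hence its weight) is 0 and it drops out
print(toy_tfidf[[(0, 2), (1, 1)]])   # e.g. [(1, 1.0)]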
# Compute the LSA vectors
lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
                                      num_topics=n_topics)
# Print the n topics in our corpus:
# lsa.print_topics(n_topics)
with open("topics_file.txt", "w") as f:
    f.write("-------------------------------------------------\n")
    for t in lsa.show_topics(num_words=200):
        f.write("%s\n" % str(t))
    f.write("-------------------------------------------------\n")

# create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsa = lsa[corpus]
# Stream sentences from file into a list of strings called "sentences"
sentences = corpus_streamer(input_file, strings=True)
n = 0
with open("vectors_file.txt", "w") as f:
    for pertenence, sentence in zip(corpus_lsa, sentences):
        if n_docs <= 0:
            # print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
            p = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
                 for x in range(n_topics)]
            f.write("{}\t{}".format("".join(sentence.split("\t")[0].split()),
                                    "".join(str(p)[1:].strip("]").split(","))))
        else:
            if n < n_docs:
                pertenence = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
                              for x in range(n_topics)]
                f.write("%s\t\t%s" % (pertenence, sentence))
                n += 1
            else:
                break
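The list comprehension over range(n_topics) is what turns the sparse LSI output into a soft-clustering membership vector: corpus_lsa yields (topic_id, weight) pairs only for the topics with non-zero weight, and the comprehension fills in 0.0 for the rest so every document ends up with one weight per topic. A standalone sketch with made-up values:

n_topics = 4                                  # assumed for the sketch
pertenence = [(0, 0.83), (2, -0.12)]          # e.g. one sparse vector yielded by corpus_lsa
p = [dict(pertenence)[x] if x in dict(pertenence) else 0.0 for x in range(n_topics)]
print(p)                                      # [0.83, 0.0, -0.12, 0.0]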