Carlos-Francisco Méndez-Cruz

LSA soft clustering

@@ -11,23 +11,28 @@ from six import iteritems
from gensim import corpora
import argparse
from pdb import set_trace as st  # Debug the program step by step by calling st() anywhere.
class corpus_streamer(object):
""" This Object streams the input raw text file row by row.
"""
def __init__(self, file_name, dictionary=None, strings=None):
self.file_name=file_name
self.dictionary=dictionary
self.strings=strings
self.file_name = file_name
self.dictionary = dictionary
self.strings = strings
def __iter__(self):
for line in open(self.file_name):
# assume there's one document per line, tokens separated by whitespace
# assume there's one document per line, tokens separated by whitespace
if self.dictionary and not self.strings:
yield self.dictionary.doc2bow(line.lower().split())
elif not self.dictionary and self.strings:
yield line.strip().lower()
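# Illustrative usage sketch (file name and variable names hypothetical): with
# strings=True the streamer yields raw lowercased lines, while passing a gensim
# Dictionary makes it yield sparse bag-of-words vectors instead:
#   raw_lines = corpus_streamer('corpus.txt', strings=True)       # 'first document ...'
#   bow_vecs = corpus_streamer('corpus.txt', dictionary=vocab)    # [(token_id, count), ...]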
# Configure logging for the whole program.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
@@ -40,23 +45,23 @@ parser.add_argument("--input", help="Input file to perform LSA.",
args = parser.parse_args()
n_topics = args.n_topics
n_docs = 0
input_file = args.input
# input_file='lsa_example.csv'
# input_file='wiki_sample/wiki_75_AA.txt.cln'
# input_file='wiki_sample/wiki_77_AA.txt'
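# Example invocation (a sketch: the script name is hypothetical, and it assumes the
# elided parser setup above also defines a --n_topics option to match args.n_topics):
#   python lsa_clustering.py --input wiki_sample/wiki_77_AA.txt --n_topics 10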
# A small stop-word list.
stoplist = set('for a of the and to in _ [ ]'.split())
# Do not load the text corpus into memory; stream it instead.
fille = corpus_streamer(input_file, strings=True)
dictionary = corpora.Dictionary(line.lower().split() for line in fille)  # open(input_file))
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
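# After compactify() the surviving tokens are mapped to consecutive integer ids,
# and len(dictionary) gives the final vocabulary size.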
@@ -66,52 +71,53 @@ dictionary.save('lsa_mini.dict')
# Use streaming objects instead:
# Stream the corpus as bag-of-words vectors built from the word-id map (dictionary).
stream_it = corpus_streamer(input_file, dictionary=dictionary)
# for vector in stream_it:  # load one vector into memory at a time
#     print(vector)
# Materialize the streamed corpus as a list of sparse bag-of-words vectors.
sparse_corpus = [text for text in stream_it]
# Store the corpus to disk for later use.
corpora.MmCorpus.serialize('lsa_mini.mm',
                           sparse_corpus)
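# Note (optional alternative, not what the script does): MmCorpus.serialize accepts
# any iterable of bag-of-words vectors, so the streaming object could be passed
# directly, e.g. corpora.MmCorpus.serialize('lsa_mini.mm', stream_it), avoiding the
# in-memory list above.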
## LSA zone
# load the dictionary saved before
id2word = dictionary.load('lsa_mini.dict')
# Now load the sparse matrix corpus from file into a (memory friendly) streaming
# object.
corpus = corpora.MmCorpus('lsa_mini.mm')
## IF TfidfModel
tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus = tfidf[corpus]
## FI TfidfModel
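# tfidf[corpus] wraps the bag-of-words stream so that each document vector is
# re-weighted to TF-IDF on the fly when iterated; the raw counts stored on disk
# are left untouched.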
# Compute the LSA vectors
lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
                                      num_topics=n_topics)
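# Each of the n_topics LSA topics is a weighted combination of dictionary terms;
# show_topics(num_words=200) below returns the 200 most significant terms per topic.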
# Print the n topics in our corpus:
# lsa.print_topics(n_topics)
with open("topics_file.txt", "w") as f:
    f.write("-------------------------------------------------\n")
    for t in lsa.show_topics(num_words=200):
        f.write("%s\n" % str(t))
    f.write("-------------------------------------------------\n")
# create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsa = lsa[corpus]
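# Iterating corpus_lsa yields, for each document, its projection into the latent
# topic space as a sparse list of (topic_id, weight) pairs.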
# Stream the raw documents from the input file (one per line, not loaded as a list).
sentences = corpus_streamer(input_file, strings=True)
n = 0
with open("vectors_file.txt", "w") as f:
    for pertenence, sentence in zip(corpus_lsa, sentences):
        if n_docs <= 0:
            # print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
            p = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
                 for x in range(n_topics)]
            f.write("{}\t{}\n".format("".join(sentence.split("\t")[0].split()),
                                      "".join(str(p)[1:].strip("]").split(","))))
        else:
            if n < n_docs:
                pertenence = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
                              for x in range(n_topics)]
                f.write("%s\t\t%s\n" % (pertenence, sentence))
                n += 1
            else:
                break
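# vectors_file.txt ends up with one document per line: with n_docs <= 0, a label
# taken from the first tab-separated field (whitespace removed), a tab, and the
# document's space-separated weight on every topic; otherwise, for the first
# n_docs documents, the full weight list followed by the raw line.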