Carlos-Francisco Méndez-Cruz

LSA soft clustering

@@ -11,23 +11,28 @@ from six import iteritems
from gensim import corpora
import argparse
from pdb import set_trace as st  # Debug the program step by step by calling st() anywhere.
class corpus_streamer(object):
""" This Object streams the input raw text file row by row.
"""
def __init__(self, file_name, dictionary=None, strings=None):
self.file_name=file_name
self.dictionary=dictionary
self.strings=strings
self.file_name = file_name
self.dictionary = dictionary
self.strings = strings
def __iter__(self):
for line in open(self.file_name):
# assume there's one document per line, tokens separated by whitespace
# assume there's one document per line, tokens separated by whitespace
if self.dictionary and not self.strings:
yield self.dictionary.doc2bow(line.lower().split())
elif not self.dictionary and self.strings:
yield line.strip().lower()
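# Illustrative usage sketch (file name and variable names hypothetical): with
# strings=True the streamer yields raw lowercased lines, while passing a gensim
# Dictionary makes it yield sparse bag-of-words vectors instead:
#   raw_lines = corpus_streamer('corpus.txt', strings=True)       # 'first document ...'
#   bow_vecs = corpus_streamer('corpus.txt', dictionary=vocab)    # [(token_id, count), ...]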
# Configure logging for the whole program.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
@@ -40,23 +45,23 @@ parser.add_argument("--input", help="Input file to perform LSA.",
args = parser.parse_args()
n_topics = args.n_topics
n_docs = 0
input_file = args.input
# input_file='lsa_example.csv'
# input_file='wiki_sample/wiki_75_AA.txt.cln'
# input_file='wiki_sample/wiki_77_AA.txt'
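# Example invocation (a sketch: the script name is hypothetical, and it assumes the
# elided parser setup above also defines a --n_topics option to match args.n_topics):
#   python lsa_clustering.py --input wiki_sample/wiki_77_AA.txt --n_topics 10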
# A small stop-word list.
stoplist = set('for a of the and to in _ [ ]'.split())
# Do not load the text corpus into memory; stream it instead.
fille = corpus_streamer(input_file, strings=True)
dictionary = corpora.Dictionary(line.lower().split() for line in fille)  # open(input_file))
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
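# After compactify() the surviving tokens are mapped to consecutive integer ids,
# and len(dictionary) gives the final vocabulary size.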
@@ -66,52 +71,53 @@ dictionary.save('lsa_mini.dict')
# Use streaming objects instead:
# Stream the corpus as bag-of-words vectors built from the word-id map (dictionary).
stream_it = corpus_streamer(input_file, dictionary=dictionary)
# for vector in stream_it:  # load one vector into memory at a time
#     print(vector)
# Materialize the streamed corpus as a list of sparse bag-of-words vectors.
sparse_corpus = [text for text in stream_it]
# Store the corpus to disk for later use.
corpora.MmCorpus.serialize('lsa_mini.mm',
                           sparse_corpus)
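# Note (optional alternative, not what the script does): MmCorpus.serialize accepts
# any iterable of bag-of-words vectors, so the streaming object could be passed
# directly, e.g. corpora.MmCorpus.serialize('lsa_mini.mm', stream_it), avoiding the
# in-memory list above.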
## LSA zone
# load the dictionary saved before
id2word = dictionary.load('lsa_mini.dict')
# Now load the sparse matrix corpus from file into a (memory friendly) streaming
# object.
corpus = corpora.MmCorpus('lsa_mini.mm')
## IF TfidfModel
tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus = tfidf[corpus]
## FI TfidfModel
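# tfidf[corpus] wraps the bag-of-words stream so that each document vector is
# re-weighted to TF-IDF on the fly when iterated; the raw counts stored on disk
# are left untouched.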
# Compute the LSA vectors
lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
                                      num_topics=n_topics)
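# Each of the n_topics LSA topics is a weighted combination of dictionary terms;
# show_topics(num_words=200) below returns the 200 most significant terms per topic.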
# Print the n topics in our corpus:
# lsa.print_topics(n_topics)
with open("topics_file.txt", "w") as f:
    f.write("-------------------------------------------------\n")
    for t in lsa.show_topics(num_words=200):
        f.write("%s\n" % str(t))
    f.write("-------------------------------------------------\n")
# create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsa = lsa[corpus]
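# Iterating corpus_lsa yields, for each document, its projection into the latent
# topic space as a sparse list of (topic_id, weight) pairs.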
# Stream the raw documents from the input file (one per line, not loaded as a list).
sentences = corpus_streamer(input_file, strings=True)
n = 0
with open("vectors_file.txt", "w") as f:
    for pertenence, sentence in zip(corpus_lsa, sentences):
        if n_docs <= 0:
            # print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
            p = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
                 for x in range(n_topics)]
            f.write("{}\t{}\n".format("".join(sentence.split("\t")[0].split()),
                                      "".join(str(p)[1:].strip("]").split(","))))
        else:
            if n < n_docs:
                pertenence = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
                              for x in range(n_topics)]
                f.write("%s\t\t%s\n" % (pertenence, sentence))
                n += 1
            else:
                break
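# vectors_file.txt ends up with one document per line: with n_docs <= 0, a label
# taken from the first tab-separated field (whitespace removed), a tab, and the
# document's space-separated weight on every topic; otherwise, for the first
# n_docs documents, the full weight list followed by the raw line.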