Carlos-Francisco Méndez-Cruz

LSA soft clustering

@@ -11,23 +11,28 @@ from six import iteritems
 from gensim import corpora
 import argparse

-from pdb import set_trace as st # Debug the program step by step calling st()
-                                # anywhere.
+from pdb import set_trace as st  # Debug the program step by step calling st()
+
+
+# anywhere.
 class corpus_streamer(object):
     """ This Object streams the input raw text file row by row.
     """
+
     def __init__(self, file_name, dictionary=None, strings=None):
-        self.file_name=file_name
-        self.dictionary=dictionary
-        self.strings=strings
+        self.file_name = file_name
+        self.dictionary = dictionary
+        self.strings = strings

     def __iter__(self):
         for line in open(self.file_name):
-            # assume there's one document per line, tokens separated by whitespace
+            # assume there's one document per line, tokens separated by whitespace
             if self.dictionary and not self.strings:
                 yield self.dictionary.doc2bow(line.lower().split())
             elif not self.dictionary and self.strings:
                 yield line.strip().lower()
+
+
 # Logging all our program
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                     level=logging.INFO)
@@ -40,23 +45,23 @@ parser.add_argument("--input", help="Input file to perform LSA.",

 args = parser.parse_args()

-n_topics=args.n_topics
-n_docs=0
-input_file=args.input
-#input_file='lsa_example.csv'
-#input_file='wiki_sample/wiki_75_AA.txt.cln'
-#input_file='wiki_sample/wiki_77_AA.txt'
+n_topics = args.n_topics
+n_docs = 0
+input_file = args.input
+# input_file='lsa_example.csv'
+# input_file='wiki_sample/wiki_75_AA.txt.cln'
+# input_file='wiki_sample/wiki_77_AA.txt'

 # A little stopwords list
 stoplist = set('for a of the and to in _ [ ]'.split())
 # Do not load the text corpus into memory, but stream it!
-fille=corpus_streamer(input_file, strings=True)
-dictionary=corpora.Dictionary(line.lower().split() for line in fille)#open(input_file))
+fille = corpus_streamer(input_file, strings=True)
+dictionary = corpora.Dictionary(line.lower().split() for line in fille)  # open(input_file))
 # remove stop words and words that appear only once
-stop_ids=[dictionary.token2id[stopword] for stopword in stoplist
-          if stopword in dictionary.token2id]
-once_ids=[tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
-          if docfreq == 1]
+stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
+            if stopword in dictionary.token2id]
+once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
+            if docfreq == 1]
 dictionary.filter_tokens(stop_ids + once_ids)
 # remove gaps in id sequence after words that were removed
 dictionary.compactify()
@@ -66,52 +71,53 @@ dictionary.save('lsa_mini.dict')
 # Use instead streaming objects:
 # Load stored word-id map (dictionary)
 stream_it = corpus_streamer(input_file, dictionary=dictionary)
-#for vector in stream_it: # load one vector into memory at a time
+# for vector in stream_it:  # load one vector into memory at a time
 #    print vector
 # Convert to sparse matrix
 sparse_corpus = [text for text in stream_it]
 # Store to disk, for later use collect statistics about all tokens
 corpora.MmCorpus.serialize('lsa_mini.mm',
-        sparse_corpus)
+                           sparse_corpus)
 ## LSA zone
 # load the dictionary saved before
 id2word = dictionary.load('lsa_mini.dict')
 # Now load the sparse matrix corpus from file into a (memory friendly) streaming
 # object.
-corpus=corpora.MmCorpus('lsa_mini.mm')
+corpus = corpora.MmCorpus('lsa_mini.mm')

 ## IF TfidfModel
-tfidf = gensim.models.TfidfModel(corpus) # step 1 -- initialize a model
+tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
 corpus = tfidf[corpus]
 ## FI TfidfModel
 # Compute the LSA vectors
-lsa=gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
-        num_topics=n_topics)
+lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
+                                      num_topics=n_topics)
 # Print the n topics in our corpus:
-#lsa.print_topics(n_topics)
-f=open("topics_file.txt","w")
-f.write("-------------------------------------------------\n")
-for t in lsa.show_topics(num_words=200):
-    f.write("%s\n" % str(t))
+# lsa.print_topics(n_topics)
+with open("topics_file.txt", "w") as f:
+    f.write("-------------------------------------------------\n")
+    for t in lsa.show_topics(num_words=200):
+        f.write("%s\n" % str(t))
+    f.write("-------------------------------------------------\n")

-f.write("-------------------------------------------------\n")
-f.close()
 # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
 corpus_lsa = lsa[corpus]
 # Stream sentences from file into a list of strings called "sentences"
-sentences=corpus_streamer(input_file, strings=True)
-n=0
-for pertenence, sentence in zip(corpus_lsa, sentences):
-    if n_docs <= 0:
-        #print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
-        p=[dict(pertenence)[x] if x in dict(pertenence) else 0.0
-           for x in range(n_topics)]
-        print("{} {}".format("".join(sentence.split("\t")[0].split()), "".join(str(p)[1:].strip("]").split(","))))
-    else:
-        if n<n_docs:
-            pertenence=[dict(pertenence)[x] if x in dict(pertenence) else 0.0
-                        for x in range(n_topics)]
-            print("%s\t\t%s" % (pertenence, sentence))
-            n+=1
+sentences = corpus_streamer(input_file, strings=True)
+n = 0
+with open("vectors_file.txt", "w") as f:
+    for pertenence, sentence in zip(corpus_lsa, sentences):
+        if n_docs <= 0:
+            # print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
+            p = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
+                 for x in range(n_topics)]
+            f.write(
+                "{}\t{}".format("".join(sentence.split("\t")[0].split()), "".join(str(p)[1:].strip("]").split(","))))
         else:
-            break
\ No newline at end of file
+            if n < n_docs:
+                pertenence = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
+                              for x in range(n_topics)]
+                f.write("%s\t\t%s" % (pertenence, sentence))
+                n += 1
+            else:
+                break
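Note: the vectors the script writes to vectors_file.txt are the per-document LSA topic weights, i.e. the soft cluster memberships named in the commit message. Below is a minimal sketch, not part of the commit, of how those memberships could be recovered directly from the artifacts the script saves ('lsa_mini.dict' and 'lsa_mini.mm'). The topic count and the squared-weight normalization are illustrative assumptions, not the author's choices.

# Sketch (assumes the script above has already produced 'lsa_mini.dict' and 'lsa_mini.mm').
from gensim import corpora, models

n_topics = 10  # hypothetical; should match the topic count passed to the script
dictionary = corpora.Dictionary.load('lsa_mini.dict')
corpus = corpora.MmCorpus('lsa_mini.mm')
lsa = models.LsiModel(corpus, id2word=dictionary, num_topics=n_topics)

for doc_id, doc_topics in enumerate(lsa[corpus]):
    # doc_topics is a sparse list of (topic_id, weight); fill missing topics with 0.0
    weights = dict(doc_topics)
    dense = [weights.get(t, 0.0) for t in range(n_topics)]
    # One illustrative way to read the weights as a soft membership: normalize the
    # squared weights to sum to 1 (raw LSA weights can be negative).
    total = sum(w * w for w in dense) or 1.0
    membership = [w * w / total for w in dense]
    print(doc_id, membership)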