Showing 1 changed file with 52 additions and 46 deletions
@@ -11,23 +11,28 @@ from six import iteritems
 from gensim import corpora
 import argparse
 
-from pdb import set_trace as st # Debug the program step by step calling st()
-                                # anywhere.
+from pdb import set_trace as st  # Debug the program step by step calling st()
+
+
+# anywhere.
 class corpus_streamer(object):
     """ This Object streams the input raw text file row by row.
     """
+
     def __init__(self, file_name, dictionary=None, strings=None):
-        self.file_name=file_name
-        self.dictionary=dictionary
-        self.strings=strings
+        self.file_name = file_name
+        self.dictionary = dictionary
+        self.strings = strings
 
     def __iter__(self):
         for line in open(self.file_name):
-            # assume there's one document per line, tokens separated by whitespace
+            # assume there's one document per line, tokens separated by whitespace
             if self.dictionary and not self.strings:
                 yield self.dictionary.doc2bow(line.lower().split())
             elif not self.dictionary and self.strings:
                 yield line.strip().lower()
+
+
 # Logging all our program
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                     level=logging.INFO)
@@ -40,23 +45,23 @@ parser.add_argument("--input", help="Input file to perform LSA.",
 
 args = parser.parse_args()
 
-n_topics=args.n_topics
-n_docs=0
-input_file=args.input
-#input_file='lsa_example.csv'
-#input_file='wiki_sample/wiki_75_AA.txt.cln'
-#input_file='wiki_sample/wiki_77_AA.txt'
+n_topics = args.n_topics
+n_docs = 0
+input_file = args.input
+# input_file='lsa_example.csv'
+# input_file='wiki_sample/wiki_75_AA.txt.cln'
+# input_file='wiki_sample/wiki_77_AA.txt'
 
 # A little stopwords list
 stoplist = set('for a of the and to in _ [ ]'.split())
 # Do not load the text corpus into memory, but stream it!
-fille=corpus_streamer(input_file, strings=True)
-dictionary=corpora.Dictionary(line.lower().split() for line in fille)#open(input_file))
+fille = corpus_streamer(input_file, strings=True)
+dictionary = corpora.Dictionary(line.lower().split() for line in fille)  # open(input_file))
 # remove stop words and words that appear only once
-stop_ids=[dictionary.token2id[stopword] for stopword in stoplist
-          if stopword in dictionary.token2id]
-once_ids=[tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
-          if docfreq == 1]
+stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
+            if stopword in dictionary.token2id]
+once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
+            if docfreq == 1]
 dictionary.filter_tokens(stop_ids + once_ids)
 # remove gaps in id sequence after words that were removed
 dictionary.compactify()
@@ -66,52 +71,53 @@ dictionary.save('lsa_mini.dict')
 # Use instead streaming objects:
 # Load stored word-id map (dictionary)
 stream_it = corpus_streamer(input_file, dictionary=dictionary)
-#for vector in stream_it: # load one vector into memory at a time
+# for vector in stream_it: # load one vector into memory at a time
 #    print vector
 # Convert to sparse matrix
 sparse_corpus = [text for text in stream_it]
 # Store to disk, for later use collect statistics about all tokens
 corpora.MmCorpus.serialize('lsa_mini.mm',
-    sparse_corpus)
+                           sparse_corpus)
 ## LSA zone
 # load the dictionary saved before
 id2word = dictionary.load('lsa_mini.dict')
 # Now load the sparse matrix corpus from file into a (memory friendly) streaming
 # object.
-corpus=corpora.MmCorpus('lsa_mini.mm')
+corpus = corpora.MmCorpus('lsa_mini.mm')
 
 ## IF TfidfModel
-tfidf = gensim.models.TfidfModel(corpus) # step 1 -- initialize a model
+tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
 corpus = tfidf[corpus]
 ## FI TfidfModel
 # Compute the LSA vectors
-lsa=gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
-    num_topics=n_topics)
+lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
+                                      num_topics=n_topics)
 # Print the n topics in our corpus:
-#lsa.print_topics(n_topics)
-f=open("topics_file.txt","w")
-f.write("-------------------------------------------------\n")
-for t in lsa.show_topics(num_words=200):
-    f.write("%s\n" % str(t))
+# lsa.print_topics(n_topics)
+with open("topics_file.txt", "w") as f:
+    f.write("-------------------------------------------------\n")
+    for t in lsa.show_topics(num_words=200):
+        f.write("%s\n" % str(t))
+    f.write("-------------------------------------------------\n")
 
-f.write("-------------------------------------------------\n")
-f.close()
 # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
 corpus_lsa = lsa[corpus]
 # Stream sentences from file into a list of strings called "sentences"
-sentences=corpus_streamer(input_file, strings=True)
-n=0
-for pertenence, sentence in zip(corpus_lsa, sentences):
-    if n_docs <= 0:
-        #print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
-        p=[dict(pertenence)[x] if x in dict(pertenence) else 0.0
-            for x in range(n_topics)]
-        print("{} {}".format("".join(sentence.split("\t")[0].split()), "".join(str(p)[1:].strip("]").split(","))))
-    else:
-        if n<n_docs:
-            pertenence=[dict(pertenence)[x] if x in dict(pertenence) else 0.0
-                for x in range(n_topics)]
-            print("%s\t\t%s" % (pertenence, sentence))
-            n+=1
+sentences = corpus_streamer(input_file, strings=True)
+n = 0
+with open("vectors_file.txt", "w") as f:
+    for pertenence, sentence in zip(corpus_lsa, sentences):
+        if n_docs <= 0:
+            # print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
+            p = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
+                 for x in range(n_topics)]
+            f.write(
+                "{}\t{}".format("".join(sentence.split("\t")[0].split()), "".join(str(p)[1:].strip("]").split(","))))
         else:
-            break
\ No newline at end of file
+            if n < n_docs:
+                pertenence = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
+                    for x in range(n_topics)]
+                f.write("%s\t\t%s" % (pertenence, sentence))
+                n += 1
+            else:
+                break
\ No newline at end of file
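
For reference, the script touched by this commit follows the standard gensim LSA recipe: build a Dictionary, convert documents to bag-of-words vectors, optionally re-weight with TF-IDF, then fit an LsiModel and project documents into topic space. Below is a minimal, self-contained sketch of that pipeline on a toy in-memory corpus; the example documents and the num_topics value are illustrative and not part of the commit, and the streaming/serialization steps of the real script are intentionally omitted.

# Minimal sketch of the gensim pipeline used above: tokens -> Dictionary ->
# bag-of-words -> TF-IDF -> LSI. Toy documents, illustrative num_topics.
from gensim import corpora, models

docs = ["human machine interface for lab computer applications",
        "a survey of user opinion of computer system response time",
        "the generation of random binary unordered trees"]
tokenized = [doc.lower().split() for doc in docs]

dictionary = corpora.Dictionary(tokenized)                      # word <-> id map
bow_corpus = [dictionary.doc2bow(toks) for toks in tokenized]   # sparse count vectors

tfidf = models.TfidfModel(bow_corpus)                           # step 1: fit TF-IDF weights
corpus_tfidf = tfidf[bow_corpus]                                # step 2: re-weight the corpus

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
for topic in lsi.show_topics(num_words=5):                      # inspect the topics
    print(topic)
for doc_vec in lsi[corpus_tfidf]:                               # per-document topic weights
    print(doc_vec)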