Showing 1 changed file with 52 additions and 46 deletions
@@ -11,23 +11,28 @@ from six import iteritems
 from gensim import corpora
 import argparse
 
-from pdb import set_trace as st # Debug the program step by step calling st()
-                                # anywhere.
+from pdb import set_trace as st  # Debug the program step by step calling st()
+
+
+# anywhere.
 class corpus_streamer(object):
     """ This Object streams the input raw text file row by row.
     """
+
     def __init__(self, file_name, dictionary=None, strings=None):
-        self.file_name=file_name
-        self.dictionary=dictionary
-        self.strings=strings
+        self.file_name = file_name
+        self.dictionary = dictionary
+        self.strings = strings
 
     def __iter__(self):
         for line in open(self.file_name):
-        # assume there's one document per line, tokens separated by whitespace
+            # assume there's one document per line, tokens separated by whitespace
             if self.dictionary and not self.strings:
                 yield self.dictionary.doc2bow(line.lower().split())
            elif not self.dictionary and self.strings:
                 yield line.strip().lower()
+
+
 # Logging all our program
 logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                     level=logging.INFO)
@@ -40,23 +45,23 @@ parser.add_argument("--input", help="Input file to perform LSA.",
 
 args = parser.parse_args()
 
-n_topics=args.n_topics
-n_docs=0
-input_file=args.input
-#input_file='lsa_example.csv'
-#input_file='wiki_sample/wiki_75_AA.txt.cln'
-#input_file='wiki_sample/wiki_77_AA.txt'
+n_topics = args.n_topics
+n_docs = 0
+input_file = args.input
+# input_file='lsa_example.csv'
+# input_file='wiki_sample/wiki_75_AA.txt.cln'
+# input_file='wiki_sample/wiki_77_AA.txt'
 
 # A little stopwords list
 stoplist = set('for a of the and to in _ [ ]'.split())
 # Do not load the text corpus into memory, but stream it!
-fille=corpus_streamer(input_file, strings=True)
-dictionary=corpora.Dictionary(line.lower().split() for line in fille)#open(input_file))
+fille = corpus_streamer(input_file, strings=True)
+dictionary = corpora.Dictionary(line.lower().split() for line in fille)  # open(input_file))
 # remove stop words and words that appear only once
-stop_ids=[dictionary.token2id[stopword] for stopword in stoplist
-          if stopword in dictionary.token2id]
-once_ids=[tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
-          if docfreq == 1]
+stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
+            if stopword in dictionary.token2id]
+once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
+            if docfreq == 1]
 dictionary.filter_tokens(stop_ids + once_ids)
 # remove gaps in id sequence after words that were removed
 dictionary.compactify()
@@ -66,52 +71,53 @@ dictionary.save('lsa_mini.dict')
 # Use instead streaming objects:
 # Load stored word-id map (dictionary)
 stream_it = corpus_streamer(input_file, dictionary=dictionary)
-#for vector in stream_it: # load one vector into memory at a time
+# for vector in stream_it:  # load one vector into memory at a time
 #    print vector
 # Convert to sparse matrix
 sparse_corpus = [text for text in stream_it]
 # Store to disk, for later use collect statistics about all tokens
 corpora.MmCorpus.serialize('lsa_mini.mm',
-        sparse_corpus)
+                           sparse_corpus)
 ## LSA zone
 # load the dictionary saved before
 id2word = dictionary.load('lsa_mini.dict')
 # Now load the sparse matrix corpus from file into a (memory friendly) streaming
 # object.
-corpus=corpora.MmCorpus('lsa_mini.mm')
+corpus = corpora.MmCorpus('lsa_mini.mm')
 
 ## IF TfidfModel
-tfidf = gensim.models.TfidfModel(corpus) # step 1 -- initialize a model
+tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
 corpus = tfidf[corpus]
 ## FI TfidfModel
 # Compute the LSA vectors
-lsa=gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
-        num_topics=n_topics)
+lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
+                                      num_topics=n_topics)
 # Print the n topics in our corpus:
-#lsa.print_topics(n_topics)
-f=open("topics_file.txt","w")
-f.write("-------------------------------------------------\n")
-for t in lsa.show_topics(num_words=200):
-    f.write("%s\n" % str(t))
+# lsa.print_topics(n_topics)
+with open("topics_file.txt", "w") as f:
+    f.write("-------------------------------------------------\n")
+    for t in lsa.show_topics(num_words=200):
+        f.write("%s\n" % str(t))
+    f.write("-------------------------------------------------\n")
 
-f.write("-------------------------------------------------\n")
-f.close()
 # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
 corpus_lsa = lsa[corpus]
 # Stream sentences from file into a list of strings called "sentences"
-sentences=corpus_streamer(input_file, strings=True)
-n=0
-for pertenence, sentence in zip(corpus_lsa, sentences):
-    if n_docs <= 0:
-        #print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
-        p=[dict(pertenence)[x] if x in dict(pertenence) else 0.0
-            for x in range(n_topics)]
-        print("{} {}".format("".join(sentence.split("\t")[0].split()), "".join(str(p)[1:].strip("]").split(","))))
-    else:
-        if n<n_docs:
-            pertenence=[dict(pertenence)[x] if x in dict(pertenence) else 0.0
-                for x in range(n_topics)]
-            print("%s\t\t%s" % (pertenence, sentence))
-            n+=1
+sentences = corpus_streamer(input_file, strings=True)
+n = 0
+with open("vectors_file.txt", "w") as f:
+    for pertenence, sentence in zip(corpus_lsa, sentences):
+        if n_docs <= 0:
+            # print "%s\t\t%s" % (pertenence, sentence.split("\t")[0])
+            p = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
+                 for x in range(n_topics)]
+            f.write(
+                "{}\t{}".format("".join(sentence.split("\t")[0].split()), "".join(str(p)[1:].strip("]").split(","))))
         else:
-            break
\ No newline at end of file
+            if n < n_docs:
+                pertenence = [dict(pertenence)[x] if x in dict(pertenence) else 0.0
+                              for x in range(n_topics)]
+                f.write("%s\t\t%s" % (pertenence, sentence))
+                n += 1
+            else:
+                break
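For context, the script touched by this commit builds a bag-of-words dictionary, reweights it with TF-IDF, and folds the result into an LSI (LSA) model before writing topics and per-document vectors to files. Below is a minimal, self-contained sketch of that same bow -> TF-IDF -> LSI pipeline using the public gensim API; the file name docs.txt, the in-memory corpus, and num_topics=10 are illustrative assumptions, not part of the commit.

# Minimal sketch of the bow -> TF-IDF -> LSI fold-in performed by the script.
# Assumptions (not from the commit): gensim is installed, and "docs.txt" is a
# hypothetical file with one whitespace-tokenized document per line.
from gensim import corpora, models

with open("docs.txt") as fh:
    texts = [line.lower().split() for line in fh]      # toy corpus, kept in memory

dictionary = corpora.Dictionary(texts)                  # word <-> id map
bow_corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(bow_corpus)                   # reweight raw term counts
lsi = models.LsiModel(tfidf[bow_corpus],                # train LSI on TF-IDF vectors
                      id2word=dictionary,
                      num_topics=10)

# Per-document topic weights, analogous to what the script writes to vectors_file.txt
for doc_vec in lsi[tfidf[bow_corpus]]:
    print(doc_vec)

Unlike the sketch, which keeps everything in memory, the corpus_streamer class and MmCorpus.serialize call in the diff exist precisely to stream large corpora from disk instead.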