Showing 5 changed files with 138 additions and 0 deletions
sentence-representation/lsa_example.csv
0 → 100644
+c1: Human machine interface for ABC computer applications
+c2: A survey of user opinion of computer system response time
+c3: The EPS user interface management system
+c4: System and human system engineering testing of EPS
+c5: Relation of user perceived response time to error measurement
+m1: The generation of random, binary, ordered trees
+m2: The intersection graph of paths in trees
+m3: Graph minors IV: Widths of trees and well-quasi-ordering
+m4: Graph minors: A survey
sentence-representation/lsa_example.py
0 → 100644
1 | +"""Pirated example from Gensim library (a NLP specialized tool): | ||
2 | +https://radimrehurek.com/gensim/tut2.html | ||
3 | +https://radimrehurek.com/gensim/wiki.html#latent-semantic-analysis | ||
4 | + | ||
5 | +Ignacio Arroyo | ||
6 | +""" | ||
+
+import argparse
+import logging
+
+import gensim
+from gensim import corpora
+
+from pdb import set_trace as st  # Debug the program step by step by calling
+                                 # st() anywhere.
+class corpus_streamer(object):
+    """Streams the input raw text file row by row (one document per line)."""
+    def __init__(self, file_name, dictionary=None, strings=None):
+        self.file_name = file_name
+        self.dictionary = dictionary
+        self.strings = strings
+
+    def __iter__(self):
+        with open(self.file_name) as fin:
+            for line in fin:
+                # One document per line, tokens separated by whitespace.
+                if self.dictionary and not self.strings:
+                    yield self.dictionary.doc2bow(line.lower().split())
+                elif not self.dictionary and self.strings:
+                    yield line.strip().lower()
+# Log the whole run.
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
+                    level=logging.INFO)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--n_topics", help="Number of topics (eigenvectors) to keep.",
+                    default=2, type=int)
+parser.add_argument("--input", help="Input file to perform LSA on.",
+                    required=True)
+
+args = parser.parse_args()
+
+n_topics = args.n_topics
+n_docs = 0  # 0 means: print the topic weights for every document.
+input_file = args.input
+#input_file='/medargsia/iarroyof/Volumen de 384 GB/data/GUs_textform_noPeriods.txt'
+#input_file='lsa_example.csv'
+#input_file='wiki_sample/wiki_75_AA.txt.cln'
+#input_file='wiki_sample/wiki_77_AA.txt'
+
+# A small stopword list.
+stoplist = set('for a of the and to in _ [ ]'.split())
+# Do not load the text corpus into memory -- stream it!
+text_stream = corpus_streamer(input_file, strings=True)
+dictionary = corpora.Dictionary(line.split() for line in text_stream)
+# Remove stopwords and words that appear only once.
+stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
+            if stopword in dictionary.token2id]
+once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
+            if docfreq == 1]
+dictionary.filter_tokens(stop_ids + once_ids)
+# Remove gaps in the id sequence left by the removed words.
+dictionary.compactify()
+# Store the dictionary.
+dictionary.save('lsa_mini.dict')
+# Stream the sentences again, now mapping each one to a bag-of-words vector
+# through the stored word-id map (dictionary).
+stream_it = corpus_streamer(input_file, dictionary=dictionary)
+#for vector in stream_it:  # load one vector into memory at a time
+#    print(vector)
+# Materialize the sparse (bag-of-words) corpus.
+sparse_corpus = [bow for bow in stream_it]
+# Store it to disk for later use.
+corpora.MmCorpus.serialize('lsa_mini.mm', sparse_corpus)
+
+## LSA zone
+# Load the dictionary saved before.
+id2word = corpora.Dictionary.load('lsa_mini.dict')
+# Now load the sparse corpus from file into a (memory-friendly) streaming
+# object.
+corpus = corpora.MmCorpus('lsa_mini.mm')
+
+## IF TfidfModel: optionally re-weight the raw counts with TF-IDF.
+tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
+corpus = tfidf[corpus]
+## FI TfidfModel
+# Compute the LSA vectors.
+lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=id2word,
+                                      num_topics=n_topics)
+# Print the n topics in our corpus:
+#lsa.print_topics(n_topics)
+with open("topics_file.txt", "w") as f:
+    f.write("-------------------------------------------------\n")
+    for t in lsa.show_topics():
+        f.write("%s\n" % str(t))
+    f.write("-------------------------------------------------\n")
+# Create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi.
+corpus_lsa = lsa[corpus]
+# Stream the sentences from file once more, this time as raw strings.
+sentences = corpus_streamer(input_file, strings=True)
+n = 0
+for pertenence, sentence in zip(corpus_lsa, sentences):
+    # Densify the topic weights, filling absent topics with 0.0.
+    weights = dict(pertenence)
+    p = [weights.get(x, 0.0) for x in range(n_topics)]
+    if n_docs <= 0:
+        # Print every document: its label followed by the topic weights.
+        print("%s %s" % ("".join(sentence.split("\t")[0].split()),
+                         " ".join(str(w) for w in p)))
+    elif n < n_docs:
+        # Print only the first n_docs documents, with the full sentence.
+        print("%s\t\t%s" % (p, sentence))
+        n += 1
+    else:
+        break
+
+# ============================== Homework ======================================
+# Modify the program so that it runs on a sample of the English Wikipedia.
+# Compute LSA for 20 topics and print the first 10 topics.
+# Avoid loading and printing every document of such a large corpus: either
+# lower the number of documents to print, or sample the whole set randomly
+# and print only a subset.
+# ==============================================================================
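As a starting point for the homework, here is a minimal sketch (not part of the commit). It reuses the lsa_mini.dict and lsa_mini.mm artifacts written by the script above; the 10-document sample size is an illustrative assumption, and for the actual homework the dictionary and corpus should be built from a Wikipedia sample rather than the toy CSV.

    import random

    from gensim import corpora, models

    # Reuse the artifacts the script above wrote to disk. For the homework,
    # build these from a Wikipedia sample instead (assumption: the same
    # preprocessing pipeline is applied to the Wikipedia text).
    id2word = corpora.Dictionary.load('lsa_mini.dict')
    corpus = corpora.MmCorpus('lsa_mini.mm')

    # LSA with 20 topics; show only the first 10 of them.
    lsa = models.LsiModel(corpus, id2word=id2word, num_topics=20)
    for topic in lsa.show_topics(num_topics=10):
        print(topic)

    # Do not print every document of a large corpus: sample a small subset
    # (here 10 documents, an arbitrary choice) and print only those.
    corpus_lsa = lsa[corpus]
    sample_ids = set(random.sample(range(len(corpus)), min(10, len(corpus))))
    for i, doc in enumerate(corpus_lsa):
        if i in sample_ids:
            print(doc)

On the toy corpus, running python lsa_example.py --input lsa_example.csv --n_topics 2 should separate the c* (human-computer interaction) titles from the m* (graph theory) titles along the two topics, as in the Gensim tutorial linked in the docstring.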
sentence-representation/lsa_mini.dict
0 → 100644
(binary file, not shown)
sentence-representation/lsa_mini.mm
0 → 100644
(large file, diff not shown)
sentence-representation/lsa_mini.mm.index
0 → 100644
(binary file, not shown)