Carlos-Francisco Méndez-Cruz

LSA word embedding examples

c1: Human machine interface for ABC computer applications
c2: A survey of user opinion of computer system response time
c3: The EPS user interface management system
c4: System and human system engineering testing of EPS
c5: Relation of user perceived response time to error measurement
m1: The generation of random, binary, ordered trees
m2: The intersection graph of paths in trees
m3: Graph minors IV: Widths of trees and well-quasi-ordering
m4: Graph minors: A survey
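These nine document titles are the toy corpus used in the Gensim LSA tutorial linked in the script below. A minimal sketch, and an assumption rather than part of the repository, of how to save them to the lsa_example.csv file mentioned in the script's commented-out defaults, one document per line:

# Hypothetical helper: write the nine example documents to lsa_example.csv,
# one per line, so they can be passed to the script below via --input.
docs = [
    "c1: Human machine interface for ABC computer applications",
    "c2: A survey of user opinion of computer system response time",
    "c3: The EPS user interface management system",
    "c4: System and human system engineering testing of EPS",
    "c5: Relation of user perceived response time to error measurement",
    "m1: The generation of random, binary, ordered trees",
    "m2: The intersection graph of paths in trees",
    "m3: Graph minors IV: Widths of trees and well-quasi-ordering",
    "m4: Graph minors: A survey",
]
with open("lsa_example.csv", "w") as out:
    out.write("\n".join(docs) + "\n")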
1 +"""Pirated example from Gensim library (a NLP specialized tool):
2 +https://radimrehurek.com/gensim/tut2.html
3 +https://radimrehurek.com/gensim/wiki.html#latent-semantic-analysis
4 +
5 +Ignacio Arroyo
6 +"""

import argparse
import logging

import gensim
from gensim import corpora

from pdb import set_trace as st  # Debug the program step by step by calling
                                 # st() anywhere.
class corpus_streamer(object):
    """Streams the input raw text file row by row.

    With a dictionary, yields bag-of-words vectors; with strings=True,
    yields the raw lowercased lines.
    """
    def __init__(self, file_name, dictionary=None, strings=None):
        self.file_name = file_name
        self.dictionary = dictionary
        self.strings = strings

    def __iter__(self):
        with open(self.file_name) as fh:
            for line in fh:
                # Assume one document per line, tokens separated by whitespace.
                if self.dictionary and not self.strings:
                    yield self.dictionary.doc2bow(line.lower().split())
                elif not self.dictionary and self.strings:
                    yield line.strip().lower()
# Log everything the program does.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("--n_topics", default=2, type=int,
                    help="Number of latent topics (eigenvectors) to keep.")
parser.add_argument("--input", required=True,
                    help="Input file to perform LSA on.")

args = parser.parse_args()

n_topics = args.n_topics
n_docs = 0   # 0 (or less) means: print every document at the end
input_file = args.input
#input_file='/medargsia/iarroyof/Volumen de 384 GB/data/GUs_textform_noPeriods.txt'
#input_file='lsa_example.csv'
#input_file='wiki_sample/wiki_75_AA.txt.cln'
#input_file='wiki_sample/wiki_77_AA.txt'
# A small stop-word list.
stoplist = set('for a of the and to in _ [ ]'.split())
# Do not load the text corpus into memory; stream it instead.
text_stream = corpus_streamer(input_file, strings=True)
dictionary = corpora.Dictionary(line.lower().split() for line in text_stream)
# Remove stop words and words that appear only once.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# Remove gaps in the id sequence left by the removed words.
dictionary.compactify()
# Store the dictionary.
dictionary.save('lsa_mini.dict')
# Instead of reading all sentences from the file into a list of strings, use a
# streaming object that yields one bag-of-words vector at a time, built with
# the word-id map (dictionary) created above.
stream_it = corpus_streamer(input_file, dictionary=dictionary)
#for vector in stream_it:  # load one vector into memory at a time
#    print(vector)
# Materialize the bag-of-words (sparse) corpus.
sparse_corpus = [text for text in stream_it]
# Store it to disk for later use.
corpora.MmCorpus.serialize('lsa_mini.mm', sparse_corpus)
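# MmCorpus.serialize writes the vectors in Matrix Market format together with
# an index, so individual documents can later be read back without loading the
# whole corpus into memory.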
## LSA zone
# Load the dictionary saved before.
id2word = corpora.Dictionary.load('lsa_mini.dict')
# Now load the sparse corpus from file into a (memory-friendly) streaming
# object.
corpus = corpora.MmCorpus('lsa_mini.mm')

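# Optional TF-IDF step: re-weight the raw term counts so that terms appearing
# in many documents get lower weight before the SVD.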
## IF TfidfModel
tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus = tfidf[corpus]
## FI TfidfModel
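# LsiModel projects the (TF-IDF weighted) bag-of-words vectors onto the
# n_topics directions with the largest singular values (a truncated SVD).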
# Compute the LSA model.
lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=id2word,
                                      num_topics=n_topics)
# Print the n topics found in our corpus:
#lsa.print_topics(n_topics)
f = open("topics_file.txt", "w")
f.write("-------------------------------------------------\n")
for t in lsa.show_topics():
    f.write("%s\n" % str(t))

f.write("-------------------------------------------------\n")
f.close()
# Create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi.
corpus_lsa = lsa[corpus]
# Stream the sentences from the file again, this time as raw strings.
sentences = corpus_streamer(input_file, strings=True)
n = 0
for pertenence, sentence in zip(corpus_lsa, sentences):
    if n_docs <= 0:
        # Print every document: its label followed by its dense topic vector.
        #print("%s\t\t%s" % (pertenence, sentence.split("\t")[0]))
        p = [dict(pertenence).get(x, 0.0) for x in range(n_topics)]
        print("%s %s" % ("".join(sentence.split("\t")[0].split()),
                         " ".join(str(w) for w in p)))
    else:
        # Print only the first n_docs documents.
        if n < n_docs:
            pertenence = [dict(pertenence).get(x, 0.0) for x in range(n_topics)]
            print("%s\t\t%s" % (pertenence, sentence))
            n += 1
        else:
            break
# ============================== Homework ======================================
# Modify the program to do the same for a sample of the English Wikipedia.
# Compute LSA for 20 topics and print the first 10 topics.
# Take care to avoid loading and printing every document of a large corpus:
# either change the number of documents to print, or sample the whole set
# randomly and print only a subset.
# ==============================================================================
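A minimal sketch of one possible solution, assuming the Wikipedia sample has already been cleaned, streamed, and serialized to lsa_mini.dict and lsa_mini.mm exactly as in the script above; the topic counts and the random document sample are choices of this sketch, not part of the original program:

# Homework sketch (an assumption, not the original author's solution): 20
# topics, print only the first 10, and print the LSA vectors of a small
# random sample of documents instead of the whole corpus.
import random

import gensim
from gensim import corpora

n_topics = 20          # number of LSA dimensions asked for in the homework
n_print_topics = 10    # how many topics to actually print
n_print_docs = 20      # how many documents to print, chosen at random

dictionary = corpora.Dictionary.load('lsa_mini.dict')
corpus = corpora.MmCorpus('lsa_mini.mm')   # Wikipedia sample serialized as above

tfidf = gensim.models.TfidfModel(corpus)
lsa = gensim.models.lsimodel.LsiModel(tfidf[corpus], id2word=dictionary,
                                      num_topics=n_topics)

# Print only the first 10 of the 20 topics.
for t in lsa.show_topics(num_topics=n_print_topics):
    print(t)

# Print the LSA vectors of a random subset of documents only.
doc_ids = random.sample(range(len(corpus)), min(n_print_docs, len(corpus)))
for i in doc_ids:
    vec = dict(lsa[tfidf[corpus[i]]])
    print(i, [vec.get(x, 0.0) for x in range(n_topics)])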