Ignacio Arroyo

added all vectors and eigenvectors 2-120 groups

Showing 243 changed files with 129 additions and 0 deletions
This diff is collapsed. Click to expand it.
No preview for this file type
"""Example adapted from the Gensim library (an NLP-specialized tool):
2 +https://radimrehurek.com/gensim/tut2.html
3 +https://radimrehurek.com/gensim/wiki.html#latent-semantic-analysis
4 +
5 +Ignacio Arroyo
6 +"""
7 +
8 +import gensim
9 +import logging
10 +from six import iteritems
11 +from gensim import corpora
12 +import argparse
13 +
14 +from pdb import set_trace as st # Debug the program step by step calling st()
15 + # anywhere.
class corpus_streamer(object):
    """Stream an input raw text file row by row (memory friendly).

    Each line is treated as one document. Depending on construction the
    iterator yields either the bag-of-words vector of the line (when a
    ``dictionary`` is given) or the lower-cased, stripped line itself
    (when ``strings`` is truthy). If both or neither are set, nothing
    is yielded (preserved from the original behavior).
    """

    def __init__(self, file_name, dictionary=None, strings=None):
        # Path to the corpus file; one document per line.
        self.file_name = file_name
        # gensim Dictionary mapping tokens to ids (for doc2bow vectors).
        self.dictionary = dictionary
        # Truthy flag: yield plain strings instead of BoW vectors.
        self.strings = strings

    def __iter__(self):
        # Context manager closes the handle when iteration finishes
        # (the original called open() and leaked the file object).
        with open(self.file_name) as handle:
            for line in handle:
                # Tokens are separated by whitespace.
                if self.dictionary and not self.strings:
                    yield self.dictionary.doc2bow(line.lower().split())
                elif not self.dictionary and self.strings:
                    yield line.strip().lower()
# ----------------------------------------------------------------------------
# Script: build an LSA (Latent Semantic Indexing) model over a corpus stored
# as one document per line, then print each document's projection onto the
# first n_topics latent topics.
# ----------------------------------------------------------------------------

# Log everything; gensim reports its progress through the logging module.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("--n_topics", help="Number of eigenvectors picked up.",
                    default=2, type=int)
parser.add_argument("--input", help="Input file to perform LSA.",
                    required=True)
args = parser.parse_args()

n_topics = args.n_topics
n_docs = 0      # 0 (or less) means: print the projection of every document.
input_file = args.input

# A little stopwords list.
stoplist = set('for a of the and to in _ [ ]'.split())

# Build the token->id map by streaming the corpus, never loading it whole.
dictionary = corpora.Dictionary(
    line.lower().split()
    for line in corpus_streamer(input_file, strings=True))

# Remove stop words and words that appear only once in the whole corpus.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items()
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# Remove gaps in the id sequence after words that were removed.
dictionary.compactify()
# Store the word-id map for later runs.
dictionary.save('lsa_mini.dict')

# Serialize the bag-of-words corpus to disk. Passing the streaming object
# directly keeps memory flat (the original first materialized the whole
# corpus into a list, defeating the point of streaming).
stream_it = corpus_streamer(input_file, dictionary=dictionary)
corpora.MmCorpus.serialize('lsa_mini.mm', stream_it)

## LSA zone
# Load the sparse matrix corpus from file into a (memory friendly)
# streaming object.
corpus = corpora.MmCorpus('lsa_mini.mm')

## IF TfidfModel — re-weight the raw counts with TF-IDF before the SVD.
tfidf = gensim.models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus = tfidf[corpus]
## FI TfidfModel

# Compute the LSA vectors.
lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=dictionary,
                                      num_topics=n_topics)

# Dump the discovered topics. Text mode ("w", not "wb") because we write
# str; the context manager guarantees the handle is closed.
with open("topics_file.txt", "w") as f:
    f.write("-------------------------------------------------\n")
    for t in lsa.show_topics():
        f.write("%s\n" % str(t))
    f.write("-------------------------------------------------\n")

# Create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi.
corpus_lsa = lsa[corpus]

# Stream the sentences once more to pair each document with its projection.
sentences = corpus_streamer(input_file, strings=True)
for n, (pertenence, sentence) in enumerate(zip(corpus_lsa, sentences)):
    # Dense topic-weight vector: topics missing from the sparse LSI result
    # get weight 0.0 (dict built once per document, not once per topic).
    weights = dict(pertenence)
    p = [weights.get(x, 0.0) for x in range(n_topics)]
    if n_docs <= 0:
        print("%s %s" % ("".join(sentence.split("\t")[0].split()),
                         "".join(str(p)[1:].strip("]").split(","))))
    elif n < n_docs:
        print("%s\t\t%s" % (p, sentence))
    else:
        break


# ============================== Homework ======================================
# Modify the program for doing this for a sample of the English Wikipedia.
# Compute LSA for 20 topics and print the first 10 topics.
# Take care of avoiding loading and printing documents of a large corpus, so
# change the number of documents to print or sample the entire set randomly and
# print a subset.
# ==============================================================================
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff is collapsed. Click to expand it.