Showing
243 changed files
with
129 additions
and
0 deletions
BO_power_law_PDF.eps
0 → 100644
This diff is collapsed. Click to expand it.
BO_power_law_PDF.pdf
0 → 100644
No preview for this file type
BO_power_law_PDF.png
0 → 100644

34 KB
lsa_example.py
0 → 100644
1 | +"""Pirated example from the Gensim library (an NLP-specialized tool): | ||
2 | +https://radimrehurek.com/gensim/tut2.html | ||
3 | +https://radimrehurek.com/gensim/wiki.html#latent-semantic-analysis | ||
4 | + | ||
5 | +Ignacio Arroyo | ||
6 | +""" | ||
7 | + | ||
8 | +import gensim | ||
9 | +import logging | ||
10 | +from six import iteritems | ||
11 | +from gensim import corpora | ||
12 | +import argparse | ||
13 | + | ||
14 | +from pdb import set_trace as st # Debug the program step by step calling st() | ||
15 | + # anywhere. | ||
class corpus_streamer(object):
    """Stream a raw text file one document (one line) at a time.

    The file is re-opened on every call to ``__iter__``, so one instance can
    be iterated several times without keeping the corpus in memory.

    The output format is selected by the constructor arguments:

    * ``dictionary`` truthy and ``strings`` falsy: every line is lower-cased,
      split on whitespace and handed to ``dictionary.doc2bow``; the value
      returned by ``doc2bow`` is yielded.
    * ``strings`` truthy and ``dictionary`` falsy: every line is yielded as a
      stripped, lower-cased string.

    When both or neither of them are set, iteration yields nothing.

    Args:
        file_name: Path of the text file; one document per line, tokens
            separated by whitespace.
        dictionary: Object exposing ``doc2bow(tokens)``, or ``None``.
        strings: Truthy to stream plain strings instead of bag-of-words.
    """

    def __init__(self, file_name, dictionary=None, strings=None):
        self.file_name = file_name
        self.dictionary = dictionary
        self.strings = strings

    def __iter__(self):
        # The output mode cannot change while a pass is running, so decide
        # it once instead of once per line.
        as_bow = bool(self.dictionary) and not self.strings
        as_text = bool(self.strings) and not self.dictionary
        if not (as_bow or as_text):
            # Ambiguous configuration: keep the historical behaviour of
            # producing an empty stream (and do not even open the file).
            return

        # ``with`` closes the handle when the file is exhausted and also when
        # the consumer stops early and the generator is finalised.
        with open(self.file_name) as handle:
            for line in handle:
                if as_bow:
                    yield self.dictionary.doc2bow(line.lower().split())
                else:
                    yield line.strip().lower()
# Log everything the program (and gensim) reports.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

parser = argparse.ArgumentParser()
parser.add_argument("--n_topics", help="Number of eigenvectors picked up.",
                    default=2, type=int)
parser.add_argument("--input", help="Input file to perform LSA.",
                    required=True)
# Optional and backward compatible: the default (0) keeps the historical
# behaviour of printing every document in the compact format.
parser.add_argument("--n_docs",
                    help="Print only the first N documents (with their "
                         "text); 0 or less prints all of them.",
                    default=0, type=int)

args = parser.parse_args()

n_topics = args.n_topics
n_docs = args.n_docs
input_file = args.input

# A little stopwords list
stoplist = set('for a of the and to in _ [ ]'.split())

# --- Dictionary ------------------------------------------------------------
# Do not load the text corpus into memory, but stream it!
fille = corpus_streamer(input_file, strings=True)
dictionary = corpora.Dictionary(line.lower().split() for line in fille)
# Remove stop words and words that appear in only one document.
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# Remove gaps in the id sequence left by the removed words.
dictionary.compactify()
# Store the dictionary
dictionary.save('lsa_mini.dict')

# --- Bag-of-words corpus ---------------------------------------------------
# Stream the documents as sparse vectors straight to disk; the serializer
# only needs an iterable, so the corpus is never held in memory as a list.
stream_it = corpus_streamer(input_file, dictionary=dictionary)
corpora.MmCorpus.serialize('lsa_mini.mm', stream_it)

# --- LSA zone ----------------------------------------------------------------
# Load the dictionary saved before (``load`` is a classmethod).
id2word = corpora.Dictionary.load('lsa_mini.dict')
# Load the sparse matrix corpus from file into a (memory friendly) streaming
# object.
corpus = corpora.MmCorpus('lsa_mini.mm')

# Re-weight the bag-of-words counts with TF-IDF before the decomposition.
tfidf = gensim.models.TfidfModel(corpus)
corpus = tfidf[corpus]

# Compute the LSA vectors
lsa = gensim.models.lsimodel.LsiModel(corpus, id2word=id2word,
                                      num_topics=n_topics)

# Write the topics of the corpus to a text file.  Text mode ("w") is used
# because ``str`` is written; "wb" raises TypeError on Python 3.
with open("topics_file.txt", "w") as f:
    f.write("-------------------------------------------------\n")
    for t in lsa.show_topics():
        f.write("%s\n" % str(t))
    f.write("-------------------------------------------------\n")

# Create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
corpus_lsa = lsa[corpus]
# Stream the raw sentences again, in step with their LSA vectors.
sentences = corpus_streamer(input_file, strings=True)

topic_ids = range(n_topics)
for n, (pertenence, sentence) in enumerate(zip(corpus_lsa, sentences)):
    # The model returns sparse (topic_id, weight) pairs; expand them into a
    # dense vector with 0.0 for every missing topic.
    weights = dict(pertenence)
    dense = [weights.get(x, 0.0) for x in topic_ids]
    if n_docs <= 0:
        # Compact format: "<first tab-separated field, whitespace removed>
        # <w0> <w1> ...".  ``repr`` matches how ``str(list)`` renders items.
        print("%s %s" % ("".join(sentence.split("\t")[0].split()),
                         " ".join(repr(w) for w in dense)))
    elif n < n_docs:
        print("%s\t\t%s" % (dense, sentence))
    else:
        break
120 | + | ||
121 | + | ||
122 | + | ||
123 | +# ============================== Homework ====================================== | ||
124 | +# Modify the program for doing this for a sample of the English Wikipedia. | ||
125 | +# Compute LSA for 20 topics and print the first 10 topics. | ||
126 | +# Take care of avoiding loading and printing documents of a large corpus, so | ||
127 | +# change the number of documents to print or sample the entire set randomly and | ||
128 | +# print a subset. | ||
129 | +# ============================================================================== |
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_002.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_003.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_004.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_005.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_006.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_007.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_008.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_009.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_010.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_011.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_012.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_013.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_014.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_015.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_016.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_017.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_018.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_019.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_020.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_021.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_022.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_023.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_024.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_025.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_026.cls
0 → 100644
This diff is collapsed. Click to expand it.
one-by-one/vectors/gus_originales_027.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_028.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_029.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_030.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_031.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_032.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_033.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_034.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_035.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_036.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_037.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_038.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_039.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_040.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_041.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_042.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_043.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_044.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_045.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_046.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_047.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_048.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_049.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_050.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_051.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_052.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_053.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_054.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_055.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_056.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_057.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_058.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_059.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_060.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_061.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_062.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_063.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_064.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_065.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_066.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_067.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_068.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_069.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_070.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_071.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_072.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_073.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_074.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_075.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_076.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_077.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_078.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_079.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_080.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_081.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_082.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_083.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_084.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_085.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_086.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_087.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_088.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_089.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_090.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_091.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_092.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_093.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_094.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_095.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_096.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_097.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_098.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_099.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_100.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_101.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_102.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_103.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_104.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_105.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_106.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_107.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_108.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_109.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_110.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_111.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_112.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_113.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_114.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_115.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_116.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_117.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_118.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_119.cls
0 → 100644
This diff could not be displayed because it is too large.
one-by-one/vectors/gus_originales_120.cls
0 → 100644
This diff could not be displayed because it is too large.
power_law_plot.py
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment