Showing 3 changed files with 419 additions and 0 deletions
sentence-representation/keyed2indexed.py
0 → 100644
import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB
import sys
import logging

# sys.argv[1]: Input embeddings model (w2v format)
# sys.argv[2]: Output directory for the indexed format
# sys.argv[3]: If present, the input model is read as text (default: binary)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

load_vectors = vDB.load_word2vec_format

# The model is assumed to be binary unless a third argument is given.
binary = len(sys.argv) < 4

embedding = load_vectors(sys.argv[1], binary=binary, encoding="latin-1")
logging.info("Indexing embeddings, this will take a while...\n")
wisse.keyed2indexed(embedding, sys.argv[2])
logging.info("Embeddings indexed, please verify the contents of the output directory...\n")
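
For reference, the same conversion can also be driven from the interpreter rather than the command line; a minimal sketch, assuming a small text-format word2vec model file named toy.vec and an output directory toy_embeddings/ (both names hypothetical):

import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB

# Load a small text-format word2vec model (file name is hypothetical).
embedding = vDB.load_word2vec_format("toy.vec", binary=False, encoding="latin-1")

# Write one .npy file per vocabulary word into the given directory.
wisse.keyed2indexed(embedding, "toy_embeddings/")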
sentence-representation/wisse.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7

import numpy as np
import logging
import os
from functools import partial
from scipy.sparse import bsr_matrix  # needed by load_sparse_bsr below

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

class wisse(object):
    """ Both the TfidfVectorizer and the word embedding model must be pretrained,
    either from the local sentence corpus or from model persistence.
    """
    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner="sum"):
        self.tokenize = vectorizer.build_tokenizer()
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis=0)
        else:
            self.comb = partial(np.sum, axis=0)


    def fit(self, X, y=None):  # scikit-learn estimator template
        if isinstance(X, list):
            self.sentences = X

        return self


    def transform(self, X):
        if isinstance(X, list):
            return self.fit(X)

        elif isinstance(X, str):
            return self.infer_sentence(X)


    def fit_transform(self, X, y=None):
        return self.transform(X)

    def infer_sentence(self, sent):
        ss = self.tokenize(sent)
        missing_bow = []
        missing_cbow = []
        series = {}

        if ss:
            self.weights, m = self.infer_tfidf_weights(ss)
        else:
            return None

        missing_bow += m

        for w in self.weights:
            try:
                series[w] = (self.weights[w], self.embedding[w])
            except KeyError:
                series[w] = None
                missing_cbow.append(w)
                continue
            except IndexError:
                continue

        if self.weights == {}: return None
        # Embedding the sentence... :
        sentence = np.array([series[w][1] for w in series if series[w] is not None])
        series = {}

        return missing_cbow, missing_bow, self.comb(sentence)


    def infer_tfidf_weights(self, sentence):
        existent = {}
        missing = []

        if not self.tfidf:
            for word in sentence:
                existent[word] = 1.0

            return existent, missing

        if self.pred_tfidf:
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][self.tfidf.vocabulary_[word]]
                except KeyError:
                    missing.append(word)
                    continue
        else:
            for word in sentence:
                try:
                    # Fall back to the fitted IDF weight alone.
                    weight = self.tfidf.idf_[self.tfidf.vocabulary_[word]]
                    existent[word] = weight if weight > 2 else 0.01
                except KeyError:
                    missing.append(word)
                    continue

        return existent, missing

    def __iter__(self):
        for s in self.sentences:
            yield self.transform(s)


def save_dense(directory, filename, array):
    directory = os.path.normpath(directory) + '/'
    # Only store words that are safe to use as plain file names.
    if filename.isalpha():
        np.save(directory + filename, array)
    else:
        return None


def load_dense(filename):
    return np.load(filename)


def load_sparse_bsr(filename):
    loader = np.load(filename)
    return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])


def save_sparse_bsr(directory, filename, array):
    # Note that the .npz extension is added automatically.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        array = array.tobsr()
        np.savez(directory + filename, data=array.data, indices=array.indices,
                 indptr=array.indptr, shape=array.shape)
    else:
        return None


class vector_space(object):
    """ Reads word vectors indexed as one file per word, either from a
    directory or from a .tar.gz archive of such a directory.
    """
    def __init__(self, directory, sparse=False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        if directory.endswith(".tar.gz"):
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            file_list = self.tar.getnames()
            self.words = {os.path.basename(word).replace(ext, ''): word
                          for word in file_list}
        else:
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            file_list = os.listdir(directory)
            self.words = {word.replace(ext, ''): directory + word
                          for word in file_list}


    def __getitem__(self, item):
        if self._tar:
            member = self.tar.getmember(self.words[item])
            word = self.tar.extractfile(member)
        else:
            word = self.words[item]

        if self.sparse:
            return load_sparse_bsr(word)
        else:
            return load_dense(word)


def keyed2indexed(keyed_model, output_dir="word_embeddings/", parallel=True, n_jobs=-1):
    """ Saves each word vector of a gensim KeyedVectors model (gensim < 4.0,
    which exposes the `vocab` attribute) as an individual .npy file.
    """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if parallel:
        from joblib import Parallel, delayed

        Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(save_dense)(output_dir, word, keyed_model[word])
            for word, _ in keyed_model.vocab.items())
    else:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])


class streamer(object):
    """ Yields the stripped lines of a text file, one sentence per line. """
    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        for s in open(self.file_name):
            yield s.strip()
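
A minimal usage sketch of the module, assuming a toy corpus and an indexed embedding directory (path hypothetical) produced beforehand by keyed2indexed:

import wisse
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = ["the cat sat on the mat", "dogs chase cats"]

# Fit local TFIDF weights from the toy corpus.
tfidf = TfidfVectorizer(min_df=1, lowercase=True).fit(sentences)

# Point the vector space at an indexed directory (hypothetical path).
embedding = wisse.vector_space("toy_embeddings/", sparse=False)

model = wisse.wisse(embeddings=embedding, vectorizer=tfidf,
                    tf_tfidf=True, combiner="sum")

# transform returns (missing_cbow, missing_bow, vector), or None for an
# empty sentence.
missing_cbow, missing_bow, vector = model.transform("the cat chased a dog")
print(vector.shape)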
sentence-representation/wisse_example.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import argparse
import cPickle as pickle  # under Python 3 use: import _pickle as pickle
import logging
import os
import wisse


load_vectors = vDB.load_word2vec_format

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="""This usage example shows
        sentence embedding with WISSE. The input is a text file with one
        sentence per row. The output file has two tab-separated columns: the
        line index of the sentence in the input file and the sentence vector
        representation.""")
    parser.add_argument("--idfmodel", help="""Input file containing IDF
                        pre-trained weights (pickled sklearn object). If not
                        provided, all word vector weights will be set to 1.0.
                        If 'local', tf-idf weights will be computed locally
                        from the input file.""",
                        default=None)
    parser.add_argument("--embedmodel", help="""Input file containing the word
                        embeddings model (binary and text are allowed).""",
                        required=True)
    parser.add_argument("--output", help="""Output file containing the sentence
                        embeddings.""", default="")
    parser.add_argument("--input", help="""Input file containing one sentence
                        per row.""", required=True)
    parser.add_argument("--comb", help="""Desired word vector combination for
                        sentence representation {sum, avg}.
                        (default = 'sum')""", default="sum")
    parser.add_argument("--suffix", nargs='?', help="""A suffix to be added
                        to the output file (default = '')""",
                        default="", required=False)
    parser.add_argument("--tfidf", help="""To predict complete TFIDF weights
                        ('tfidf') or use only partial IDFs ('idf').
                        (default = 'tfidf')""",
                        default="tfidf")
    parser.add_argument("--localw", help="""TFIDF word vector weighting
                        computed locally from the input file of sentences
                        {freq, binary, sublinear} (default = 'none').""",
                        default="none")
    parser.add_argument("--stop", help="""Toggles stripping stop words in
                        locally computed word vector weights.""",
                        action="store_true")
    parser.add_argument("--format", help="""The format of the embedding model
                        file: {binary, text, wisse}.
                        (default = 'binary')""", default="binary")
    args = parser.parse_args()


    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.info("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        logging.info("""Embedding model directory does not exist (EXIT):
            \n%s\n ...""" % args.embedmodel)
        exit()

    if (args.idfmodel and not args.idfmodel.startswith("local")
            and not os.path.isfile(args.idfmodel)):
        logging.info("""IDF model file does not exist (EXIT):
            \n%s\n ...""" % args.idfmodel)
        exit()
    if not os.path.isfile(args.input):
        logging.info("""Input file does not exist (EXIT):
            \n%s\n ...""" % args.input)
        exit()

    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    tfidf_name = (os.path.basename(args.idfmodel).split(".")[0]
                  if args.idfmodel else "none")

    if args.output != "":
        if os.path.dirname(args.output) != "":
            if not os.path.exists(os.path.dirname(args.output)):
                logging.info("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
                exit()
            else:
                output_name = args.output
        else:
            output_name = args.output
    else:
        suffix = "_".join([embedding_name,
                           args.comb,
                           args.tfidf,
                           "local" if args.idfmodel and args.idfmodel.startswith("local") else tfidf_name,
                           args.suffix]).strip("_")
        output_name = args.input + ".output_" + suffix


    if args.tfidf.startswith("tfidf"):
        pred_tfidf = True
    elif args.tfidf.startswith("idf"):
        pred_tfidf = False
    else:
        pred_tfidf = False
        tfidf = False

    vectorizer = TfidfVectorizer(min_df=1,
                                 encoding="latin-1",
                                 decode_error="replace",
                                 lowercase=True,
                                 binary=args.localw.startswith("bin"),
                                 sublinear_tf=args.localw.startswith("subl"),
                                 stop_words="english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if args.idfmodel and args.idfmodel.startswith("local"):
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)

    elif args.idfmodel and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)

    else:
        tfidf = False

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary=True,
                                     encoding="latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary=False,
                                     encoding="latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse=False)

    except Exception:
        logging.info(
            """Error while loading the word embedding model. Verify that the
            file is not broken (EXIT)...\n%s\n""" % args.embedmodel)
        exit()

    missing_bow = []   # Stores words missing from the TFIDF model
    missing_cbow = []  # Stores words missing from the W2V model
    sidx = 0           # The index of the sentence according to the input file
    logging.info("\n\nEmbedding sentences and saving them to the output file..\n%s\n" % output_name)

    # The sentence embedder only needs to be constructed once, not per sentence.
    series = wisse.wisse(embeddings=embedding, vectorizer=tfidf,
                         tf_tfidf=pred_tfidf, combiner=args.comb)

    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:
                continue

            # At this point you can use the embedding 'vector' for any
            # application, as it is a numpy array. You can also simply save
            # the vectors in text format, as follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                        formatter={'float_kind': lambda x: "%.6f" % x},
                        max_line_width=20000).strip(']').strip('[')))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)

    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)

        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")
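
Once a run has finished, the output file can be read back into a dense matrix; a short sketch, assuming an output file name of the kind the script generates (the name used here is hypothetical):

import numpy as np

# Each row of the output file is "<sentence index>\t<space-separated floats>".
indices, vectors = [], []
with open("sentences.txt.output_toy_sum_tfidf_local") as f:  # hypothetical name
    for line in f:
        idx, vec = line.split("\t")
        indices.append(int(idx))
        vectors.append(np.array(vec.split(), dtype=float))

X = np.vstack(vectors)
print(X.shape)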