carlosmendeznlp

Add WISSE scripts

import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB
import sys
import logging

# sys.argv[1]: Input embeddings model (word2vec format)
# sys.argv[2]: Output directory for the indexed format
# sys.argv[3]: If present, the model is loaded as text (default: binary)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

load_vectors = vDB.load_word2vec_format

# Any third command-line argument switches the loader to text format.
binary = len(sys.argv) < 4

embedding = load_vectors(sys.argv[1], binary=binary, encoding="latin-1")
logging.info("Indexing embeddings, this will take a while...\n")
wisse.keyed2indexed(embedding, sys.argv[2])
logging.info("Embeddings indexed, please verify the contents of the output directory...\n")
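
# Usage sketch (script and file names are illustrative):
#   python index_embeddings.py GoogleNews-vectors.bin indexed_vectors/
# Pass any third argument to load a text-format model instead:
#   python index_embeddings.py vectors.w2v.txt indexed_vectors/ text
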
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python 2.7

import numpy as np
import logging
import os
from functools import partial
from scipy.sparse import bsr_matrix  # required by load_sparse_bsr()/save_sparse_bsr()

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


class wisse(object):
    """ Both the TfidfVectorizer and the word embedding model must be pretrained,
    either on the local sentence corpus or loaded from model persistence.
    """
    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner="sum"):
        # Fall back to plain whitespace tokenization when no vectorizer is given.
        self.tokenize = vectorizer.build_tokenizer() if vectorizer else str.split
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        # Word vectors are combined by averaging or by summation (the default).
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis=0)
        else:
            self.comb = partial(np.sum, axis=0)


    def fit(self, X, y=None):  # Scikit-learn-style template
        if isinstance(X, list):
            self.sentences = X

        return self


    def transform(self, X):
        # A list of sentences is stored for iteration; a single string is
        # embedded directly.
        if isinstance(X, list):
            return self.fit(X)
        elif isinstance(X, str):
            return self.infer_sentence(X)


    def fit_transform(self, X, y=None):
        return self.transform(X)


    def infer_sentence(self, sent):
        ss = self.tokenize(sent)
        missing_bow = []
        missing_cbow = []
        series = {}

        if ss:
            self.weights, m = self.infer_tfidf_weights(ss)
        else:
            return None

        missing_bow += m

        for w in self.weights:
            try:
                series[w] = (self.weights[w], self.embedding[w])
            except KeyError:
                series[w] = None
                missing_cbow.append(w)
                continue
            except IndexError:
                continue

        if self.weights == {}: return None
        # Embed the sentence as the weighted combination of its word vectors:
        sentence = np.array([series[w][0] * series[w][1]
                                for w in series if series[w] is not None])
        series = {}

        return missing_cbow, missing_bow, self.comb(sentence)


    def infer_tfidf_weights(self, sentence):
        existent = {}
        missing = []

        if not self.tfidf:
            # No vectorizer available: weight every word uniformly.
            for word in sentence:
                existent[word] = 1.0

            return existent, missing

        if self.pred_tfidf:
            # Predict complete TF-IDF weights for the sentence.
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][self.tfidf.vocabulary_[word]]
                except KeyError:
                    missing.append(word)
                    continue
        else:
            # Use the pretrained IDF weights only.
            for word in sentence:
                try:
                    weight = self.tfidf.idf_[self.tfidf.vocabulary_[word]]
                    existent[word] = weight if weight > 2 else 0.01
                except KeyError:
                    missing.append(word)
                    continue

        return existent, missing


    def __iter__(self):
        for s in self.sentences:
            yield self.transform(s)
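
# Usage sketch (illustrative; assumes a fitted TfidfVectorizer `vec` and a
# gensim KeyedVectors model `emb`):
#   model = wisse(embeddings=emb, vectorizer=vec, tf_tfidf=True, combiner="sum")
#   missing_cbow, missing_bow, vector = model.transform("a sample sentence")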


def save_dense(directory, filename, array):
    # Only save words made of alphabetic characters, to keep file names safe.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        np.save(directory + filename, array)
    else:
        return None


def load_dense(filename):
    return np.load(filename)
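
# Round-trip sketch (illustrative names): np.save appends ".npy", so a vector
# saved for the word "apple" is read back as:
#   save_dense("word_embeddings/", "apple", np.random.rand(300))
#   v = load_dense("word_embeddings/apple.npy")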


def load_sparse_bsr(filename):
    loader = np.load(filename)
    return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                        shape=loader['shape'])


def save_sparse_bsr(directory, filename, array):
    # Note that the .npz extension is added automatically by np.savez.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        array = array.tobsr()
        np.savez(directory + filename, data=array.data, indices=array.indices,
                 indptr=array.indptr, shape=array.shape)
    else:
        return None


class vector_space(object):
    def __init__(self, directory, sparse=False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        if directory.endswith(".tar.gz"):
            # Indexed vectors packed into a tar archive.
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            file_list = self.tar.getnames()
            self.words = {os.path.basename(word).replace(ext, ''): word
                            for word in file_list}
        else:
            # Indexed vectors stored as one file per word inside a directory.
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            file_list = os.listdir(directory)
            self.words = {word.replace(ext, ''): directory + word
                            for word in file_list}


    def __getitem__(self, item):
        # Resolve the word to its file (or tar member), then load it lazily.
        if self._tar:
            member = self.tar.getmember(self.words[item])
            word = self.tar.extractfile(member)
        else:
            word = self.words[item]

        if self.sparse:
            return load_sparse_bsr(word)
        else:
            return load_dense(word)
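
# Usage sketch: read back a directory produced by keyed2indexed (the path and
# word are illustrative):
#   space = vector_space("word_embeddings/", sparse=False)
#   v = space["apple"]   # loads word_embeddings/apple.npy on demand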


def keyed2indexed(keyed_model, output_dir="word_embeddings/", parallel=True, n_jobs=-1):
    # Write one .npy file per vocabulary word so vectors can be loaded lazily.
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if parallel:
        from joblib import Parallel, delayed

        Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(save_dense)(output_dir, word, keyed_model[word])
            for word, _ in keyed_model.vocab.items())
    else:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])


class streamer(object):
    """Iterate over the lines of a text file, one stripped sentence at a time."""
    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        with open(self.file_name) as f:
            for s in f:
                yield s.strip()
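
# Usage sketch ("corpus.txt" is an illustrative file name):
#   for sentence in streamer("corpus.txt"):
#       print(sentence)
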
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python 2.7
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import argparse
import cPickle as pickle
import logging
import os
import wisse


load_vectors = vDB.load_word2vec_format

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="""This usage example shows
        sentence embedding with WISSE. The input is a text file containing one
        sentence per row. The output file has two tab-separated columns: the
        line index of the sentence in the input file and its vector
        representation.""")
    parser.add_argument("--idfmodel", help="""Input file containing IDF
                                        pre-trained weights (a pickled sklearn
                                        object). If not provided, all word
                                        vector weights will be set to 1.0. If
                                        'local', TF-IDF weights will be computed
                                        locally from the input file.""",
                                        default=None)
    parser.add_argument("--embedmodel", help="""Input file containing the word
                                        embeddings model (binary and text
                                        formats are allowed).""", required=True)
    parser.add_argument("--output", help="""Output file containing the sentence
                                        embeddings.""", default="")
    parser.add_argument("--input", help="""Input file containing one sentence
                                        per row.""", required=True)
    parser.add_argument("--comb", help="""Desired word vector combination for
                                        the sentence representation {sum, avg}
                                        (default = 'sum').""", default="sum")
    parser.add_argument("--suffix", nargs='?', help="""A suffix to be added to
                                        the output file name (default = '').""",
                                        default="", required=False)
    parser.add_argument("--tfidf", help="""Whether to predict complete TF-IDF
                                        weights ('tfidf') or to use partial IDF
                                        weights only ('idf')
                                        (default = 'tfidf').""",
                                        default="tfidf")
    parser.add_argument("--localw", help="""How TF-IDF word vector weights are
                                        computed locally from the input file of
                                        sentences {freq, binary, sublinear}
                                        (default = 'none').""", default="none")
    parser.add_argument("--stop", help="""Toggles stripping stop words in
                                        locally computed word vector weights.""",
                                        action="store_true")
    parser.add_argument("--format", help="""The format of the embedding model
                                        file: {binary, text, wisse}
                                        (default = 'binary').""",
                                        default="binary")
    args = parser.parse_args()
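
    # Example invocation (the script and file names are illustrative):
    #   python use_wisse.py --input sentences.txt --embedmodel vectors.bin \
    #       --idfmodel local --tfidf tfidf --comb sum --format binary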


    # Check that the provided model files/directories exist.
    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.info("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        logging.info("""Embedding model directory does not exist (EXIT):
            \n%s\n ...""" % args.embedmodel)
        exit()

    if args.idfmodel and not args.idfmodel.startswith("local") \
            and not os.path.isfile(args.idfmodel):
        logging.info("""IDF model file does not exist (EXIT):
            \n%s\n ...""" % args.idfmodel)
        exit()
    if not os.path.isfile(args.input):
        logging.info("""Input file does not exist (EXIT):
            \n%s\n ...""" % args.input)
        exit()

    # These names are needed both for the default output name and for the
    # missing-vocabulary file written at the end.
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    tfidf_name = (os.path.basename(args.idfmodel).split(".")[0]
                    if args.idfmodel else "none")

    if args.output != "":
        if os.path.dirname(args.output) != "":
            if not os.path.exists(os.path.dirname(args.output)):
                logging.info("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
                exit()
            else:
                output_name = args.output
        else:
            output_name = args.output
    else:
        # Derive the output file name from the input settings.
        suffix = "_".join([embedding_name,
                            args.comb,
                            args.tfidf,
                            "local" if args.idfmodel and args.idfmodel.startswith("local")
                                    else tfidf_name,
                            args.suffix]).strip("_")
        output_name = args.input + ".output_" + suffix


    # Decide whether to predict full TF-IDF weights or to use IDF weights only.
    if args.tfidf.startswith("tfidf"):
        pred_tfidf = True
    elif args.tfidf.startswith("idf"):
        pred_tfidf = False
    else:
        pred_tfidf = False
        tfidf = False

    vectorizer = TfidfVectorizer(min_df=1,
                                    encoding="latin-1",
                                    decode_error="replace",
                                    lowercase=True,
                                    binary=args.localw.startswith("bin"),
                                    sublinear_tf=args.localw.startswith("subl"),
                                    stop_words="english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if args.idfmodel and args.idfmodel.startswith("local"):
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)

    elif args.idfmodel and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)

    else:
        tfidf = False

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary=True,
                                        encoding="latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary=False,
                                        encoding="latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse=False)

    except Exception:
        logging.info(
            """Error while loading the word embedding model. Verify that the
            file is not broken (EXIT)...\n%s\n""" % args.embedmodel)
        exit()

    missing_bow = []    # Stores words missing from the TFIDF model
    missing_cbow = []   # Stores words missing from the word embedding model
    sidx = 0            # The index of the sentence within the input file
    logging.info("\n\nEmbedding sentences and saving them to the output file..\n%s\n" % output_name)

    # Build the sentence embedder once; it is reused for every input sentence.
    series = wisse.wisse(embeddings=embedding, vectorizer=tfidf,
                            tf_tfidf=pred_tfidf, combiner=args.comb)

    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:
                # transform() returns None for empty sentences; skip them.
                continue

            # At this point the embedding 'vector' can be used for any
            # application, as it is a numpy array. The vectors can also simply
            # be saved in text format, as follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                        formatter={'float_kind': lambda x: "%.6f" % x},
                        max_line_width=20000).strip(']').strip('[')))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving the missing vocabulary to %s ..\n\n" % missing_name)

    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)

        f.write("# missing IDF weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")