Showing 3 changed files with 419 additions and 0 deletions
sentence-representation/keyed2indexed.py
0 → 100644
import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB
import sys
import logging

# sys.argv[1]: Input embeddings model (word2vec format)
# sys.argv[2]: Output directory for the indexed format
# sys.argv[3]: Optional; if present, the input model is read as text (default: binary)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

load_vectors = vDB.load_word2vec_format

# The model is assumed to be binary unless a third argument is given.
binary = len(sys.argv) < 4

embedding = load_vectors(sys.argv[1], binary=binary, encoding="latin-1")
logging.info("Indexing embeddings, this will take a while...\n")
wisse.keyed2indexed(embedding, sys.argv[2])
logging.info("Embeddings indexed, please verify the contents of the output directory...\n")
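The indexed directory written by this script can then be opened lazily with the vector_space class defined in wisse.py below. A minimal sketch, assuming the output directory was named indexed_model/ (a hypothetical path):

import wisse

# Vectors are loaded on demand, one .npy file per word, so large models
# do not need to fit in memory.
embedding = wisse.vector_space("indexed_model/", sparse=False)
vector = embedding["word"]   # the numpy array stored for the token "word"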
sentence-representation/wisse.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7

import numpy as np
import logging
import os
import re
from functools import partial
from pdb import set_trace as st
from scipy.sparse import bsr_matrix  # needed by load_sparse_bsr / save_sparse_bsr

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


class wisse(object):
    """ Sentence embedder. Both the TfidfVectorizer and the word embedding model
    must be pretrained, either on the local sentence corpus or loaded from model
    persistence.
    """
    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner="sum"):
        if vectorizer:
            self.tokenize = vectorizer.build_tokenizer()
        else:
            # No vectorizer given: fall back to a simple word tokenizer
            # (every word will then receive a uniform weight of 1.0).
            self.tokenize = re.compile(r"(?u)\b\w\w+\b").findall
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        # Word vectors are either averaged or summed into the sentence vector.
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis=0)
        else:
            self.comb = partial(np.sum, axis=0)


    def fit(self, X, y=None):  # Scikit-learn-like template
        if isinstance(X, list):
            self.sentences = X

        return self


    def transform(self, X):
        # A list of sentences is stored for later iteration; a single string
        # is embedded directly.
        if isinstance(X, list):
            return self.fit(X)

        elif isinstance(X, str):
            return self.infer_sentence(X)


    def fit_transform(self, X, y=None):
        return self.transform(X)


    def infer_sentence(self, sent):
        ss = self.tokenize(sent)
        missing_bow = []   # words missing from the TF-IDF vocabulary
        missing_cbow = []  # words missing from the embedding model
        series = {}

        if ss:
            self.weights, m = self.infer_tfidf_weights(ss)
        else:
            return None

        missing_bow += m

        for w in self.weights:
            try:
                series[w] = (self.weights[w], self.embedding[w])
            except KeyError:
                series[w] = None
                missing_cbow.append(w)
                continue
            except IndexError:
                continue

        if not self.weights:
            return None
        # Embedding the sentence: combine the word vectors, each scaled by its
        # TF-IDF weight.
        sentence = np.array([series[w][0] * series[w][1]
                             for w in series if series[w] is not None])
        series = {}

        return missing_cbow, missing_bow, self.comb(sentence)


    def infer_tfidf_weights(self, sentence):
        existent = {}
        missing = []

        # No vectorizer available: every word gets a uniform weight.
        if not self.tfidf:
            for word in sentence:
                existent[word] = 1.0

            return existent, missing

        if self.pred_tfidf:
            # Predict full TF-IDF weights for the (possibly unseen) sentence.
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][self.tfidf.vocabulary_[word]]
                except KeyError:
                    missing.append(word)
                    continue
        else:
            # Use only the IDF weights stored in the fitted vectorizer.
            for word in sentence:
                try:
                    weight = self.tfidf.idf_[self.tfidf.vocabulary_[word]]
                    existent[word] = weight if weight > 2 else 0.01
                except KeyError:
                    missing.append(word)
                    continue

        return existent, missing


    def __iter__(self):
        for s in self.sentences:
            yield self.transform(s)


def save_dense(directory, filename, array):
    # Only purely alphabetic tokens are saved; this also avoids file names
    # that cannot be encoded on the target file system.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        np.save(directory + filename, array)
    else:
        return None


def load_dense(filename):
    return np.load(filename)


def load_sparse_bsr(filename):
    loader = np.load(filename)
    return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])


def save_sparse_bsr(directory, filename, array):
    # Note that the .npz extension is added automatically by np.savez.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        array = array.tobsr()
        np.savez(directory + filename, data=array.data, indices=array.indices,
                 indptr=array.indptr, shape=array.shape)
    else:
        return None


class vector_space(object):
    """ Lazy, file-backed word embedding lookup over a directory (or .tar.gz)
    produced by keyed2indexed(): one .npy/.npz file per word. """

    def __init__(self, directory, sparse=False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        if directory.endswith(".tar.gz"):
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            file_list = self.tar.getnames()
            self.words = {os.path.basename(word).replace(ext, ''): word
                          for word in file_list}
        else:
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            file_list = os.listdir(directory)
            self.words = {word.replace(ext, ''): directory + word
                          for word in file_list}


    def __getitem__(self, item):
        # Raises KeyError for out-of-vocabulary words, like a gensim model would.
        if self._tar:
            member = self.tar.getmember(self.words[item])
            word = self.tar.extractfile(member)
        else:
            word = self.words[item]

        return load_sparse_bsr(word) if self.sparse else load_dense(word)


def keyed2indexed(keyed_model, output_dir="word_embeddings/", parallel=True, n_jobs=-1):
    """ Dump every word vector of a gensim KeyedVectors model to its own .npy
    file, so that it can be read back lazily through vector_space().
    Note: the .vocab attribute used below exists in gensim versions prior to 4.0. """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if parallel:
        from joblib import Parallel, delayed

        Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(save_dense)(output_dir, word, keyed_model[word])
            for word, _ in keyed_model.vocab.items())
    else:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])


class streamer(object):
    """ Iterates over a text file, yielding one stripped line (sentence) at a time. """
    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        for s in open(self.file_name):
            yield s.strip()
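A minimal end-to-end sketch of the wisse class defined above, assuming a toy list of sentences and a word2vec model file named model.bin (a hypothetical path); here the TfidfVectorizer is fitted locally on the same sentences. The full command-line workflow is shown in wisse_example.py below.

from gensim.models.keyedvectors import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
import wisse

sentences = ["the cat sat on the mat", "dogs chase cats"]

# Both models must be pretrained before the sentence embedder is built.
vectorizer = TfidfVectorizer(min_df=1).fit(sentences)
embedding = KeyedVectors.load_word2vec_format("model.bin", binary=True)

model = wisse.wisse(embeddings=embedding, vectorizer=vectorizer,
                    tf_tfidf=True, combiner="sum")

# transform() on a single string returns (words missing from the embeddings,
# words missing from the TF-IDF vocabulary, sentence vector), or None for an
# empty sentence.
missing_cbow, missing_bow, vector = model.transform(sentences[0])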
sentence-representation/wisse_example.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#import numexpr as ne
import argparse
#import _pickle as pickle
import cPickle as pickle
import logging
import os
from functools import partial
import wisse


load_vectors = vDB.load_word2vec_format

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="""This usage example shows sentence
        embedding with WISSE. The input is a text file containing one sentence
        per row. The output file has two tab-separated columns: the line index
        of the sentence in the input file and its vector representation.""")
    parser.add_argument("--idfmodel", help = """Input file containing IDF
                                        pre-trained weights (a pickled sklearn
                                        object). If not provided, all word
                                        vector weights will be set to 1.0. If
                                        'local', tf-idf weights will be
                                        computed locally from the input file.""",
                                        default = None)
    parser.add_argument("--embedmodel", help = """Input file containing the word
                                        embeddings model (binary and text
                                        formats are allowed).""", required = True)
    parser.add_argument("--output", help = """Output file containing the sentence
                                        embeddings.""", default = "")
    parser.add_argument("--input", help = """Input file containing one sentence
                                        per row.""", required = True)
    parser.add_argument("--comb", help = """Desired word vector combination for
                                        sentence representation {sum, avg}.
                                        (default = 'sum')""", default = "sum")
    parser.add_argument("--suffix", nargs = '?', help = """A suffix to be added
                                        to the output file (default = '')""",
                                        default = "", required = False)
    parser.add_argument("--tfidf", help = """Predict complete TFIDF weights
                                        ('tfidf') or use only partial IDFs
                                        ('idf'). (default = 'tfidf')""",
                                        default = "tfidf")
    parser.add_argument("--localw", help = """TFIDF word vector weights
                                        computed locally from the input file of
                                        sentences {freq, binary, sublinear}
                                        (default = 'none').""", default = "none")
    parser.add_argument("--stop", help = """Toggles stripping stop words in
                                        locally computed word vector weights.""",
                                        action = "store_true")
    parser.add_argument("--format", help = """The format of the embedding model
                                        file: {binary, text, wisse}.
                                        (default = 'binary')""", default = "binary")
    args = parser.parse_args()


    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.info("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        logging.info("""Embedding model directory does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
        exit()

    if (args.idfmodel is not None and not args.idfmodel.startswith("local")
            and not os.path.isfile(args.idfmodel)):
        logging.info("""IDF model file does not exist (EXIT):
                \n%s\n ...""" % args.idfmodel)
        exit()
    if not os.path.isfile(args.input):
        logging.info("""Input file does not exist (EXIT):
                \n%s\n ...""" % args.input)
        exit()

    # Base names used both for the default output file name and for the
    # missing-vocabulary report written at the end ('none' labels the case
    # where no IDF model is given).
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    tfidf_name = (os.path.basename(args.idfmodel).split(".")[0]
                  if args.idfmodel else "none")

    if args.output != "":
        if os.path.dirname(args.output) != "":
            if not os.path.exists(os.path.dirname(args.output)):
                logging.info("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
                exit()
            else:
                output_name = args.output
        else:
            output_name = args.output
    else:
        suffix = "_".join([embedding_name,
                           args.comb,
                           args.tfidf,
                           "local" if (args.idfmodel and
                                       args.idfmodel.startswith("local")) else tfidf_name,
                           args.suffix]).strip("_")
        output_name = args.input + ".output_" + suffix


    if args.tfidf.startswith("tfidf"):
        pred_tfidf = True
    elif args.tfidf.startswith("idf"):
        pred_tfidf = False
    else:
        pred_tfidf = False
        tfidf = False

    vectorizer = TfidfVectorizer(min_df = 1,
                    encoding = "latin-1",
                    decode_error = "replace",
                    lowercase = True,
                    binary = args.localw.startswith("bin"),
                    sublinear_tf = args.localw.startswith("subl"),
                    stop_words = "english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if args.idfmodel and args.idfmodel.startswith("local"):
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)

    elif args.idfmodel and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)  # add encoding = 'latin-1' when unpickling on Python 3

    else:
        # No IDF model given: uniform word weights will be used.
        tfidf = False

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary = True,
                                     encoding = "latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary = False,
                                     encoding = "latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse = False)

    except Exception:
        logging.info(
            """Error while loading the word embedding model. Verify that the file
            is not broken (EXIT)...\n%s\n""" % args.embedmodel)
        exit()

    missing_bow = []   # Stores words missing from the TFIDF model
    missing_cbow = []  # Stores words missing from the W2V model
    sidx = 0           # The index of the sentence according to the input file
    logging.info("\n\nEmbedding sentences and saving them to the output file..\n%s\n" % output_name)

    # The sentence embedder is built once and reused for every input sentence.
    series = wisse.wisse(embeddings = embedding, vectorizer = tfidf,
                         tf_tfidf = pred_tfidf, combiner = args.comb)

    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:  # empty sentences return None
                continue

            # At this point the embedding 'vector' can be used for any application,
            # as it is a numpy array. It can also simply be saved in text format as
            # follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                        formatter = {'float_kind': lambda x: "%.6f" % x},
                        max_line_width = 20000).strip(']').strip('[') ))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)

    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)

        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")
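For reference, a typical invocation of this example script might look like: python wisse_example.py --input sentences.txt --embedmodel model.bin --format binary --idfmodel local --tfidf tfidf --comb sum (the file names here are placeholders). With these options the script fits TF-IDF weights on the input sentences themselves, writes one tab-separated index/vector line per sentence to sentences.txt.output_..., and saves a .missing file listing the words absent from the embedding and TF-IDF vocabularies.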