Showing 3 changed files with 419 additions and 0 deletions
sentence-representation/keyed2indexed.py
0 → 100644
import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB
import sys
import logging

# sys.argv[1]: Input embeddings model (word2vec format)
# sys.argv[2]: Output directory for the indexed format
# sys.argv[3]: Optional; if present, the input model is read as text (default: binary)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

load_vectors = vDB.load_word2vec_format

# The model is assumed to be binary unless a third argument is given.
binary = len(sys.argv) < 4

embedding = load_vectors(sys.argv[1], binary=binary, encoding="latin-1")
logging.info("Indexing embeddings, this will take a while...\n")
wisse.keyed2indexed(embedding, sys.argv[2])
logging.info("Embeddings indexed, please verify the contents of the output directory...\n")
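The indexed directory written by this script can then be opened lazily with the vector_space class defined in wisse.py below. A minimal sketch, assuming the output directory was named indexed_model/ (a hypothetical path):

import wisse

# Vectors are loaded on demand, one .npy file per word, so large models
# do not need to fit in memory.
embedding = wisse.vector_space("indexed_model/", sparse=False)
vector = embedding["word"]   # the numpy array stored for the token "word"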
sentence-representation/wisse.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7

import numpy as np
import logging
import os
import re
from functools import partial
from pdb import set_trace as st
from scipy.sparse import bsr_matrix  # needed by load_sparse_bsr / save_sparse_bsr

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


class wisse(object):
    """ Sentence embedder. Both the TfidfVectorizer and the word embedding model
    must be pretrained, either on the local sentence corpus or loaded from model
    persistence.
    """
    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner="sum"):
        if vectorizer:
            self.tokenize = vectorizer.build_tokenizer()
        else:
            # No vectorizer given: fall back to a simple word tokenizer
            # (every word will then receive a uniform weight of 1.0).
            self.tokenize = re.compile(r"(?u)\b\w\w+\b").findall
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        # Word vectors are either averaged or summed into the sentence vector.
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis=0)
        else:
            self.comb = partial(np.sum, axis=0)


    def fit(self, X, y=None):  # Scikit-learn-like template
        if isinstance(X, list):
            self.sentences = X

        return self


    def transform(self, X):
        # A list of sentences is stored for later iteration; a single string
        # is embedded directly.
        if isinstance(X, list):
            return self.fit(X)

        elif isinstance(X, str):
            return self.infer_sentence(X)


    def fit_transform(self, X, y=None):
        return self.transform(X)


    def infer_sentence(self, sent):
        ss = self.tokenize(sent)
        missing_bow = []   # words missing from the TF-IDF vocabulary
        missing_cbow = []  # words missing from the embedding model
        series = {}

        if ss:
            self.weights, m = self.infer_tfidf_weights(ss)
        else:
            return None

        missing_bow += m

        for w in self.weights:
            try:
                series[w] = (self.weights[w], self.embedding[w])
            except KeyError:
                series[w] = None
                missing_cbow.append(w)
                continue
            except IndexError:
                continue

        if not self.weights:
            return None
        # Embedding the sentence: combine the word vectors, each scaled by its
        # TF-IDF weight.
        sentence = np.array([series[w][0] * series[w][1]
                             for w in series if series[w] is not None])
        series = {}

        return missing_cbow, missing_bow, self.comb(sentence)


    def infer_tfidf_weights(self, sentence):
        existent = {}
        missing = []

        # No vectorizer available: every word gets a uniform weight.
        if not self.tfidf:
            for word in sentence:
                existent[word] = 1.0

            return existent, missing

        if self.pred_tfidf:
            # Predict full TF-IDF weights for the (possibly unseen) sentence.
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][self.tfidf.vocabulary_[word]]
                except KeyError:
                    missing.append(word)
                    continue
        else:
            # Use only the IDF weights stored in the fitted vectorizer.
            for word in sentence:
                try:
                    weight = self.tfidf.idf_[self.tfidf.vocabulary_[word]]
                    existent[word] = weight if weight > 2 else 0.01
                except KeyError:
                    missing.append(word)
                    continue

        return existent, missing


    def __iter__(self):
        for s in self.sentences:
            yield self.transform(s)


def save_dense(directory, filename, array):
    # Only purely alphabetic tokens are saved; this also avoids file names
    # that cannot be encoded on the target file system.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        np.save(directory + filename, array)
    else:
        return None


def load_dense(filename):
    return np.load(filename)


def load_sparse_bsr(filename):
    loader = np.load(filename)
    return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])


def save_sparse_bsr(directory, filename, array):
    # Note that the .npz extension is added automatically by np.savez.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        array = array.tobsr()
        np.savez(directory + filename, data=array.data, indices=array.indices,
                 indptr=array.indptr, shape=array.shape)
    else:
        return None


class vector_space(object):
    """ Lazy, file-backed word embedding lookup over a directory (or .tar.gz)
    produced by keyed2indexed(): one .npy/.npz file per word. """

    def __init__(self, directory, sparse=False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        if directory.endswith(".tar.gz"):
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            file_list = self.tar.getnames()
            self.words = {os.path.basename(word).replace(ext, ''): word
                          for word in file_list}
        else:
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            file_list = os.listdir(directory)
            self.words = {word.replace(ext, ''): directory + word
                          for word in file_list}


    def __getitem__(self, item):
        # Raises KeyError for out-of-vocabulary words, like a gensim model would.
        if self._tar:
            member = self.tar.getmember(self.words[item])
            word = self.tar.extractfile(member)
        else:
            word = self.words[item]

        return load_sparse_bsr(word) if self.sparse else load_dense(word)


def keyed2indexed(keyed_model, output_dir="word_embeddings/", parallel=True, n_jobs=-1):
    """ Dump every word vector of a gensim KeyedVectors model to its own .npy
    file, so that it can be read back lazily through vector_space().
    Note: the .vocab attribute used below exists in gensim versions prior to 4.0. """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if parallel:
        from joblib import Parallel, delayed

        Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(save_dense)(output_dir, word, keyed_model[word])
            for word, _ in keyed_model.vocab.items())
    else:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])


class streamer(object):
    """ Iterates over a text file, yielding one stripped line (sentence) at a time. """
    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        for s in open(self.file_name):
            yield s.strip()
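A minimal end-to-end sketch of the wisse class defined above, assuming a toy list of sentences and a word2vec model file named model.bin (a hypothetical path); here the TfidfVectorizer is fitted locally on the same sentences. The full command-line workflow is shown in wisse_example.py below.

from gensim.models.keyedvectors import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
import wisse

sentences = ["the cat sat on the mat", "dogs chase cats"]

# Both models must be pretrained before the sentence embedder is built.
vectorizer = TfidfVectorizer(min_df=1).fit(sentences)
embedding = KeyedVectors.load_word2vec_format("model.bin", binary=True)

model = wisse.wisse(embeddings=embedding, vectorizer=vectorizer,
                    tf_tfidf=True, combiner="sum")

# transform() on a single string returns (words missing from the embeddings,
# words missing from the TF-IDF vocabulary, sentence vector), or None for an
# empty sentence.
missing_cbow, missing_bow, vector = model.transform(sentences[0])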
sentence-representation/wisse_example.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#import numexpr as ne
import argparse
#import _pickle as pickle
import cPickle as pickle
import logging
import os
from functools import partial
import wisse


load_vectors = vDB.load_word2vec_format

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="""This usage example shows sentence
        embedding with WISSE. The input is a text file containing one sentence
        per row. The output file has two tab-separated columns: the line index
        of the sentence in the input file and its vector representation.""")
    parser.add_argument("--idfmodel", help = """Input file containing IDF
                                        pre-trained weights (a pickled sklearn
                                        object). If not provided, all word
                                        vector weights will be set to 1.0. If
                                        'local', tf-idf weights will be
                                        computed locally from the input file.""",
                                        default = None)
    parser.add_argument("--embedmodel", help = """Input file containing the word
                                        embeddings model (binary and text
                                        formats are allowed).""", required = True)
    parser.add_argument("--output", help = """Output file containing the sentence
                                        embeddings.""", default = "")
    parser.add_argument("--input", help = """Input file containing one sentence
                                        per row.""", required = True)
    parser.add_argument("--comb", help = """Desired word vector combination for
                                        sentence representation {sum, avg}.
                                        (default = 'sum')""", default = "sum")
    parser.add_argument("--suffix", nargs = '?', help = """A suffix to be added
                                        to the output file (default = '')""",
                                        default = "", required = False)
    parser.add_argument("--tfidf", help = """Predict complete TFIDF weights
                                        ('tfidf') or use only partial IDFs
                                        ('idf'). (default = 'tfidf')""",
                                        default = "tfidf")
    parser.add_argument("--localw", help = """TFIDF word vector weights
                                        computed locally from the input file of
                                        sentences {freq, binary, sublinear}
                                        (default = 'none').""", default = "none")
    parser.add_argument("--stop", help = """Toggles stripping stop words in
                                        locally computed word vector weights.""",
                                        action = "store_true")
    parser.add_argument("--format", help = """The format of the embedding model
                                        file: {binary, text, wisse}.
                                        (default = 'binary')""", default = "binary")
    args = parser.parse_args()


    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.info("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        logging.info("""Embedding model directory does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
        exit()

    if (args.idfmodel is not None and not args.idfmodel.startswith("local")
            and not os.path.isfile(args.idfmodel)):
        logging.info("""IDF model file does not exist (EXIT):
                \n%s\n ...""" % args.idfmodel)
        exit()
    if not os.path.isfile(args.input):
        logging.info("""Input file does not exist (EXIT):
                \n%s\n ...""" % args.input)
        exit()

    # Base names used both for the default output file name and for the
    # missing-vocabulary report written at the end ('none' labels the case
    # where no IDF model is given).
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    tfidf_name = (os.path.basename(args.idfmodel).split(".")[0]
                  if args.idfmodel else "none")

    if args.output != "":
        if os.path.dirname(args.output) != "":
            if not os.path.exists(os.path.dirname(args.output)):
                logging.info("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
                exit()
            else:
                output_name = args.output
        else:
            output_name = args.output
    else:
        suffix = "_".join([embedding_name,
                           args.comb,
                           args.tfidf,
                           "local" if (args.idfmodel and
                                       args.idfmodel.startswith("local")) else tfidf_name,
                           args.suffix]).strip("_")
        output_name = args.input + ".output_" + suffix


    if args.tfidf.startswith("tfidf"):
        pred_tfidf = True
    elif args.tfidf.startswith("idf"):
        pred_tfidf = False
    else:
        pred_tfidf = False
        tfidf = False

    vectorizer = TfidfVectorizer(min_df = 1,
                    encoding = "latin-1",
                    decode_error = "replace",
                    lowercase = True,
                    binary = args.localw.startswith("bin"),
                    sublinear_tf = args.localw.startswith("subl"),
                    stop_words = "english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if args.idfmodel and args.idfmodel.startswith("local"):
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)

    elif args.idfmodel and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)  # add encoding = 'latin-1' when unpickling on Python 3

    else:
        # No IDF model given: uniform word weights will be used.
        tfidf = False

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary = True,
                                     encoding = "latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary = False,
                                     encoding = "latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse = False)

    except Exception:
        logging.info(
            """Error while loading the word embedding model. Verify that the file
            is not broken (EXIT)...\n%s\n""" % args.embedmodel)
        exit()

    missing_bow = []   # Stores words missing from the TFIDF model
    missing_cbow = []  # Stores words missing from the W2V model
    sidx = 0           # The index of the sentence according to the input file
    logging.info("\n\nEmbedding sentences and saving them to the output file..\n%s\n" % output_name)

    # The sentence embedder is built once and reused for every input sentence.
    series = wisse.wisse(embeddings = embedding, vectorizer = tfidf,
                         tf_tfidf = pred_tfidf, combiner = args.comb)

    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:  # empty sentences return None
                continue

            # At this point the embedding 'vector' can be used for any application,
            # as it is a numpy array. It can also simply be saved in text format as
            # follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                        formatter = {'float_kind': lambda x: "%.6f" % x},
                        max_line_width = 20000).strip(']').strip('[') ))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)

    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)

        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")
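For reference, a typical invocation of this example script might look like: python wisse_example.py --input sentences.txt --embedmodel model.bin --format binary --idfmodel local --tfidf tfidf --comb sum (the file names here are placeholders). With these options the script fits TF-IDF weights on the input sentences themselves, writes one tab-separated index/vector line per sentence to sentences.txt.output_..., and saves a .missing file listing the words absent from the embedding and TF-IDF vocabularies.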