Showing 3 changed files with 419 additions and 0 deletions
sentence-representation/keyed2indexed.py
0 → 100644
import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB
import sys
import logging

# sys.argv[1]: Input embeddings model (w2v format)
# sys.argv[2]: Output directory for the indexed format
# sys.argv[3]: If present, the input model is read as text (default: binary)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

load_vectors = vDB.load_word2vec_format

# The model is assumed to be binary unless a third argument is given.
binary = len(sys.argv) < 4

embedding = load_vectors(sys.argv[1], binary=binary, encoding="latin-1")
logging.info("Indexing embeddings, this will take a while...\n")
wisse.keyed2indexed(embedding, sys.argv[2])
logging.info("Embeddings indexed, please verify the contents of the output directory...\n")
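
For reference, the same conversion can also be driven from the interpreter rather than the command line; a minimal sketch, assuming a small text-format word2vec model file named toy.vec and an output directory toy_embeddings/ (both names hypothetical):

import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB

# Load a small text-format word2vec model (file name is hypothetical).
embedding = vDB.load_word2vec_format("toy.vec", binary=False, encoding="latin-1")

# Write one .npy file per vocabulary word into the given directory.
wisse.keyed2indexed(embedding, "toy_embeddings/")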
sentence-representation/wisse.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7

import numpy as np
import logging
import os
from functools import partial
from scipy.sparse import bsr_matrix  # needed by load_sparse_bsr below

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

class wisse(object):
    """ Both the TfidfVectorizer and the word embedding model must be pretrained,
    either from the local sentence corpus or from model persistence.
    """
    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner="sum"):
        self.tokenize = vectorizer.build_tokenizer()
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis=0)
        else:
            self.comb = partial(np.sum, axis=0)


    def fit(self, X, y=None):  # scikit-learn estimator template
        if isinstance(X, list):
            self.sentences = X

        return self


    def transform(self, X):
        if isinstance(X, list):
            return self.fit(X)

        elif isinstance(X, str):
            return self.infer_sentence(X)


    def fit_transform(self, X, y=None):
        return self.transform(X)

    def infer_sentence(self, sent):
        ss = self.tokenize(sent)
        missing_bow = []
        missing_cbow = []
        series = {}

        if ss:
            self.weights, m = self.infer_tfidf_weights(ss)
        else:
            return None

        missing_bow += m

        for w in self.weights:
            try:
                series[w] = (self.weights[w], self.embedding[w])
            except KeyError:
                series[w] = None
                missing_cbow.append(w)
                continue
            except IndexError:
                continue

        if self.weights == {}: return None
        # Embedding the sentence... :
        sentence = np.array([series[w][1] for w in series if series[w] is not None])
        series = {}

        return missing_cbow, missing_bow, self.comb(sentence)


    def infer_tfidf_weights(self, sentence):
        existent = {}
        missing = []

        if not self.tfidf:
            for word in sentence:
                existent[word] = 1.0

            return existent, missing

        if self.pred_tfidf:
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][self.tfidf.vocabulary_[word]]
                except KeyError:
                    missing.append(word)
                    continue
        else:
            for word in sentence:
                try:
                    # Fall back to the fitted IDF weight alone.
                    weight = self.tfidf.idf_[self.tfidf.vocabulary_[word]]
                    existent[word] = weight if weight > 2 else 0.01
                except KeyError:
                    missing.append(word)
                    continue

        return existent, missing

    def __iter__(self):
        for s in self.sentences:
            yield self.transform(s)


def save_dense(directory, filename, array):
    directory = os.path.normpath(directory) + '/'
    # Only store words that are safe to use as plain file names.
    if filename.isalpha():
        np.save(directory + filename, array)
    else:
        return None


def load_dense(filename):
    return np.load(filename)


def load_sparse_bsr(filename):
    loader = np.load(filename)
    return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])


def save_sparse_bsr(directory, filename, array):
    # Note that the .npz extension is added automatically.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        array = array.tobsr()
        np.savez(directory + filename, data=array.data, indices=array.indices,
                 indptr=array.indptr, shape=array.shape)
    else:
        return None


class vector_space(object):
    """ Reads word vectors indexed as one file per word, either from a
    directory or from a .tar.gz archive of such a directory.
    """
    def __init__(self, directory, sparse=False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        if directory.endswith(".tar.gz"):
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            file_list = self.tar.getnames()
            self.words = {os.path.basename(word).replace(ext, ''): word
                          for word in file_list}
        else:
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            file_list = os.listdir(directory)
            self.words = {word.replace(ext, ''): directory + word
                          for word in file_list}


    def __getitem__(self, item):
        if self._tar:
            member = self.tar.getmember(self.words[item])
            word = self.tar.extractfile(member)
        else:
            word = self.words[item]

        if self.sparse:
            return load_sparse_bsr(word)
        else:
            return load_dense(word)


def keyed2indexed(keyed_model, output_dir="word_embeddings/", parallel=True, n_jobs=-1):
    """ Saves each word vector of a gensim KeyedVectors model (gensim < 4.0,
    which exposes the `vocab` attribute) as an individual .npy file.
    """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if parallel:
        from joblib import Parallel, delayed

        Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(save_dense)(output_dir, word, keyed_model[word])
            for word, _ in keyed_model.vocab.items())
    else:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])


class streamer(object):
    """ Yields the stripped lines of a text file, one sentence per line. """
    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        for s in open(self.file_name):
            yield s.strip()
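
A minimal usage sketch of the module, assuming a toy corpus and an indexed embedding directory (path hypothetical) produced beforehand by keyed2indexed:

import wisse
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = ["the cat sat on the mat", "dogs chase cats"]

# Fit local TFIDF weights from the toy corpus.
tfidf = TfidfVectorizer(min_df=1, lowercase=True).fit(sentences)

# Point the vector space at an indexed directory (hypothetical path).
embedding = wisse.vector_space("toy_embeddings/", sparse=False)

model = wisse.wisse(embeddings=embedding, vectorizer=tfidf,
                    tf_tfidf=True, combiner="sum")

# transform returns (missing_cbow, missing_bow, vector), or None for an
# empty sentence.
missing_cbow, missing_bow, vector = model.transform("the cat chased a dog")
print(vector.shape)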
sentence-representation/wisse_example.py
0 → 100644
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import argparse
import cPickle as pickle  # under Python 3 use: import _pickle as pickle
import logging
import os
import wisse


load_vectors = vDB.load_word2vec_format

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="""This usage example shows
        sentence embedding with WISSE. The input is a text file with one
        sentence per row. The output file has two tab-separated columns: the
        line index of the sentence in the input file and the sentence vector
        representation.""")
    parser.add_argument("--idfmodel", help="""Input file containing IDF
                        pre-trained weights (pickled sklearn object). If not
                        provided, all word vector weights will be set to 1.0.
                        If 'local', tf-idf weights will be computed locally
                        from the input file.""",
                        default=None)
    parser.add_argument("--embedmodel", help="""Input file containing the word
                        embeddings model (binary and text are allowed).""",
                        required=True)
    parser.add_argument("--output", help="""Output file containing the sentence
                        embeddings.""", default="")
    parser.add_argument("--input", help="""Input file containing one sentence
                        per row.""", required=True)
    parser.add_argument("--comb", help="""Desired word vector combination for
                        sentence representation {sum, avg}.
                        (default = 'sum')""", default="sum")
    parser.add_argument("--suffix", nargs='?', help="""A suffix to be added
                        to the output file (default = '')""",
                        default="", required=False)
    parser.add_argument("--tfidf", help="""To predict complete TFIDF weights
                        ('tfidf') or use only partial IDFs ('idf').
                        (default = 'tfidf')""",
                        default="tfidf")
    parser.add_argument("--localw", help="""TFIDF word vector weighting
                        computed locally from the input file of sentences
                        {freq, binary, sublinear} (default = 'none').""",
                        default="none")
    parser.add_argument("--stop", help="""Toggles stripping stop words in
                        locally computed word vector weights.""",
                        action="store_true")
    parser.add_argument("--format", help="""The format of the embedding model
                        file: {binary, text, wisse}.
                        (default = 'binary')""", default="binary")
    args = parser.parse_args()


    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.info("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        logging.info("""Embedding model directory does not exist (EXIT):
            \n%s\n ...""" % args.embedmodel)
        exit()

    if (args.idfmodel and not args.idfmodel.startswith("local")
            and not os.path.isfile(args.idfmodel)):
        logging.info("""IDF model file does not exist (EXIT):
            \n%s\n ...""" % args.idfmodel)
        exit()
    if not os.path.isfile(args.input):
        logging.info("""Input file does not exist (EXIT):
            \n%s\n ...""" % args.input)
        exit()

    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    tfidf_name = (os.path.basename(args.idfmodel).split(".")[0]
                  if args.idfmodel else "none")

    if args.output != "":
        if os.path.dirname(args.output) != "":
            if not os.path.exists(os.path.dirname(args.output)):
                logging.info("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
                exit()
            else:
                output_name = args.output
        else:
            output_name = args.output
    else:
        suffix = "_".join([embedding_name,
                           args.comb,
                           args.tfidf,
                           "local" if args.idfmodel and args.idfmodel.startswith("local") else tfidf_name,
                           args.suffix]).strip("_")
        output_name = args.input + ".output_" + suffix


    if args.tfidf.startswith("tfidf"):
        pred_tfidf = True
    elif args.tfidf.startswith("idf"):
        pred_tfidf = False
    else:
        pred_tfidf = False
        tfidf = False

    vectorizer = TfidfVectorizer(min_df=1,
                                 encoding="latin-1",
                                 decode_error="replace",
                                 lowercase=True,
                                 binary=args.localw.startswith("bin"),
                                 sublinear_tf=args.localw.startswith("subl"),
                                 stop_words="english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if args.idfmodel and args.idfmodel.startswith("local"):
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)

    elif args.idfmodel and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)

    else:
        tfidf = False

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary=True,
                                     encoding="latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary=False,
                                     encoding="latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse=False)

    except Exception:
        logging.info(
            """Error while loading the word embedding model. Verify that the
            file is not broken (EXIT)...\n%s\n""" % args.embedmodel)
        exit()

    missing_bow = []   # Stores words missing from the TFIDF model
    missing_cbow = []  # Stores words missing from the W2V model
    sidx = 0           # The index of the sentence according to the input file
    logging.info("\n\nEmbedding sentences and saving them to the output file..\n%s\n" % output_name)

    # The sentence embedder only needs to be constructed once, not per sentence.
    series = wisse.wisse(embeddings=embedding, vectorizer=tfidf,
                         tf_tfidf=pred_tfidf, combiner=args.comb)

    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:
                continue

            # At this point you can use the embedding 'vector' for any
            # application, as it is a numpy array. You can also simply save
            # the vectors in text format, as follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                        formatter={'float_kind': lambda x: "%.6f" % x},
                        max_line_width=20000).strip(']').strip('[')))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)

    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)

        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")
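
Once a run has finished, the output file can be read back into a dense matrix; a short sketch, assuming an output file name of the kind the script generates (the name used here is hypothetical):

import numpy as np

# Each row of the output file is "<sentence index>\t<space-separated floats>".
indices, vectors = [], []
with open("sentences.txt.output_toy_sum_tfidf_local") as f:  # hypothetical name
    for line in f:
        idx, vec = line.split("\t")
        indices.append(int(idx))
        vectors.append(np.array(vec.split(), dtype=float))

X = np.vstack(vectors)
print(X.shape)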