#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7

import sys
import logging

import wisse
from gensim.models.keyedvectors import KeyedVectors as vDB

# sys.argv[1]: Input embeddings model (word2vec format)
# sys.argv[2]: Output directory for the indexed format
# sys.argv[3]: Optional; if present (any value), the model is read as text
#              (default: binary)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

load_vectors = vDB.load_word2vec_format
# A third command-line argument switches the loader to text format.
binary = len(sys.argv) < 4

embedding = load_vectors(sys.argv[1], binary=binary, encoding="latin-1")
logging.info("Indexing embeddings, this will take a while...\n")
wisse.keyed2indexed(embedding, sys.argv[2])
logging.info("Embeddings indexed, please verify the contents of the output directory...\n")
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
import numpy as np
import logging
import os
from functools import partial
from scipy.sparse import bsr_matrix

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
class wisse(object):
    """ WISSE sentence embedder. Both the TfidfVectorizer and the word
    embedding model must be pre-trained, either from the local sentence
    corpus or from model persistence.
    """
    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner="sum"):
        self.tokenize = vectorizer.build_tokenizer()
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        # Word vectors are combined into a sentence vector either by
        # averaging or by summing them.
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis=0)
        else:
            self.comb = partial(np.sum, axis=0)
    def fit(self, X, y=None):  # Scikit-learn-style estimator template
        if isinstance(X, list):
            self.sentences = X
        return self

    def transform(self, X):
        if isinstance(X, list):
            return self.fit(X)
        elif isinstance(X, str):
            return self.infer_sentence(X)

    def fit_transform(self, X, y=None):
        return self.transform(X)
    def infer_sentence(self, sent):
        ss = self.tokenize(sent)
        missing_bow = []   # Words with no TF-IDF weight available
        missing_cbow = []  # Words with no word embedding available
        series = {}
        if ss:
            self.weights, m = self.infer_tfidf_weights(ss)
        else:
            return None

        missing_bow += m
        for w in self.weights:
            try:
                series[w] = (self.weights[w], self.embedding[w])
            except KeyError:
                series[w] = None
                missing_cbow.append(w)
                continue
            except IndexError:
                continue

        if self.weights == {}:
            return None
        # Embed the sentence by combining the TF-IDF-weighted word vectors:
        sentence = np.array([series[w][0] * series[w][1]
                             for w in series if series[w] is not None])
        series = {}
        return missing_cbow, missing_bow, self.comb(sentence)
    def infer_tfidf_weights(self, sentence):
        existent = {}
        missing = []
        if not self.tfidf:
            # No vectorizer available: fall back to uniform unit weights.
            for word in sentence:
                existent[word] = 1.0
            return existent, missing

        if self.pred_tfidf:
            # Predict complete TF-IDF weights for the sentence.
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][self.tfidf.vocabulary_[word]]
                except KeyError:
                    missing.append(word)
                    continue
        else:
            # Use only the pre-trained partial IDF weights.
            for word in sentence:
                try:
                    weight = self.tfidf.idf_[self.tfidf.vocabulary_[word]]
                    existent[word] = weight if weight > 2 else 0.01
                except KeyError:
                    missing.append(word)
                    continue

        return existent, missing
    def __iter__(self):
        for s in self.sentences:
            yield self.transform(s)
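
# A minimal usage sketch of the class above, assuming a pre-fitted sklearn
# TfidfVectorizer `vec` and a gensim KeyedVectors model `kv` (both names are
# illustrative):
#
#   model = wisse(embeddings=kv, vectorizer=vec, tf_tfidf=True, combiner="sum")
#   missing_cbow, missing_bow, vector = model.transform("the quick brown fox")
#
# `vector` is the numpy array combining the TF-IDF-weighted word vectors; the
# two lists report words absent from the embedding and TF-IDF vocabularies.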
def save_dense(directory, filename, array):
    directory = os.path.normpath(directory) + '/'
    # Skip tokens with non-alphabetic characters; they can produce invalid
    # or unsafe file names.
    if filename.isalpha():
        np.save(directory + filename, array)
    else:
        return None

def load_dense(filename):
    return np.load(filename)
def load_sparse_bsr(filename):
    loader = np.load(filename)
    return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

def save_sparse_bsr(directory, filename, array):
    # Note that the .npz extension is added automatically.
    directory = os.path.normpath(directory) + '/'
    if filename.isalpha():
        array = array.tobsr()
        np.savez(directory + filename, data=array.data, indices=array.indices,
                 indptr=array.indptr, shape=array.shape)
    else:
        return None
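
# Round-trip sketch (illustrative names): save_sparse_bsr("out/", "dog", m)
# writes out/dog.npz (np.savez adds the extension), and
# load_sparse_bsr("out/dog.npz") rebuilds the bsr_matrix from its data,
# indices, indptr and shape arrays.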
class vector_space(object):
    """ File-backed word vector lookup over a directory (or .tar.gz archive)
    produced by keyed2indexed(); vectors are loaded on demand.
    """
    def __init__(self, directory, sparse=False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        if directory.endswith(".tar.gz"):
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            file_list = self.tar.getnames()
            self.words = {os.path.basename(word).replace(ext, ''): word
                          for word in file_list}
        else:
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            file_list = os.listdir(directory)
            self.words = {word.replace(ext, ''): directory + word
                          for word in file_list}
    def __getitem__(self, item):
        # Resolve the item either to a tar member or to a file path.
        if self._tar:
            member = self.tar.getmember(self.words[item])
            word = self.tar.extractfile(member)
        else:
            word = self.words[item]

        if self.sparse:
            return load_sparse_bsr(word)
        else:
            return load_dense(word)
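
# Usage sketch: given a directory (or .tar.gz archive) produced by
# keyed2indexed(), vectors are loaded lazily, one file per word
# (illustrative names):
#
#   space = vector_space("word_embeddings/", sparse=False)
#   vec = space["dog"]   # reads word_embeddings/dog.npy on demand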
def keyed2indexed(keyed_model, output_dir="word_embeddings/", parallel=True, n_jobs=-1):
    """ Saves each word vector of a gensim KeyedVectors model to its own
    .npy file, so that vectors can later be loaded on demand.
    """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if parallel:
        from joblib import Parallel, delayed
        Parallel(n_jobs=n_jobs, verbose=10)(
            delayed(save_dense)(output_dir, word, keyed_model[word])
            for word, _ in keyed_model.vocab.items())
    else:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])
class streamer(object):
    """ Iterates over the lines of a text file without loading it into memory.
    """
    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        for s in open(self.file_name):
            yield s.strip()
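
# Usage sketch (illustrative file name): streamer("sentences.txt") yields the
# stripped lines of the file lazily, so large corpora can be iterated without
# loading them into memory.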
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import argparse
import cPickle as pickle  # On Python 3, use the standard pickle module instead
import logging
import os
import wisse

load_vectors = vDB.load_word2vec_format
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
if __name__ == "__main__":

    parser = argparse.ArgumentParser(description="""This usage example shows
        sentence embedding with WISSE. The input is a text file with one
        sentence per row. The output file has two tab-separated columns: the
        line index of the sentence in the input file and the sentence vector
        representation.""")
    parser.add_argument("--idfmodel", help="""Input file containing IDF
                        pre-trained weights (a pickled sklearn object). If not
                        provided, all word vector weights will be set to 1.0.
                        If 'local', TF-IDF weights will be computed locally
                        from the input file.""",
                        default=None)
    parser.add_argument("--embedmodel", help="""Input file containing the word
                        embeddings model (binary and text formats are
                        allowed).""", required=True)
    parser.add_argument("--output", help="""Output file containing the sentence
                        embeddings.""", default="")
    parser.add_argument("--input", help="""Input file containing one sentence
                        per row.""", required=True)
    parser.add_argument("--comb", help="""Desired word vector combination for
                        the sentence representation {sum, avg}
                        (default = 'sum').""", default="sum")
    parser.add_argument("--suffix", nargs='?', help="""A suffix to be added
                        to the output file name (default = '').""",
                        default="", required=False)
    parser.add_argument("--tfidf", help="""Whether to predict complete TF-IDF
                        weights ('tfidf') or to use only partial IDFs ('idf')
                        (default = 'tfidf').""",
                        default="tfidf")
    parser.add_argument("--localw", help="""The TF-IDF word vector weighting
                        computed locally from the input file of sentences
                        {freq, binary, sublinear} (default = 'none').""",
                        default="none")
    parser.add_argument("--stop", help="""Toggles stripping stop words in
                        locally computed word vector weights.""",
                        action="store_true")
    parser.add_argument("--format", help="""The format of the embedding model
                        file: {binary, text, wisse} (default = 'binary').""",
                        default="binary")
    args = parser.parse_args()
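
    # Example invocation (hypothetical script and file names):
    #
    #   python wisse_example.py --input sentences.txt --embedmodel vectors.bin \
    #          --format binary --idfmodel local --tfidf tfidf --comb sum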
    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.info("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        logging.info("""Embedding model directory does not exist (EXIT):
            \n%s\n ...""" % args.embedmodel)
        exit()

    # The IDF model may be a file, the keyword 'local', or absent altogether.
    if args.idfmodel and not args.idfmodel.startswith("local") \
            and not os.path.isfile(args.idfmodel):
        logging.info("""IDF model file does not exist (EXIT):
            \n%s\n ...""" % args.idfmodel)
        exit()

    if not os.path.isfile(args.input):
        logging.info("""Input file does not exist (EXIT):
            \n%s\n ...""" % args.input)
        exit()
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    # Fall back to a placeholder name when no IDF model file is given.
    tfidf_name = os.path.basename(args.idfmodel).split(".")[0] if args.idfmodel else "noidf"

    if args.output != "":
        if os.path.dirname(args.output) != "":
            if not os.path.exists(os.path.dirname(args.output)):
                logging.info("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
                exit()
            else:
                output_name = args.output
        else:
            output_name = args.output
    else:
        # Derive a descriptive output file name from the input parameters.
        suffix = "_".join([embedding_name,
                           args.comb,
                           args.tfidf,
                           "local" if args.idfmodel and args.idfmodel.startswith("local") else tfidf_name,
                           args.suffix]).strip("_")
        output_name = args.input + ".output_" + suffix
    if args.tfidf.startswith("tfidf"):
        pred_tfidf = True
    elif args.tfidf.startswith("idf"):
        pred_tfidf = False
    else:
        pred_tfidf = False
        tfidf = False

    vectorizer = TfidfVectorizer(min_df=1,
                                 encoding="latin-1",
                                 decode_error="replace",
                                 lowercase=True,
                                 binary=args.localw.startswith("bin"),
                                 sublinear_tf=args.localw.startswith("subl"),
                                 stop_words="english" if args.stop else None)
    sentences = wisse.streamer(args.input)

    if args.idfmodel and args.idfmodel.startswith("local"):
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)
    elif args.idfmodel and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)
    else:
        tfidf = False
    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary=True,
                                     encoding="latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary=False,
                                     encoding="latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse=False)
    except:
        logging.info(
            """Error while loading the word embedding model. Verify whether
            the file is broken (EXIT)...\n%s\n""" % args.embedmodel)
        exit()
    missing_bow = []   # Stores words missing from the TFIDF model
    missing_cbow = []  # Stores words missing from the W2V model
    sidx = 0           # The index of the sentence according to the input file
    logging.info("\n\nEmbedding sentences and saving them to the output file..\n%s\n" % output_name)
    # Build the embedder once and reuse it for every sentence.
    series = wisse.wisse(embeddings=embedding, vectorizer=tfidf,
                         tf_tfidf=pred_tfidf, combiner=args.comb)
    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:
                # transform() returns None for empty or all-missing sentences.
                continue
            # At this point you can use the embedding 'vector' for any
            # application, as it is a numpy array. You can also simply save
            # the vectors in text format as follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                     formatter={'float_kind': lambda x: "%.6f" % x},
                     max_line_width=20000).strip(']').strip('[')))
    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving the missing vocabulary to %s ..\n\n" % missing_name)
    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)
        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")