wisse_example.py
8.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
# Standard library
import argparse
import logging
import os
#import _pickle as pickle
#import cPickle as pickle
import pickle  # actually imported: pickle.load() is used below to read the IDF model
from functools import partial

# Third-party
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#import numexpr as ne

# Local
import wisse

load_vectors = vDB.load_word2vec_format
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="""This use example shows sentence
        embedding by using WISSE. The input is a text file which has a sentece in
        each of its rows. The output file has two tab-separated columns: the index
        line of the sentece in the input file and the sentence vector representation
        .""")
    parser.add_argument("--idfmodel", help="""Input file containing IDF
                                            pre-trained weights. If not provided,
                                            all word vector weights will be set to
                                            1.0. If 'local' tf-idf weights will be
                                            computed locally from the input file
                                            (pickled sklearn object).""",
                        default=None)
    parser.add_argument("--embedmodel", help="""Input file containing word
                                            embeddings model (binary and text
                                            are allowed).""", required=True)
    parser.add_argument("--output", help="""Output file containing the sentence
                                            embeddings.""", default="")
    parser.add_argument("--input", help="""Input file containing a sentence
                                            by row.""", required=True)
    parser.add_argument("--comb", help="""Desired word vector combination for
                                            sentence representation {sum, avg}.
                                            (default = 'sum')""", default="sum")
    parser.add_argument("--suffix", nargs='?', help="""A suffix to be added
                                            to the output file (default = '')""",
                        default="", required=False)
    parser.add_argument("--tfidf", help="""To predict TFIDF complete weights
                                            ('tfidf') or use only partial IDFs
                                            ('idf'). (default = 'tfidf')""",
                        default="tfidf")
    parser.add_argument("--localw", help="""TFIDF word vector weights
                                            computed locally from the input file of
                                            sentences {freq, binary, sublinear}
                                            (default='none').""", default="none")
    parser.add_argument("--stop", help="""Toggles stripping stop words in
                                            locally computed word vector weights.""",
                        action="store_true")
    parser.add_argument("--format", help="""The format of the embedding model
                                            file: {binary, text, wisse}.
                                            default = 'binary'""", default="binary")
    args = parser.parse_args()

    # --idfmodel is optional (default None, meaning "no IDF weighting").
    # Normalize that case once so it never hits .startswith() below.
    use_local_idf = args.idfmodel is not None and args.idfmodel.startswith("local")

    # Base names used for the default output file name and for the
    # missing-vocabulary report.  NOTE: the original script computed these
    # only *after* referencing them in the default-output branch, which
    # raised NameError whenever --output was omitted; they are derived
    # purely from the argument strings, so compute them up front.
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    if args.idfmodel is not None:
        tfidf_name = os.path.basename(args.idfmodel).split(".")[0]
    else:
        tfidf_name = "none"

    # ---- Validate inputs -------------------------------------------------
    if not args.format.startswith("wisse"):
        # binary/text models are single files
        if not os.path.isfile(args.embedmodel):
            logging.info("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        # a 'wisse' model is a directory
        logging.info("""Embedding model directory does not exist (EXIT):
            \n%s\n ...""" % args.embedmodel)
        exit()

    if (args.idfmodel is not None and not use_local_idf
            and not os.path.isfile(args.idfmodel)):
        logging.info("""IDF model file does not exist (EXIT):
            \n%s\n ...""" % args.idfmodel)
        exit()

    if not os.path.isfile(args.input):
        logging.info("""Input file does not exist (EXIT):
            \n%s\n ...""" % args.input)
        exit()

    # ---- Resolve the output file name ------------------------------------
    if args.output != "":
        out_dir = os.path.dirname(args.output)
        if out_dir != "" and not os.path.exists(out_dir):
            logging.info("""Output directory does not exist (EXIT):
                \n%s\n ...""" % args.output)
            exit()
        output_name = args.output
    else:
        # Default name encodes the configuration that produced the vectors.
        suffix = "_".join([embedding_name,
                           args.comb,
                           args.tfidf,
                           "local" if use_local_idf else tfidf_name,
                           args.suffix]).strip("_")
        output_name = args.input + ".output_" + suffix

    # Whether wisse should predict complete TF-IDF weights or only IDFs.
    # (Any value other than 'tfidf'/'idf' falls back to IDF-only, as before.)
    pred_tfidf = args.tfidf.startswith("tfidf")

    # Vectorizer used only when weights are fitted locally from the input.
    vectorizer = TfidfVectorizer(min_df=1,
                                 encoding="latin-1",
                                 decode_error="replace",
                                 lowercase=True,
                                 binary=args.localw.startswith("bin"),
                                 sublinear_tf=args.localw.startswith("subl"),
                                 stop_words="english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    # ---- Load or fit TF-IDF weights --------------------------------------
    if use_local_idf:
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)
    elif args.idfmodel is not None and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)  # , encoding = 'latin-1')
    else:
        tfidf = False  # no weighting: all word vector weights act as 1.0

    # ---- Load the word embedding model -----------------------------------
    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary=True,
                                     encoding="latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary=False,
                                     encoding="latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse=False)
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt propagate.
    except Exception:
        logging.info(
            """Error while loading word embedding model. Verify if the file
            is broken (EXIT)...\n%s\n""" % args.embedmodel)
        exit()

    missing_bow = []   # words missing from the TFIDF model
    missing_cbow = []  # words missing from the word embedding model
    sidx = 0           # 1-based index of the sentence in the input file

    logging.info("\n\nEmbedding sentences and saving them to the output file..\n%s\n" % output_name)
    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            # Honor the CLI flags: the original hard-coded tf_tfidf=True and
            # combiner='sum', silently ignoring --tfidf and --comb.
            # NOTE(review): construction looks loop-invariant and could be
            # hoisted — confirm wisse.wisse keeps no per-call state first.
            series = wisse.wisse(embeddings=embedding, vectorizer=tfidf,
                                 tf_tfidf=pred_tfidf, combiner=args.comb)
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:
                # Sentence could not be embedded (e.g. no known words): skip it.
                continue
            # At this point you can use the embedding 'vector' for any
            # application as it is a numpy array. Also you can simply save the
            # vectors in text format as follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                     formatter={'float_kind': lambda x: "%.6f" % x},
                     max_line_width=20000).strip(']').strip('[')))

    # ---- Report vocabulary missing from either model ---------------------
    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)
    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)
        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")