Larisa Morales Soto

word embeddings

This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
# missing word embeddings:
29c
14
31
26a
beta1
486
profibrotic
92a
29a
matricellular
155
13
b1
etiopathology
nexin
avb6
21
19
acetylglucosaminidase
326
dermatan
1b
p38
47
68
3p
dedifferentiating
alpha3beta1
chymase
2alpha
90
7d
5100
18a
1a
alpha3
150
1343
140
541
fibrogenesis
221
mitophagy
b4
p63
10
5p
haptotactic
nonkinase
farnesoid
lymphopoietin
7a
18
338
1beta
424
199a
154
smad2
17
185
bronchoalveolar
101
profibrogenic
153
gambogic
ubiquitinating
p110y
200
323a
196a
dysregulates
fibrogenic
salvianolic
# missing MI weights:
Tumor
sTNFR
Compromised
Modulating
Renin
Invasive
Cytokine
D1
C3aR
Prostaglandin
Diagnostic
HLA
Interstitial
Transforming
Phenotypes
Up
Clinical
TGFb
Interplay
Animal
Decoction
Foxp3high
Leucine
CCL2
SDKP
IL
Correct
Elk1
Participation
Azithromycin
Amplification
1A
Endothelin
Kinase
Possible
Associated
MicroRNA
sL1
Action
For
Development
Establishment
Extracellular
Epstein
MUM
Serine
Interactions
Macrophage
Pseudomonas
Cytokines
IGFBP
Calu
PI3K
B4
Chinese
Cigarette
BALF
NK
alphaEbeta7
microRNAs
Gene
True
Pneumonia
CUX1
miRNA
Ubiquitin
Wnt
CDCP1
Barr
ORP150
Curcumin
aB
Myofibroblast
JNK
Aortic
L5
Hsp90
Smoke
IFN
Reactive
Significance
aVb6
Th2
Constitutive
Melatonin
Mediates
AP
Lindau
ALK5
AKT2
EMT
Reversion
Patients
Solution
Focal
Immunoglobulin
RhoA
Neutrophil
BMPR2
CXCL9
Lower
Olodaterol
Lavage
Induces
Induced
Beyond
MMP
Fstl1
cAMP
Aging
SPARC
Activity
ECM
Human
Fibrotic
Repression
Prostatic
NOX4
Against
Discovery
Resveratrol
Interleukin
Deleted
Differentiation
Spiruchostatin
Insulin
PXS64
MCTC
HP
Triptolide
Values
Differential
Lysyl
FGF
Promote
Yin
Tissue
BMP
CAM
Discoidin
Overproduction
Wilms
Regulates
Reduced
Mesenchymal
Inflammation
PINK1
Release
E2
CD44V6
TGFbeta1
Rg1
H441
kDa
M2
Expression
Data
Contribution
Negative
Nintedanib
Pulmonary
Emphysema
Intrinsic
17A
Marks
26S
RXFP1
Neovessel
Genetic
Protective
Tensin
Inducer
Like
Determining
Increased
Microsatellite
Sphingosine
TRPV4
Club
Methylation
Phenoconversion
Serpin
Activated
Muc5ac
MyD88
Glucagon
IPF
SIRT6
Rapamycin
Essential
TNFalpha
Corilagin
Sorafenib
Epithelial
T869C
Induction
Long
Wt1
Molecules
TGFBR2
P110
Nuclear
Old
Targeting
WNT7B
Thy
Potential
TGF
Tubastatin
Semaphorin
Attenuating
Smad
Pigment
Homolog
Binding
microRNA
C57BL
Regulating
Implications
Hydrogen
BARD1
A549
Homeostasis
Selectivity
Medical
Model
Cytoskeletal
Differing
BMP3
Enhances
NADPH
Fibrogenesis
Defect
Two
FAK
RNA
Quantifying
Epigenetic
Profibrotic
Ambroxol
Trigger
Titration
Transcription
Regulation
Mitochondrial
H1N1
Recent
BMPER
PPARs
VCAM
Microsomal
Hippel
Renshen
Absence
Anchorage
Applying
Free
OSF
PGE
Tannic
Plasminogen
TGFBR
Channel
Age
Cell
Connective
Proteasomal
RAGE
Bach1
Pirfenidone
Outcomes
GATA
Small
Autoimmunity
III
VEGF
Control
HSP27
Cartilage
Periostin
Idiopathic
COL1A1
CBP
Bronchoalveolar
Crosstalk
Amplified
Evidence
Simvastatin
Sphingolipids
Mechanisms
JAK2
Rats
Mice
Protease
From
LPA1
Collagen
Carbon
Molecular
Stat3
Genomewide
Stem
S1P
Novel
EBV
Serum
Abrogation
That
Pingfei
Stromal
Current
Molecule
MAP3K19
Decisive
Protein
Fluid
HDAC4
Angiotensin
SOCS1
Different
Membrane
Domain
Secretory
Signalling
NCI
Bax
ADAM
Are
Beta1
Activation
Problem
Prognostic
II
Sputum
Phosphatase
Inhibition
Profile
Dogs
HRCT
lncRNA
Storage
Nitrated
Box
Forkhead
CREB
Sirtuin
Cryptogenic
Decreased
Inhibits
Formation
MK2
Comparison
Mediated
Latent
Recombinant
Microencapsulation
PHGDH
Organizing
Dysfunction
Way
Using
Peripheral
Markers
MiR
Anti
Studies
May
Significant
Morphogenic
Low
Lactic
Overexpression
Protects
Arsenic
Caveolin
pH
Inhibit
Proteasome
MicroRNAs
Toll
Herpes
CTGF
Normal
Defective
CD44
Large
Ligands
Axis
NH2
Progression
Smad3
Phenotype
Ets
Identification
kB
Role
Relation
Mode
Developmental
Fibrosis
Stanniocalcin
WNT10A
Integrated
Syndecan
Metalloproteinase
TOB2
USP11
WISP1
Dysregulated
Th17
Progressive
Key
Subpleural
Mast
Rho
Growth
Upregulation
Alleviates
Re
Preventive
ITGB6
Fibroblasts
ATG4B
Comparative
Cthrc1
mRNA
Peptide
SNAI
BM
ATPase
AKT
Fibroblastic
Matriptase
Sub
Sustained
Pleiotropic
New
Regulator
Receptor
Therapeutic
Vimentin
IGF
Cells
LIGHT
Production
D2
Dehydroepiandrosterone
Lin28B
Antifibrotic
Raised
Proliferation
Dependent
COL1
Lysocardiolipin
Epithelium
STAT3
Prevents
Th1
NF
CCN5
Snail
Myogenic
CD4
Akt
TGFb1
Accelerated
PDGF
Intratracheal
TGFB1
Cysteine
Oxidant
Effect
Reprogramming
IIP
MS80
FOXF1
Promotes
Assessment
BLM
CC16
BAL
CD248
Ginsenoside
Secreted
Association
IQ
mTORC2
Established
The
Combined
Jun
UIP
Sulf2
Thalidomide
Bioenergetics
TNF
CCN2
NEU1
Attenuates
HMGA2
Group
Conversion
Predisposition
Transglutaminase
Pathway
Reviews
Treg
DDR2
Autophagy
Hyper
Bile
Sunitinib
Stiffening
Signal
Resolution
De
Type
Factor
Smad2
Single
PPAR
WNT5A
Novo
An
EGFR
Cub
GLI
HSP47
Early
ERK1
TGFbeta
Deficiency
hydroxytryptamine2A
BAX
Inhibitory
Integrin
Suppression
Shikonin
SMAD3
Effects
Metformin
F1
MAPK
Modulation
Bleomycin
Injury
Elevated
Cellular
Radioligand
Citrus
TIAM1
Subjects
Lung
ARPC2
H19
EZH2
Pathways
Is
Microarray
Fas
CCN1
Ac
miRNAs
Myofibroblasts
FFPE
Inhibitor
During
Matrix
Nrf2
Immunomodulation
C5aR
Gremlin
High
Concentration
Evaluation
Roles
Number
Bone
ACLP
Hypertension
Lipogenic
Uncoupling
Signaling
Lrp5
Berberine
A4
CD11c
miR
Chop
Galectin
Alveolar
Transition
Plasma
Impacts
Smad4
Its
Pathogenesis
Inappropriate
Investigation
Beta
Ca
ERK
Deregulation
MSCs
PTEN
Lipoxin
Nitric
C1q
KCa3
kappaB
Involvement
MCP
Pleural
EMMPRIN
Smooth
Synthesis
Blockade
Compared
Transgelin
No preview for this file type
# missing word embeddings:
profibrogenic
199a
p38
beta1
68
etiopathology
1343
lymphopoietin
29c
185
5p
17
dermatan
1a
13
424
101
p63
140
b1
fibrogenic
gambogic
nonkinase
21
alpha3
154
2alpha
chymase
18a
196a
5100
smad2
7d
541
1b
acetylglucosaminidase
326
47
dysregulates
92a
200
29a
90
31
mitophagy
b4
3p
nexin
dedifferentiating
155
150
ubiquitinating
10
486
19
avb6
fibrogenesis
farnesoid
haptotactic
alpha3beta1
14
323a
matricellular
7a
profibrotic
bronchoalveolar
26a
18
salvianolic
338
1beta
p110y
221
153
# missing MI weights:
Compared
Are
True
Ambroxol
Diagnostic
Alveolar
Smad2
Neovessel
RXFP1
Normal
Shikonin
Spiruchostatin
ORP150
Tubastatin
That
Bone
WISP1
Wt1
Smad4
ECM
Syndecan
Radioligand
BAX
De
FAK
Prevents
Endothelin
kB
Promote
Reversion
Determining
Cytokines
Glucagon
Pathways
Myogenic
SOCS1
Investigation
Regulating
Targeting
Decoction
Stromal
PPARs
HP
Focal
Transgelin
Association
Effects
EGFR
Gene
Human
Metalloproteinase
Lower
Rg1
Binding
Therapeutic
Mesenchymal
CD248
Formation
Cysteine
Caveolin
Type
Signaling
Molecular
Alleviates
Early
Transforming
Potential
COL1A1
Plasminogen
Factor
Semaphorin
CC16
Integrated
Like
Stat3
Tissue
Signalling
Phenotype
TGFBR2
Homolog
III
MiR
Lactic
Pulmonary
Fibroblastic
Defect
Molecules
Yin
MCP
MicroRNA
LIGHT
Beyond
Recombinant
Compromised
Ginsenoside
P110
Production
Lipogenic
HRCT
Its
Implications
Problem
NH2
Fibrogenesis
TOB2
SMAD3
Lin28B
Significance
Differential
Cytokine
Progressive
Solution
Identification
Peptide
Synthesis
Protein
Macrophage
PDGF
Repression
CREB
Cellular
Plasma
A4
Latent
Wnt
Proteasome
Kinase
Proteasomal
Pathway
Sirtuin
MSCs
D2
Absence
Cells
Thalidomide
Regulation
Hippel
pH
Chinese
Th17
Uncoupling
Periostin
Promotes
Amplification
Smad
Profibrotic
Patients
Subpleural
Cytoskeletal
Progression
Lavage
Angiotensin
Domain
Peripheral
Inhibitor
Associated
Involvement
Serum
Toll
Activation
SPARC
Attenuating
Resveratrol
PI3K
Induced
Matrix
Leucine
BALF
Defective
Negative
JNK
Receptor
Reprogramming
CD11c
Nuclear
hydroxytryptamine2A
Is
Sustained
Essential
Beta
CBP
miRNA
Pathogenesis
Aging
Neutrophil
Nitrated
Resolution
Signal
Low
New
Contribution
Homeostasis
HLA
Sub
26S
HMGA2
Treg
Significant
Blockade
HSP27
Clinical
Th1
Triptolide
Dependent
Inflammation
VEGF
PINK1
H1N1
E2
Discovery
Interplay
Secretory
CCL2
Dehydroepiandrosterone
Modulation
Mechanisms
MMP
Mediated
H19
Morphogenic
Ac
Corilagin
Tannic
Hypertension
WNT10A
MCTC
Possible
Studies
Pigment
Wilms
Hydrogen
Azithromycin
Number
Transglutaminase
Outcomes
NCI
RNA
WNT5A
Mice
Methylation
Novel
Nitric
Cell
HSP47
TRPV4
Protease
Release
For
Ligands
TGF
Epithelium
Aortic
NADPH
Herpes
Bach1
Ca
Dysregulated
Membrane
TGFBR
Connective
Decreased
TIAM1
Serpin
Fstl1
CCN1
BLM
Sphingosine
C57BL
Data
Dogs
Organizing
IGF
COL1
CD44V6
BMPER
ARPC2
Galectin
Lindau
Inappropriate
Microencapsulation
Oxidant
M2
Renshen
Sputum
Snail
Inducer
Prognostic
Storage
MAPK
Citrus
PPAR
Collagen
Matriptase
Arsenic
Long
F1
Deleted
Genomewide
PXS64
Lysyl
ERK
Calu
MyD88
aB
Activity
Applying
Secreted
Control
BM
Mitochondrial
Age
EZH2
Overproduction
Way
The
DDR2
1A
Rho
Bronchoalveolar
TGFbeta1
Akt
ERK1
Novo
Curcumin
FGF
C5aR
17A
Lysocardiolipin
Protects
Predisposition
Thy
C1q
Nintedanib
High
KCa3
Olodaterol
Reviews
Proliferation
Immunomodulation
Attenuates
Gremlin
Cthrc1
Vimentin
Elk1
Lipoxin
IQ
Roles
BAL
Relation
Autophagy
IGFBP
Inhibition
BMP
Anchorage
ITGB6
Mode
Modulating
miRNAs
Inhibit
PHGDH
Up
Phosphatase
TGFbeta
C3aR
Pseudomonas
Comparative
Reduced
Crosstalk
Conversion
Injury
Phenotypes
CD4
MicroRNAs
Regulates
TNFalpha
Pirfenidone
Raised
Old
Cartilage
Prostaglandin
BMP3
BARD1
Deficiency
RhoA
AKT2
NF
Cigarette
GATA
MAP3K19
sTNFR
NK
Different
Subjects
Autoimmunity
Mast
Single
Microsomal
WNT7B
MK2
TGFb1
CCN2
Growth
Prostatic
PGE
Abrogation
Stem
EBV
Microsatellite
Nrf2
Epstein
Club
TGFB1
ATG4B
Differentiation
EMMPRIN
Smad3
Genetic
Sorafenib
IFN
Impacts
Key
Activated
AKT
Th2
PTEN
USP11
IL
Effect
HDAC4
Free
Sunitinib
Established
Fluid
Decisive
Inhibits
Marks
mTORC2
Trigger
Concentration
Intratracheal
Participation
Against
Expression
kappaB
Role
Rats
Intrinsic
Epigenetic
Smooth
NOX4
Tumor
Rapamycin
microRNA
Overexpression
Current
Muc5ac
Combined
II
D1
Accelerated
Regulator
Pleural
Invasive
alphaEbeta7
From
MUM
Immunoglobulin
Beta1
Small
Sphingolipids
Stiffening
FFPE
miR
LPA1
B4
Two
Extracellular
Enhances
Evaluation
Recent
Elevated
Re
IIP
CD44
Interactions
CXCL9
Protective
Ets
Preventive
Establishment
ALK5
Increased
Values
kDa
Large
May
Transcription
ACLP
Cryptogenic
Ubiquitin
GLI
L5
Discoidin
Bleomycin
Carbon
Renin
CUX1
Correct
Constitutive
SNAI
Bile
Assessment
Fibrotic
Differing
Development
Channel
Simvastatin
CAM
Fibroblasts
Melatonin
SIRT6
STAT3
Tensin
Pingfei
Stanniocalcin
Bax
Group
mRNA
Selectivity
Emphysema
Barr
Berberine
Metformin
Hsp90
CDCP1
T869C
EMT
ADAM
Cub
Pneumonia
Induces
FOXF1
Upregulation
H441
RAGE
Myofibroblasts
JAK2
Interstitial
Amplified
Fibrosis
Microarray
Developmental
CTGF
Serine
Integrin
AP
Fas
During
CCN5
Insulin
Pleiotropic
TGFb
Evidence
Phenoconversion
Comparison
Smoke
Box
microRNAs
Anti
Suppression
A549
Chop
Jun
Myofibroblast
Dysfunction
Axis
IPF
MS80
S1P
Inhibitory
Interleukin
Action
Bioenergetics
Transition
Hyper
Lrp5
Model
cAMP
Medical
SDKP
UIP
Animal
Forkhead
lncRNA
Lung
Antifibrotic
Induction
Titration
Epithelial
OSF
ATPase
Reactive
TNF
aVb6
Molecule
NEU1
Deregulation
Idiopathic
An
Using
Quantifying
BMPR2
Foxp3high
sL1
Profile
Sulf2
Mediates
Markers
VCAM
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
import numpy as np
import logging
import os
from functools import partial
from pdb import set_trace as st
# Configure the root logger once at import time: timestamped messages,
# INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
class wisse(object):
    """Sentence embedder that combines the word vectors of a sentence.

    Both the TFIDFVectorizer and the word embedding model must be pretrained,
    either from the local sentence corpus or from model persistence.

    Parameters
    ----------
    embeddings : mapping
        Word -> vector lookup. Looking up an unknown word must raise
        ``KeyError`` (such words are reported as missing).
    vectorizer : object or falsy value
        Fitted vectorizer exposing ``build_tokenizer()``, ``transform()``,
        ``vocabulary_`` and ``idf_``. If falsy (``None``/``False``), every
        word gets weight 1.0 and a default word tokenizer is used.
    tf_tfidf : bool
        If true, word weights are read from ``vectorizer.transform``;
        otherwise they are read from ``vectorizer.idf_``.
    combiner : str
        Any value starting with ``"avg"`` averages the word vectors;
        everything else sums them.
    """

    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner = "sum"):
        if vectorizer:
            self.tokenize = vectorizer.build_tokenizer()
        else:
            # No vectorizer was given (uniform weights mode). Fall back to a
            # plain word tokenizer instead of failing on a missing
            # build_tokenizer attribute.
            import re
            self.tokenize = re.compile(r"(?u)\b\w\w+\b").findall
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis = 0)
        else:
            self.comb = partial(np.sum, axis = 0)

    def fit(self, X, y = None):  # Scikit-learn template
        """Remember a list of sentences for later iteration; return self."""
        if isinstance(X, list):
            self.sentences = X
        return self

    def transform(self, X):
        """Store a list of sentences (returning self) or embed one string.

        Any other input type yields ``None``.
        """
        if isinstance(X, list):
            return self.fit(X)
        elif isinstance(X, str):
            return self.infer_sentence(X)
        return None

    def fit_transform(self, X, y=None):
        """Same as ``transform``; present for the scikit-learn interface."""
        return self.transform(X)

    def infer_sentence(self, sent):
        """Embed a single sentence.

        Returns ``None`` when the sentence has no tokens or no token has a
        weight; otherwise a tuple ``(missing_cbow, missing_bow, vector)``
        holding the words without an embedding, the words without a weight
        and the combined sentence vector.
        """
        tokens = self.tokenize(sent)
        if not tokens:
            return None

        self.weights, missing_bow = self.infer_tfidf_weights(tokens)
        if not self.weights:
            return None

        missing_cbow = []
        vectors = []
        for w in self.weights:
            try:
                vectors.append(self.embedding[w])
            except KeyError:
                missing_cbow.append(w)
            except IndexError:
                continue

        # NOTE(review): the weights only select which words take part; they
        # are not multiplied into the word vectors before combining. Confirm
        # whether a weighted combination was intended.
        return missing_cbow, missing_bow, self.comb(np.array(vectors))

    def infer_tfidf_weights(self, sentence):
        """Return ``(weights, missing)`` for a tokenized sentence.

        ``weights`` maps each word found in the vectorizer vocabulary to its
        weight and ``missing`` lists the words that are not in it. Without a
        vectorizer every word gets weight 1.0.
        """
        existent = {}
        missing = []

        if not self.tfidf:
            for word in sentence:
                existent[word] = 1.0
            return existent, missing

        vocabulary = self.tfidf.vocabulary_
        if self.pred_tfidf:
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][vocabulary[word]]
                except KeyError:
                    missing.append(word)
        else:
            # Read the IDF table from the vectorizer held by this instance
            # (the previous code referenced an undefined global name here).
            idf = self.tfidf.idf_
            for word in sentence:
                try:
                    weight = idf[vocabulary[word]]
                except KeyError:
                    missing.append(word)
                    continue
                # Low-IDF words are kept but given a small constant weight.
                existent[word] = weight if weight > 2 else 0.01

        return existent, missing

    def __iter__(self):
        """Yield the transform of every sentence stored by ``fit``."""
        for s in self.sentences:
            yield self.transform(s)
def save_dense(directory, filename, array):
    """Store `array` with ``np.save`` as ``<directory>/<filename>``.

    Nothing is written unless `filename` is purely alphabetic. The function
    returns ``None`` in every case.
    """
    target_dir = os.path.normpath(directory) + '/'
    if not filename.isalpha():
        return None
    np.save(target_dir + filename, array)
def load_dense(filename):
    """Read back an array with ``np.load``.

    `filename` is handed to ``np.load`` unchanged; callers in this module
    pass either a path or an already opened file object.
    """
    dense = np.load(filename)
    return dense
def load_sparse_bsr(filename):
    """Rebuild a SciPy BSR matrix from an ``.npz`` archive.

    The archive must contain the arrays ``data``, ``indices``, ``indptr``
    and ``shape`` (the layout written by ``save_sparse_bsr``).
    """
    # Imported here because the module never imports bsr_matrix at the top.
    from scipy.sparse import bsr_matrix

    # The context manager closes the archive once the arrays have been read.
    with np.load(filename) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=shape)
def save_sparse_bsr(directory, filename, array):
    """Store a sparse matrix in BSR form as ``<directory>/<filename>.npz``.

    `array` must provide ``tobsr()``. Nothing is written unless `filename`
    is purely alphabetic. The function returns ``None`` in every case.
    """
    # note that .npz extension is added automatically
    directory = os.path.normpath(directory) + '/'
    # The guard checks the file name being written (the previous code tested
    # an undefined name and raised NameError).
    if not filename.isalpha():
        return None
    array = array.tobsr()
    np.savez(directory + filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)
    return None
class vector_space(object):
    """Word -> vector lookup backed by one array file per word.

    The files live either in a directory or inside a ``.tar.gz`` archive.
    Dense vectors use the ``.npy`` extension, sparse ones ``.npz``. Indexing
    with an unknown word raises ``KeyError``.
    """

    def __init__(self, directory, sparse = False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        self.words = {}
        if directory.endswith(".tar.gz"):
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            # Key: member base name without the extension; value: member name.
            for member_name in self.tar.getnames():
                key = os.path.basename(member_name).replace(ext, '')
                self.words[key] = member_name
        else:
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            # Key: file name without the extension; value: path to the file.
            for file_name in os.listdir(directory):
                self.words[file_name.replace(ext, '')] = directory + file_name

    def __getitem__(self, item):
        source = self.words[item]
        if self._tar:
            # Archived vectors are read through a file object taken from the
            # open tar handle.
            source = self.tar.extractfile(self.tar.getmember(source))
        if self.sparse:
            return load_sparse_bsr(source)
        return load_dense(source)
def keyed2indexed(keyed_model, output_dir = "word_embeddings/", parallel = True, n_jobs = -1):
    """Dump every word vector of `keyed_model` to its own file in `output_dir`.

    The directory is created when missing. Each word in ``keyed_model.vocab``
    is written through ``save_dense``, either sequentially or, when
    `parallel` is true, through joblib with `n_jobs` workers.
    """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if not parallel:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])
        return

    from joblib import Parallel, delayed
    run_jobs = Parallel(n_jobs = n_jobs, verbose = 10)
    run_jobs(delayed(save_dense)(output_dir, word, keyed_model[word])
             for word, _ in keyed_model.vocab.items())
class streamer(object):
    """Re-iterable stream over the lines of a text file.

    Every iteration reopens `file_name` and yields each line with leading
    and trailing whitespace removed.
    """

    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        # The context manager closes the file when iteration finishes or the
        # generator is discarded, instead of leaving the handle open.
        with open(self.file_name) as lines:
            for s in lines:
                yield s.strip()
No preview for this file type
#!/usr/bin/python
# -*- coding: latin-1 -*-
# Python2.7
from gensim.models.keyedvectors import KeyedVectors as vDB
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
#import numexpr as ne
import argparse
#import _pickle as pickle
#import cPickle as pickle
import logging
import os
from functools import partial
import wisse
# Short alias for the word2vec-format loader of gensim's KeyedVectors.
load_vectors = vDB.load_word2vec_format
# Configure the root logger: timestamped messages, INFO level and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
level=logging.INFO)
if __name__ == "__main__":
    # Command-line example: embed every line of --input as a sentence vector,
    # write "<line index>\t<vector>" rows to the output file and list the
    # words that had no embedding or no weight in a ".missing" file.

    # Imported here because both pickle imports at the top of this script are
    # commented out, while a persisted TFIDF model is unpickled below.
    import pickle

    parser = argparse.ArgumentParser(description="""This use example shows sentence
        embedding by using WISSE. The input is a text file which has a sentece in
        each of its rows. The output file has two tab-separated columns: the index
        line of the sentece in the input file and the sentence vector representation
        .""")
    parser.add_argument("--idfmodel", help = """Input file containing IDF
                        pre-trained weights. If not provided,
                        all word vector weights will be set to
                        1.0. If 'local' tf-idf weights will be
                        computed locally from the input file
                        (pickled sklearn object).""",
                        default = None)
    parser.add_argument("--embedmodel", help = """Input file containing word
                        embeddings model (binary and text
                        are allowed).""", required = True)
    parser.add_argument("--output", help = """Output file containing the sentence
                        embeddings.""", default = "")
    parser.add_argument("--input", help = """Input file containing a sentence
                        by row.""", required = True)
    parser.add_argument("--comb", help = """Desired word vector combination for
                        sentence representation {sum, avg}.
                        (default = 'sum')""", default = "sum")
    parser.add_argument("--suffix", nargs = '?', help = """A suffix to be added
                        to the output file (default = '')""",
                        default = "", required = False)
    parser.add_argument("--tfidf", help="""To predict TFIDF complete weights
                        ('tfidf') or use only partial IDFs
                        ('idf'). (default = 'tfidf')""",
                        default = "tfidf")
    parser.add_argument("--localw", help = """TFIDF word vector weights
                        computed locally from the input file of
                        sentences {freq, binary, sublinear}
                        (default='none').""", default = "none")
    parser.add_argument("--stop", help = """Toggles stripping stop words in
                        locally computed word vector weights.""",
                        action = "store_true")
    parser.add_argument("--format", help = """The format of the embedding model
                        file: {binary, text, wisse}.
                        default = 'binary'""", default = "binary")
    args = parser.parse_args()

    # --idfmodel is optional (default None): normalise it to a string so the
    # checks below do not fail on None.
    idfmodel = args.idfmodel or ""
    local_idf = idfmodel.startswith("local")

    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.info("Embedding model file does not exist (EXIT):\n"
                         "\n%s\n ..." % args.embedmodel)
            exit()
    elif not os.path.exists(args.embedmodel):
        logging.info("Embedding model directory does not exist (EXIT):\n"
                     "\n%s\n ..." % args.embedmodel)
        exit()

    if idfmodel and not local_idf and not os.path.isfile(idfmodel):
        logging.info("IDF model file does not exist (EXIT):\n"
                     "\n%s\n ..." % idfmodel)
        exit()

    if not os.path.isfile(args.input):
        logging.info("Input file does not exist (EXIT):\n"
                     "\n%s\n ..." % args.input)
        exit()

    # These names are needed to build the default output name, so they are
    # computed before it (they used to be assigned only after it was built).
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    tfidf_name = os.path.basename(idfmodel).split(".")[0]

    if args.output != "":
        output_dir = os.path.dirname(args.output)
        if output_dir != "" and not os.path.exists(output_dir):
            logging.info("Output directory does not exist (EXIT):\n"
                         "\n%s\n ..." % args.output)
            exit()
        output_name = args.output
    else:
        name_parts = [embedding_name,
                      args.comb,
                      args.tfidf,
                      "local" if local_idf else tfidf_name,
                      args.suffix]
        # Empty parts (no suffix, no IDF model) are left out of the name.
        suffix = "_".join(part for part in name_parts if part).strip("_")
        output_name = args.input + ".output_" + suffix

    # 'tfidf...' asks for full TFIDF weights; anything else uses IDF only.
    pred_tfidf = args.tfidf.startswith("tfidf")

    vectorizer = TfidfVectorizer(min_df = 1,
                    encoding = "latin-1",
                    decode_error = "replace",
                    lowercase = True,
                    binary = True if args.localw.startswith("bin") else False,
                    sublinear_tf = True if args.localw.startswith("subl") else False,
                    stop_words = "english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if local_idf:
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)
    elif os.path.isfile(idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % idfmodel)
        # NOTE(security): unpickling can run arbitrary code; only load model
        # files from a trusted source.
        with open(idfmodel, 'rb') as f:
            tfidf = pickle.load(f)#, encoding = 'latin-1')
    else:
        # No IDF model given: the sentence model is built without a vectorizer.
        tfidf = False

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary = True,
                                     encoding = "latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary = False,
                                     encoding = "latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse = False)
    except Exception:
        # Log the traceback as well, so the real cause is not hidden.
        logging.exception(
            "Error while loading word embedding model. Verify if the file\n"
            "is broken (EXIT)...\n%s\n" % args.embedmodel)
        exit()

    missing_bow = []    # Stores missing words in the TFIDF model
    missing_cbow = []   # Stores missing words in the W2V model
    sidx = 0            # The index of the sentence according to the input file

    # The model does not depend on the sentence, so it is built once. It now
    # honours --tfidf and --comb instead of hard-coded values.
    series = wisse.wisse(embeddings = embedding, vectorizer = tfidf,
                         tf_tfidf = pred_tfidf, combiner = args.comb)

    logging.info("\n\nEmbedding sentences and saving then to a the output file..\n%s\n" % output_name)
    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:
                # transform() returned None (nothing to embed): skip the line.
                continue
            # At this point you can use the embedding 'vector' for any application as it
            # is a numpy array. Also you can simply save the vectors in text format as
            # follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                                formatter = {'float_kind':lambda x: "%.6f" % x},
                                max_line_width = 20000).strip(']').strip('[') ))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                    embedding_name + "_" +
                    tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)
    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)
        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")