word embeddings

Larisa Morales Soto
Commit d242c75fc1631af57387df82d5f4398cbe6dc518 d242c75f 0 parents
Showing 8 changed files with 1786 additions and 0 deletions
corpora/abstracts-titles.txt
embeddings/abstracts-titles.vec
embeddings/abstracts-titles_indexed_w2v_En_vector_space_H300_local.missing
scripts/__pycache__/wisse.cpython-36.pyc
scripts/abstracts-titles_indexed_w2v_En_vector_space_H300_local.missing
scripts/wisse.py
scripts/wisse.pyc
scripts/wisse_example.py
--- a/corpora/abstracts-titles.txt 0 → 100644
View file @d242c75
+++ b/corpora/abstracts-titles.txt 0 → 100644
View file @d242c75
--- a/embeddings/abstracts-titles.vec 0 → 100644
View file @d242c75
+++ b/embeddings/abstracts-titles.vec 0 → 100644
View file @d242c75
--- a/embeddings/abstracts-titles_indexed_w2v_En_vector_space_H300_local.missing 0 → 100644
View file @d242c75
+++ b/embeddings/abstracts-titles_indexed_w2v_En_vector_space_H300_local.missing 0 → 100644
View file @d242c75
+ # missing word embeddings:
+ 29c
+ 14
+ 31
+ 26a
+ beta1
+ 486
+ profibrotic
+ 92a
+ 29a
+ matricellular
+ 155
+ 13
+ b1
+ etiopathology
+ nexin
+ avb6
+ 21
+ 19
+ acetylglucosaminidase
+ 326
+ dermatan
+ 1b
+ p38
+ 47
+ 68
+ 3p
+ dedifferentiating
+ alpha3beta1
+ chymase
+ 2alpha
+ 90
+ 7d
+ 5100
+ 18a
+ 1a
+ alpha3
+ 150
+ 1343
+ 140
+ 541
+ fibrogenesis
+ 221
+ mitophagy
+ b4
+ p63
+ 10
+ 5p
+ haptotactic
+ nonkinase
+ farnesoid
+ lymphopoietin
+ 7a
+ 18
+ 338
+ 1beta
+ 424
+ 199a
+ 154
+ smad2
+ 17
+ 185
+ bronchoalveolar
+ 101
+ profibrogenic
+ 153
+ gambogic
+ ubiquitinating
+ p110y
+ 200
+ 323a
+ 196a
+ dysregulates
+ fibrogenic
+ salvianolic
+ # missing MI weights:
+ Tumor
+ sTNFR
+ Compromised
+ Modulating
+ Renin
+ Invasive
+ Cytokine
+ D1
+ C3aR
+ Prostaglandin
+ Diagnostic
+ HLA
+ Interstitial
+ Transforming
+ Phenotypes
+ Up
+ Clinical
+ TGFb
+ Interplay
+ Animal
+ Decoction
+ Foxp3high
+ Leucine
+ CCL2
+ SDKP
+ IL
+ Correct
+ Elk1
+ Participation
+ Azithromycin
+ Amplification
+ 1A
+ Endothelin
+ Kinase
+ Possible
+ Associated
+ MicroRNA
+ sL1
+ Action
+ For
+ Development
+ Establishment
+ Extracellular
+ Epstein
+ MUM
+ Serine
+ Interactions
+ Macrophage
+ Pseudomonas
+ Cytokines
+ IGFBP
+ Calu
+ PI3K
+ B4
+ Chinese
+ Cigarette
+ BALF
+ NK
+ alphaEbeta7
+ microRNAs
+ Gene
+ True
+ Pneumonia
+ CUX1
+ miRNA
+ Ubiquitin
+ Wnt
+ CDCP1
+ Barr
+ ORP150
+ Curcumin
+ aB
+ Myofibroblast
+ JNK
+ Aortic
+ L5
+ Hsp90
+ Smoke
+ IFN
+ Reactive
+ Significance
+ aVb6
+ Th2
+ Constitutive
+ Melatonin
+ Mediates
+ AP
+ Lindau
+ ALK5
+ AKT2
+ EMT
+ Reversion
+ Patients
+ Solution
+ Focal
+ Immunoglobulin
+ RhoA
+ Neutrophil
+ BMPR2
+ CXCL9
+ Lower
+ Olodaterol
+ Lavage
+ Induces
+ Induced
+ Beyond
+ MMP
+ Fstl1
+ cAMP
+ Aging
+ SPARC
+ Activity
+ ECM
+ Human
+ Fibrotic
+ Repression
+ Prostatic
+ NOX4
+ Against
+ Discovery
+ Resveratrol
+ Interleukin
+ Deleted
+ Differentiation
+ Spiruchostatin
+ Insulin
+ PXS64
+ MCTC
+ HP
+ Triptolide
+ Values
+ Differential
+ Lysyl
+ FGF
+ Promote
+ Yin
+ Tissue
+ BMP
+ CAM
+ Discoidin
+ Overproduction
+ Wilms
+ Regulates
+ Reduced
+ Mesenchymal
+ Inflammation
+ PINK1
+ Release
+ E2
+ CD44V6
+ TGFbeta1
+ Rg1
+ H441
+ kDa
+ M2
+ Expression
+ Data
+ Contribution
+ Negative
+ Nintedanib
+ Pulmonary
+ Emphysema
+ Intrinsic
+ 17A
+ Marks
+ 26S
+ RXFP1
+ Neovessel
+ Genetic
+ Protective
+ Tensin
+ Inducer
+ Like
+ Determining
+ Increased
+ Microsatellite
+ Sphingosine
+ TRPV4
+ Club
+ Methylation
+ Phenoconversion
+ Serpin
+ Activated
+ Muc5ac
+ MyD88
+ Glucagon
+ IPF
+ SIRT6
+ Rapamycin
+ Essential
+ TNFalpha
+ Corilagin
+ Sorafenib
+ Epithelial
+ T869C
+ Induction
+ Long
+ Wt1
+ Molecules
+ TGFBR2
+ P110
+ Nuclear
+ Old
+ Targeting
+ WNT7B
+ Thy
+ Potential
+ TGF
+ Tubastatin
+ Semaphorin
+ Attenuating
+ Smad
+ Pigment
+ Homolog
+ Binding
+ microRNA
+ C57BL
+ Regulating
+ Implications
+ Hydrogen
+ BARD1
+ A549
+ Homeostasis
+ Selectivity
+ Medical
+ Model
+ Cytoskeletal
+ Differing
+ BMP3
+ Enhances
+ NADPH
+ Fibrogenesis
+ Defect
+ Two
+ FAK
+ RNA
+ Quantifying
+ Epigenetic
+ Profibrotic
+ Ambroxol
+ Trigger
+ Titration
+ Transcription
+ Regulation
+ Mitochondrial
+ H1N1
+ Recent
+ BMPER
+ PPARs
+ VCAM
+ Microsomal
+ Hippel
+ Renshen
+ Absence
+ Anchorage
+ Applying
+ Free
+ OSF
+ PGE
+ Tannic
+ Plasminogen
+ TGFBR
+ Channel
+ Age
+ Cell
+ Connective
+ Proteasomal
+ RAGE
+ Bach1
+ Pirfenidone
+ Outcomes
+ GATA
+ Small
+ Autoimmunity
+ III
+ VEGF
+ Control
+ HSP27
+ Cartilage
+ Periostin
+ Idiopathic
+ COL1A1
+ CBP
+ Bronchoalveolar
+ Crosstalk
+ Amplified
+ Evidence
+ Simvastatin
+ Sphingolipids
+ Mechanisms
+ JAK2
+ Rats
+ Mice
+ Protease
+ From
+ LPA1
+ Collagen
+ Carbon
+ Molecular
+ Stat3
+ Genomewide
+ Stem
+ S1P
+ Novel
+ EBV
+ Serum
+ Abrogation
+ That
+ Pingfei
+ Stromal
+ Current
+ Molecule
+ MAP3K19
+ Decisive
+ Protein
+ Fluid
+ HDAC4
+ Angiotensin
+ SOCS1
+ Different
+ Membrane
+ Domain
+ Secretory
+ Signalling
+ NCI
+ Bax
+ ADAM
+ Are
+ Beta1
+ Activation
+ Problem
+ Prognostic
+ II
+ Sputum
+ Phosphatase
+ Inhibition
+ Profile
+ Dogs
+ HRCT
+ lncRNA
+ Storage
+ Nitrated
+ Box
+ Forkhead
+ CREB
+ Sirtuin
+ Cryptogenic
+ Decreased
+ Inhibits
+ Formation
+ MK2
+ Comparison
+ Mediated
+ Latent
+ Recombinant
+ Microencapsulation
+ PHGDH
+ Organizing
+ Dysfunction
+ Way
+ Using
+ Peripheral
+ Markers
+ MiR
+ Anti
+ Studies
+ May
+ Significant
+ Morphogenic
+ Low
+ Lactic
+ Overexpression
+ Protects
+ Arsenic
+ Caveolin
+ pH
+ Inhibit
+ Proteasome
+ MicroRNAs
+ Toll
+ Herpes
+ CTGF
+ Normal
+ Defective
+ CD44
+ Large
+ Ligands
+ Axis
+ NH2
+ Progression
+ Smad3
+ Phenotype
+ Ets
+ Identification
+ kB
+ Role
+ Relation
+ Mode
+ Developmental
+ Fibrosis
+ Stanniocalcin
+ WNT10A
+ Integrated
+ Syndecan
+ Metalloproteinase
+ TOB2
+ USP11
+ WISP1
+ Dysregulated
+ Th17
+ Progressive
+ Key
+ Subpleural
+ Mast
+ Rho
+ Growth
+ Upregulation
+ Alleviates
+ Re
+ Preventive
+ ITGB6
+ Fibroblasts
+ ATG4B
+ Comparative
+ Cthrc1
+ mRNA
+ Peptide
+ SNAI
+ BM
+ ATPase
+ AKT
+ Fibroblastic
+ Matriptase
+ Sub
+ Sustained
+ Pleiotropic
+ New
+ Regulator
+ Receptor
+ Therapeutic
+ Vimentin
+ IGF
+ Cells
+ LIGHT
+ Production
+ D2
+ Dehydroepiandrosterone
+ Lin28B
+ Antifibrotic
+ Raised
+ Proliferation
+ Dependent
+ COL1
+ Lysocardiolipin
+ Epithelium
+ STAT3
+ Prevents
+ Th1
+ NF
+ CCN5
+ Snail
+ Myogenic
+ CD4
+ Akt
+ TGFb1
+ Accelerated
+ PDGF
+ Intratracheal
+ TGFB1
+ Cysteine
+ Oxidant
+ Effect
+ Reprogramming
+ IIP
+ MS80
+ FOXF1
+ Promotes
+ Assessment
+ BLM
+ CC16
+ BAL
+ CD248
+ Ginsenoside
+ Secreted
+ Association
+ IQ
+ mTORC2
+ Established
+ The
+ Combined
+ Jun
+ UIP
+ Sulf2
+ Thalidomide
+ Bioenergetics
+ TNF
+ CCN2
+ NEU1
+ Attenuates
+ HMGA2
+ Group
+ Conversion
+ Predisposition
+ Transglutaminase
+ Pathway
+ Reviews
+ Treg
+ DDR2
+ Autophagy
+ Hyper
+ Bile
+ Sunitinib
+ Stiffening
+ Signal
+ Resolution
+ De
+ Type
+ Factor
+ Smad2
+ Single
+ PPAR
+ WNT5A
+ Novo
+ An
+ EGFR
+ Cub
+ GLI
+ HSP47
+ Early
+ ERK1
+ TGFbeta
+ Deficiency
+ hydroxytryptamine2A
+ BAX
+ Inhibitory
+ Integrin
+ Suppression
+ Shikonin
+ SMAD3
+ Effects
+ Metformin
+ F1
+ MAPK
+ Modulation
+ Bleomycin
+ Injury
+ Elevated
+ Cellular
+ Radioligand
+ Citrus
+ TIAM1
+ Subjects
+ Lung
+ ARPC2
+ H19
+ EZH2
+ Pathways
+ Is
+ Microarray
+ Fas
+ CCN1
+ Ac
+ miRNAs
+ Myofibroblasts
+ FFPE
+ Inhibitor
+ During
+ Matrix
+ Nrf2
+ Immunomodulation
+ C5aR
+ Gremlin
+ High
+ Concentration
+ Evaluation
+ Roles
+ Number
+ Bone
+ ACLP
+ Hypertension
+ Lipogenic
+ Uncoupling
+ Signaling
+ Lrp5
+ Berberine
+ A4
+ CD11c
+ miR
+ Chop
+ Galectin
+ Alveolar
+ Transition
+ Plasma
+ Impacts
+ Smad4
+ Its
+ Pathogenesis
+ Inappropriate
+ Investigation
+ Beta
+ Ca
+ ERK
+ Deregulation
+ MSCs
+ PTEN
+ Lipoxin
+ Nitric
+ C1q
+ KCa3
+ kappaB
+ Involvement
+ MCP
+ Pleural
+ EMMPRIN
+ Smooth
+ Synthesis
+ Blockade
+ Compared
+ Transgelin
--- a/scripts/__pycache__/wisse.cpython-36.pyc 0 → 100644
View file @d242c75
+++ b/scripts/__pycache__/wisse.cpython-36.pyc 0 → 100644
View file @d242c75
--- a/scripts/abstracts-titles_indexed_w2v_En_vector_space_H300_local.missing 0 → 100644
View file @d242c75
+++ b/scripts/abstracts-titles_indexed_w2v_En_vector_space_H300_local.missing 0 → 100644
View file @d242c75
+ # missing word embeddings:
+ profibrogenic
+ 199a
+ p38
+ beta1
+ 68
+ etiopathology
+ 1343
+ lymphopoietin
+ 29c
+ 185
+ 5p
+ 17
+ dermatan
+ 1a
+ 13
+ 424
+ 101
+ p63
+ 140
+ b1
+ fibrogenic
+ gambogic
+ nonkinase
+ 21
+ alpha3
+ 154
+ 2alpha
+ chymase
+ 18a
+ 196a
+ 5100
+ smad2
+ 7d
+ 541
+ 1b
+ acetylglucosaminidase
+ 326
+ 47
+ dysregulates
+ 92a
+ 200
+ 29a
+ 90
+ 31
+ mitophagy
+ b4
+ 3p
+ nexin
+ dedifferentiating
+ 155
+ 150
+ ubiquitinating
+ 10
+ 486
+ 19
+ avb6
+ fibrogenesis
+ farnesoid
+ haptotactic
+ alpha3beta1
+ 14
+ 323a
+ matricellular
+ 7a
+ profibrotic
+ bronchoalveolar
+ 26a
+ 18
+ salvianolic
+ 338
+ 1beta
+ p110y
+ 221
+ 153
+ # missing MI weights:
+ Compared
+ Are
+ True
+ Ambroxol
+ Diagnostic
+ Alveolar
+ Smad2
+ Neovessel
+ RXFP1
+ Normal
+ Shikonin
+ Spiruchostatin
+ ORP150
+ Tubastatin
+ That
+ Bone
+ WISP1
+ Wt1
+ Smad4
+ ECM
+ Syndecan
+ Radioligand
+ BAX
+ De
+ FAK
+ Prevents
+ Endothelin
+ kB
+ Promote
+ Reversion
+ Determining
+ Cytokines
+ Glucagon
+ Pathways
+ Myogenic
+ SOCS1
+ Investigation
+ Regulating
+ Targeting
+ Decoction
+ Stromal
+ PPARs
+ HP
+ Focal
+ Transgelin
+ Association
+ Effects
+ EGFR
+ Gene
+ Human
+ Metalloproteinase
+ Lower
+ Rg1
+ Binding
+ Therapeutic
+ Mesenchymal
+ CD248
+ Formation
+ Cysteine
+ Caveolin
+ Type
+ Signaling
+ Molecular
+ Alleviates
+ Early
+ Transforming
+ Potential
+ COL1A1
+ Plasminogen
+ Factor
+ Semaphorin
+ CC16
+ Integrated
+ Like
+ Stat3
+ Tissue
+ Signalling
+ Phenotype
+ TGFBR2
+ Homolog
+ III
+ MiR
+ Lactic
+ Pulmonary
+ Fibroblastic
+ Defect
+ Molecules
+ Yin
+ MCP
+ MicroRNA
+ LIGHT
+ Beyond
+ Recombinant
+ Compromised
+ Ginsenoside
+ P110
+ Production
+ Lipogenic
+ HRCT
+ Its
+ Implications
+ Problem
+ NH2
+ Fibrogenesis
+ TOB2
+ SMAD3
+ Lin28B
+ Significance
+ Differential
+ Cytokine
+ Progressive
+ Solution
+ Identification
+ Peptide
+ Synthesis
+ Protein
+ Macrophage
+ PDGF
+ Repression
+ CREB
+ Cellular
+ Plasma
+ A4
+ Latent
+ Wnt
+ Proteasome
+ Kinase
+ Proteasomal
+ Pathway
+ Sirtuin
+ MSCs
+ D2
+ Absence
+ Cells
+ Thalidomide
+ Regulation
+ Hippel
+ pH
+ Chinese
+ Th17
+ Uncoupling
+ Periostin
+ Promotes
+ Amplification
+ Smad
+ Profibrotic
+ Patients
+ Subpleural
+ Cytoskeletal
+ Progression
+ Lavage
+ Angiotensin
+ Domain
+ Peripheral
+ Inhibitor
+ Associated
+ Involvement
+ Serum
+ Toll
+ Activation
+ SPARC
+ Attenuating
+ Resveratrol
+ PI3K
+ Induced
+ Matrix
+ Leucine
+ BALF
+ Defective
+ Negative
+ JNK
+ Receptor
+ Reprogramming
+ CD11c
+ Nuclear
+ hydroxytryptamine2A
+ Is
+ Sustained
+ Essential
+ Beta
+ CBP
+ miRNA
+ Pathogenesis
+ Aging
+ Neutrophil
+ Nitrated
+ Resolution
+ Signal
+ Low
+ New
+ Contribution
+ Homeostasis
+ HLA
+ Sub
+ 26S
+ HMGA2
+ Treg
+ Significant
+ Blockade
+ HSP27
+ Clinical
+ Th1
+ Triptolide
+ Dependent
+ Inflammation
+ VEGF
+ PINK1
+ H1N1
+ E2
+ Discovery
+ Interplay
+ Secretory
+ CCL2
+ Dehydroepiandrosterone
+ Modulation
+ Mechanisms
+ MMP
+ Mediated
+ H19
+ Morphogenic
+ Ac
+ Corilagin
+ Tannic
+ Hypertension
+ WNT10A
+ MCTC
+ Possible
+ Studies
+ Pigment
+ Wilms
+ Hydrogen
+ Azithromycin
+ Number
+ Transglutaminase
+ Outcomes
+ NCI
+ RNA
+ WNT5A
+ Mice
+ Methylation
+ Novel
+ Nitric
+ Cell
+ HSP47
+ TRPV4
+ Protease
+ Release
+ For
+ Ligands
+ TGF
+ Epithelium
+ Aortic
+ NADPH
+ Herpes
+ Bach1
+ Ca
+ Dysregulated
+ Membrane
+ TGFBR
+ Connective
+ Decreased
+ TIAM1
+ Serpin
+ Fstl1
+ CCN1
+ BLM
+ Sphingosine
+ C57BL
+ Data
+ Dogs
+ Organizing
+ IGF
+ COL1
+ CD44V6
+ BMPER
+ ARPC2
+ Galectin
+ Lindau
+ Inappropriate
+ Microencapsulation
+ Oxidant
+ M2
+ Renshen
+ Sputum
+ Snail
+ Inducer
+ Prognostic
+ Storage
+ MAPK
+ Citrus
+ PPAR
+ Collagen
+ Matriptase
+ Arsenic
+ Long
+ F1
+ Deleted
+ Genomewide
+ PXS64
+ Lysyl
+ ERK
+ Calu
+ MyD88
+ aB
+ Activity
+ Applying
+ Secreted
+ Control
+ BM
+ Mitochondrial
+ Age
+ EZH2
+ Overproduction
+ Way
+ The
+ DDR2
+ 1A
+ Rho
+ Bronchoalveolar
+ TGFbeta1
+ Akt
+ ERK1
+ Novo
+ Curcumin
+ FGF
+ C5aR
+ 17A
+ Lysocardiolipin
+ Protects
+ Predisposition
+ Thy
+ C1q
+ Nintedanib
+ High
+ KCa3
+ Olodaterol
+ Reviews
+ Proliferation
+ Immunomodulation
+ Attenuates
+ Gremlin
+ Cthrc1
+ Vimentin
+ Elk1
+ Lipoxin
+ IQ
+ Roles
+ BAL
+ Relation
+ Autophagy
+ IGFBP
+ Inhibition
+ BMP
+ Anchorage
+ ITGB6
+ Mode
+ Modulating
+ miRNAs
+ Inhibit
+ PHGDH
+ Up
+ Phosphatase
+ TGFbeta
+ C3aR
+ Pseudomonas
+ Comparative
+ Reduced
+ Crosstalk
+ Conversion
+ Injury
+ Phenotypes
+ CD4
+ MicroRNAs
+ Regulates
+ TNFalpha
+ Pirfenidone
+ Raised
+ Old
+ Cartilage
+ Prostaglandin
+ BMP3
+ BARD1
+ Deficiency
+ RhoA
+ AKT2
+ NF
+ Cigarette
+ GATA
+ MAP3K19
+ sTNFR
+ NK
+ Different
+ Subjects
+ Autoimmunity
+ Mast
+ Single
+ Microsomal
+ WNT7B
+ MK2
+ TGFb1
+ CCN2
+ Growth
+ Prostatic
+ PGE
+ Abrogation
+ Stem
+ EBV
+ Microsatellite
+ Nrf2
+ Epstein
+ Club
+ TGFB1
+ ATG4B
+ Differentiation
+ EMMPRIN
+ Smad3
+ Genetic
+ Sorafenib
+ IFN
+ Impacts
+ Key
+ Activated
+ AKT
+ Th2
+ PTEN
+ USP11
+ IL
+ Effect
+ HDAC4
+ Free
+ Sunitinib
+ Established
+ Fluid
+ Decisive
+ Inhibits
+ Marks
+ mTORC2
+ Trigger
+ Concentration
+ Intratracheal
+ Participation
+ Against
+ Expression
+ kappaB
+ Role
+ Rats
+ Intrinsic
+ Epigenetic
+ Smooth
+ NOX4
+ Tumor
+ Rapamycin
+ microRNA
+ Overexpression
+ Current
+ Muc5ac
+ Combined
+ II
+ D1
+ Accelerated
+ Regulator
+ Pleural
+ Invasive
+ alphaEbeta7
+ From
+ MUM
+ Immunoglobulin
+ Beta1
+ Small
+ Sphingolipids
+ Stiffening
+ FFPE
+ miR
+ LPA1
+ B4
+ Two
+ Extracellular
+ Enhances
+ Evaluation
+ Recent
+ Elevated
+ Re
+ IIP
+ CD44
+ Interactions
+ CXCL9
+ Protective
+ Ets
+ Preventive
+ Establishment
+ ALK5
+ Increased
+ Values
+ kDa
+ Large
+ May
+ Transcription
+ ACLP
+ Cryptogenic
+ Ubiquitin
+ GLI
+ L5
+ Discoidin
+ Bleomycin
+ Carbon
+ Renin
+ CUX1
+ Correct
+ Constitutive
+ SNAI
+ Bile
+ Assessment
+ Fibrotic
+ Differing
+ Development
+ Channel
+ Simvastatin
+ CAM
+ Fibroblasts
+ Melatonin
+ SIRT6
+ STAT3
+ Tensin
+ Pingfei
+ Stanniocalcin
+ Bax
+ Group
+ mRNA
+ Selectivity
+ Emphysema
+ Barr
+ Berberine
+ Metformin
+ Hsp90
+ CDCP1
+ T869C
+ EMT
+ ADAM
+ Cub
+ Pneumonia
+ Induces
+ FOXF1
+ Upregulation
+ H441
+ RAGE
+ Myofibroblasts
+ JAK2
+ Interstitial
+ Amplified
+ Fibrosis
+ Microarray
+ Developmental
+ CTGF
+ Serine
+ Integrin
+ AP
+ Fas
+ During
+ CCN5
+ Insulin
+ Pleiotropic
+ TGFb
+ Evidence
+ Phenoconversion
+ Comparison
+ Smoke
+ Box
+ microRNAs
+ Anti
+ Suppression
+ A549
+ Chop
+ Jun
+ Myofibroblast
+ Dysfunction
+ Axis
+ IPF
+ MS80
+ S1P
+ Inhibitory
+ Interleukin
+ Action
+ Bioenergetics
+ Transition
+ Hyper
+ Lrp5
+ Model
+ cAMP
+ Medical
+ SDKP
+ UIP
+ Animal
+ Forkhead
+ lncRNA
+ Lung
+ Antifibrotic
+ Induction
+ Titration
+ Epithelial
+ OSF
+ ATPase
+ Reactive
+ TNF
+ aVb6
+ Molecule
+ NEU1
+ Deregulation
+ Idiopathic
+ An
+ Using
+ Quantifying
+ BMPR2
+ Foxp3high
+ sL1
+ Profile
+ Sulf2
+ Mediates
+ Markers
+ VCAM
--- a/scripts/wisse.py 0 → 100644
View file @d242c75
+++ b/scripts/wisse.py 0 → 100644
View file @d242c75
+ #!/usr/bin/python
+ # -*- coding: latin-1 -*-
+ # Python2.7
+ 
+ import numpy as np
+ import logging
+ import os
+ from functools import partial
+ from pdb import set_trace as st
+ logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
+                     level=logging.INFO)
+ 
+ 
class wisse(object):
    """Sentence embedder that combines the word vectors of a sentence.

    Both the TFIDFVectorizer and the word embedding model must be pretrained,
    either from the local sentence corpus or from model persistence.

    Parameters
    ----------
    embeddings : object indexable by word
        ``embeddings[word]`` must return the vector of ``word`` and raise
        ``KeyError`` when the word is unknown.
    vectorizer : fitted TF-IDF vectorizer, or a falsy value
        When truthy it must provide ``build_tokenizer()``, ``transform()``,
        ``vocabulary_`` and ``idf_``.  When falsy (``None``/``False``) every
        token receives the weight 1.0.
    tf_tfidf : bool
        True to read the weights from ``vectorizer.transform``; False to
        read them from ``vectorizer.idf_``.
    combiner : str
        A value starting with "avg" averages the word vectors; anything
        else sums them.
    """

    # Fallback token pattern (tokens of two or more word characters); this is
    # the pattern scikit-learn vectorizers use by default.
    _DEFAULT_TOKEN_PATTERN = r"(?u)\b\w\w+\b"

    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner = "sum"):
        if vectorizer:
            self.tokenize = vectorizer.build_tokenizer()
        else:
            # Without a vectorizer there is no tokenizer to borrow; without
            # this fallback the uniform-weight mode could never be reached.
            import re
            self.tokenize = re.compile(self._DEFAULT_TOKEN_PATTERN).findall
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        # Defined up front so that iterating before fit() yields nothing
        # instead of raising AttributeError.
        self.sentences = []
        reducer = np.mean if combiner.startswith("avg") else np.sum
        self.comb = partial(reducer, axis = 0)


    def fit(self, X, y = None): # Scikit-learn template
        """Remember ``X`` (when it is a list of sentences) and return self."""
        if isinstance(X, list):
            self.sentences = X

        return self


    def transform(self, X):
        """Embed one sentence, or register a list of sentences.

        A ``str`` is embedded right away (see ``infer_sentence``).  A ``list``
        is stored through ``fit`` and ``self`` is returned, so the caller can
        iterate over this object to obtain the embeddings lazily.  Any other
        input yields ``None``.
        """
        if isinstance(X, list):
            return self.fit(X)

        if isinstance(X, str):
            return self.infer_sentence(X)

        return None


    def fit_transform(self, X, y=None):
        """Alias of ``transform`` kept for the scikit-learn interface."""
        return self.transform(X)


    def infer_sentence(self, sent):
        """Embed a single sentence.

        Returns ``None`` when the sentence has no tokens or no token has a
        weight; otherwise a tuple ``(missing_cbow, missing_bow, vector)``
        holding the words without embedding, the words without TF-IDF weight
        and the combined sentence vector.
        """
        tokens = self.tokenize(sent)
        if not tokens:
            return None

        self.weights, missing_bow = self.infer_tfidf_weights(tokens)
        if not self.weights:
            return None

        missing_cbow = []
        vectors = []
        # NOTE(review): the weights only decide which words take part; the
        # vectors themselves are combined unscaled, exactly as before.
        # Confirm whether scaling by the weight was intended.
        for word in self.weights:
            try:
                vectors.append(self.embedding[word])
            except KeyError:
                missing_cbow.append(word)
            except IndexError:
                continue

        # NOTE(review): when no word has an embedding the combiner receives
        # an empty array (a scalar 0.0 for "sum"); behaviour kept unchanged.
        return missing_cbow, missing_bow, self.comb(np.array(vectors))


    def infer_tfidf_weights(self, sentence):
        """Map every token of ``sentence`` to its weight.

        Returns ``(existent, missing)``: a dict token -> weight and the list
        of tokens that are absent from the vectorizer vocabulary.
        """
        existent = {}
        missing = []

        if not self.tfidf:
            for word in sentence:
                existent[word] = 1.0

            return existent, missing

        vocabulary = self.tfidf.vocabulary_
        # A lowercasing vectorizer stores lowercased vocabulary keys, so the
        # raw token must be lowercased for the lookup; otherwise every
        # capitalised word is reported as missing and dropped.
        lowercase = getattr(self.tfidf, "lowercase", False)

        if self.pred_tfidf:
            row = self.tfidf.transform([" ".join(sentence)]).toarray()[0]
        else:
            row = None

        for word in sentence:
            key = word.lower() if lowercase else word
            try:
                index = vocabulary[key]
            except KeyError:
                missing.append(word)
                continue

            if row is not None:
                existent[word] = row[index]
            else:
                # Fixed: the original read an undefined global `vectorizer`.
                weight = self.tfidf.idf_[index]
                existent[word] = weight if weight > 2 else 0.01

        return existent, missing


    def __iter__(self):
        """Yield the embedding result of every sentence stored by ``fit``."""
        for s in self.sentences:
            yield self.transform(s)
+ 
+ 
def save_dense(directory, filename, array):
    """Save ``array`` as ``<directory>/<filename>.npy``.

    Only purely alphabetic file names are written; any other name is
    skipped silently.  Always returns ``None``.
    """
    if not filename.isalpha():
        return None

    # np.save appends the ".npy" extension itself.
    np.save(os.path.join(os.path.normpath(directory), filename), array)
    return None
+ 
def load_dense(filename):
    """Read the array that NumPy stored in ``filename`` and return it."""
    dense = np.load(filename)
    return dense
+ 
+ 
def load_sparse_bsr(filename):
    """Rebuild a SciPy BSR matrix from an ``.npz`` archive.

    The archive must contain the entries ``data``, ``indices``, ``indptr``
    and ``shape``.
    """
    # Imported locally: the module never imported bsr_matrix, so the original
    # raised NameError on every call.
    from scipy.sparse import bsr_matrix

    # The context manager closes the archive once the arrays are read.
    with np.load(filename) as loader:
        shape = tuple(int(dim) for dim in loader['shape'])
        return bsr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=shape)
+ 
+ 
def save_sparse_bsr(directory, filename, array):
    """Save a SciPy sparse matrix as ``<directory>/<filename>.npz`` in BSR form.

    Only purely alphabetic file names are written; any other name is
    skipped silently.  Always returns ``None``.
    """
    # Fixed: the original tested an undefined name `word` here.
    if not filename.isalpha():
        return None

    array = array.tobsr()
    # Note that the .npz extension is added automatically by np.savez.
    np.savez(os.path.join(os.path.normpath(directory), filename),
             data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)
    return None
+ 
+ 
class vector_space(object):
    """Word -> vector lookup backed by one array file per word.

    ``directory`` is either a directory or a ``.tar.gz`` archive whose
    members are ``<word>.npy`` files (or ``<word>.npz`` when ``sparse`` is
    True).  Indexing with an unknown word raises ``KeyError``.
    """
    def __init__(self, directory, sparse = False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"

        def strip_ext(name):
            # Remove the extension only at the end of the name; str.replace
            # would also cut it out of the middle of a word.
            return name[:-len(ext)] if name.endswith(ext) else name

        if directory.endswith(".tar.gz"):
            self._tar = True
            import tarfile
            self.tar = tarfile.open(directory)
            # Keys are member base names; values are the full member names.
            self.words = {strip_ext(os.path.basename(member)): member
                          for member in self.tar.getnames()}
        else:
            self._tar = False
            directory = os.path.normpath(directory) + '/'
            self.words = {strip_ext(entry): directory + entry
                          for entry in os.listdir(directory)}


    def __getitem__(self, item):
        """Load and return the stored vector of the word ``item``."""
        source = self.words[item]
        if self._tar:
            # Archive members are read through a file object, not a path.
            source = self.tar.extractfile(self.tar.getmember(source))

        loader = load_sparse_bsr if self.sparse else load_dense
        return loader(source)
+ 
+ 
def keyed2indexed(keyed_model, output_dir = "word_embeddings/", parallel = True, n_jobs = -1):
    """Dump every word vector of ``keyed_model`` into its own file.

    Each word is written through ``save_dense`` into ``output_dir``, which is
    created when missing.  ``keyed_model`` must be indexable by word and
    expose its vocabulary as ``key_to_index`` (gensim 4) or ``vocab`` (older
    gensim).  With ``parallel`` the writes are dispatched through joblib
    using ``n_jobs`` workers.
    """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # gensim 4 removed KeyedVectors.vocab in favour of key_to_index; accept
    # both so the helper works with either version.
    vocabulary = getattr(keyed_model, "key_to_index", None)
    if vocabulary is None:
        vocabulary = keyed_model.vocab
    words = list(vocabulary)

    if parallel:
        from joblib import Parallel, delayed

        Parallel(n_jobs = n_jobs, verbose = 10)(
            delayed(save_dense)(output_dir, word, keyed_model[word])
            for word in words)
    else:
        for word in words:
            save_dense(output_dir, word, keyed_model[word])
+     
+ 
class streamer(object):
    """Re-iterable view over the lines of a text file.

    Every iteration re-opens ``file_name`` and yields each line with the
    surrounding whitespace stripped.
    """
    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        # The with-block closes the handle when the iteration finishes (or
        # the generator is discarded); the original never closed it.
        with open(self.file_name) as handle:
            for line in handle:
                yield line.strip()
--- a/scripts/wisse.pyc 0 → 100644
View file @d242c75
+++ b/scripts/wisse.pyc 0 → 100644
View file @d242c75
--- a/scripts/wisse_example.py 0 → 100644
View file @d242c75
+++ b/scripts/wisse_example.py 0 → 100644
View file @d242c75
+ #!/usr/bin/python
+ # -*- coding: latin-1 -*-
+ # Python2.7
+ from gensim.models.keyedvectors import KeyedVectors as vDB
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ import numpy as np
+ #import numexpr as ne
+ import argparse
+ #import _pickle as pickle
+ #import cPickle as pickle
+ import logging
+ import os
+ from functools import partial
+ import wisse
+ 
+ 
# Shorthand for the gensim loader of word2vec-format embedding files.
load_vectors = vDB.load_word2vec_format

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


if __name__ == "__main__":
    # The module-level pickle imports are commented out, so the module is
    # imported here; it is needed to load a pre-trained IDF model.
    import pickle

    parser = argparse.ArgumentParser(description="""This use example shows sentence 
        embedding by using WISSE. The input is a text file which has a sentece in 
        each of its rows. The output file has two tab-separated columns: the index
        line of the sentece in the input file and the sentence vector representation
        .""")
    parser.add_argument("--idfmodel", help = """Input file containing IDF
                                        pre-trained weights. If not provided,
                                        all word vector weights will be set to
                                        1.0. If 'local' tf-idf weights will be
                                        computed locally from the input file
                                        (pickled sklearn object).""",
                                        default = None)
    parser.add_argument("--embedmodel", help = """Input file containing word
                                            embeddings model (binary and text
                                            are allowed).""", required = True)
    parser.add_argument("--output", help = """Output file containing the sentence
                                            embeddings.""", default = "")
    parser.add_argument("--input", help = """Input file containing a sentence
                                            by row.""", required = True)
    parser.add_argument("--comb", help = """Desired word vector combination for
                                        sentence representation {sum, avg}.
                                        (default = 'sum')""", default = "sum")
    parser.add_argument("--suffix", nargs = '?', help = """A suffix to be added
                                        to the output file (default = '')""",
                                            default = "", required = False)
    parser.add_argument("--tfidf", help="""To predict TFIDF complete weights
                                        ('tfidf') or use only partial IDFs
                                        ('idf'). (default = 'tfidf')""",
                                        default = "tfidf")
    parser.add_argument("--localw", help = """TFIDF word vector weights
                                    computed locally from the input file of
                                    sentences {freq, binary, sublinear}
                                    (default='none').""", default = "none")
    parser.add_argument("--stop", help = """Toggles stripping stop words in
                                    locally computed word vector weights.""",
                                                        action = "store_true")
    parser.add_argument("--format", help = """The format of the embedding model
                                     file: {binary, text, wisse}. 
                                    default = 'binary'""", default = "binary")
    args = parser.parse_args()

    # --idfmodel is optional (default None = uniform weights), so it must be
    # tested for None before any string method is called on it.
    use_local_idf = (args.idfmodel is not None
                     and args.idfmodel.startswith("local"))

    # ---- Validate the input paths; abort with a non-zero exit status. ----
    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.error("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            raise SystemExit(1)
    elif not os.path.exists(args.embedmodel):
        logging.error("""Embedding model directory does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
        raise SystemExit(1)

    if (args.idfmodel is not None and not use_local_idf
            and not os.path.isfile(args.idfmodel)):
        logging.error("""IDF model file does not exist (EXIT):
                \n%s\n ...""" % args.idfmodel)
        raise SystemExit(1)
    if not os.path.isfile(args.input):
        logging.error("""Input file does not exist (EXIT):
                \n%s\n ...""" % args.input)
        raise SystemExit(1)

    # These names feed the default output name, so they have to exist before
    # it is built (the original defined them only afterwards).
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    if args.idfmodel is None:
        tfidf_name = "uniform"
    else:
        tfidf_name = os.path.basename(args.idfmodel).split(".")[0]

    if args.output != "":
        output_dir = os.path.dirname(args.output)
        if output_dir != "" and not os.path.exists(output_dir):
            logging.error("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
            raise SystemExit(1)
        output_name = args.output
    else:
        # "--suffix" without a value yields None (nargs='?'); treat as empty.
        suffix = "_".join([embedding_name,
            args.comb,
            args.tfidf,
            "local" if use_local_idf else tfidf_name,
            args.suffix or ""]).strip("_")
        output_name = args.input + ".output_" + suffix

    # 'tfidf...' selects full TF-IDF weights; any other value falls back to
    # IDF-only weights, which is what the original code effectively did.
    pred_tfidf = args.tfidf.startswith("tfidf")

    vectorizer = TfidfVectorizer(min_df = 1,
                encoding = "latin-1",
                decode_error = "replace",
                lowercase = True,
                binary = True if args.localw.startswith("bin") else False,
                sublinear_tf = True if args.localw.startswith("subl") else False,
                stop_words = "english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if args.idfmodel is None:
        # No IDF model requested: a falsy vectorizer asks for uniform weights.
        tfidf = None
    elif use_local_idf:
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)
    else:
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        # NOTE(security): unpickling can execute arbitrary code; only load
        # IDF model files that come from a trusted source.
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)#, encoding = 'latin-1')

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary = True,
                                                        encoding = "latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary = False,
                                                        encoding = "latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse = False)

    except Exception:
        # logging.exception keeps the traceback, so the real cause stays
        # visible; Ctrl-C and SystemExit are no longer swallowed.
        logging.exception(
            """Error while loading word embedding model. Verify if the file
            is broken (EXIT)...\n%s\n""" % args.embedmodel)
        raise SystemExit(1)

    missing_bow = set()    # Stores missing words in the TFIDF model
    missing_cbow = set()   # Stores missing words in the W2V model
    logging.info("\n\nEmbedding sentences and saving then to a the output file..\n%s\n" % output_name)

    # Built once and reused for every sentence; it now honours the --tfidf
    # and --comb options, which the original parsed but ignored.
    series = wisse.wisse(embeddings = embedding, vectorizer = tfidf,
                         tf_tfidf = pred_tfidf, combiner = args.comb)

    with open(output_name, "w") as fo:
        # sidx is the 1-based index of the sentence in the input file.
        for sidx, sent in enumerate(sentences, 1):
            result = series.transform(sent)
            if result is None:
                # Empty sentence, or no token with a weight: nothing to write.
                continue
            mc, mb, vector = result

            # At this point you can use the embedding 'vector' for any application as it
            # is a numpy array. Also you can simply save the vectors in text format as
            # follows:
            missing_cbow.update(mc)
            missing_bow.update(mb)
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                                formatter = {'float_kind':lambda x: "%.6f" % x},
                                max_line_width = 20000).strip(']').strip('[') ))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                                                        embedding_name + "_" +
                                                        tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)

    # Sorted so that repeated runs over the same input give identical files.
    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in sorted(missing_cbow):
            f.write("%s\n" % w)

        f.write("# missing MI weights:\n")
        for w in sorted(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")