Showing
8 changed files
with
1786 additions
and
0 deletions
corpora/abstracts-titles.txt
0 → 100644
This diff is collapsed. Click to expand it.
embeddings/abstracts-titles.vec
0 → 100644
This diff could not be displayed because it is too large.
1 | +# missing word embeddings: | ||
2 | +29c | ||
3 | +14 | ||
4 | +31 | ||
5 | +26a | ||
6 | +beta1 | ||
7 | +486 | ||
8 | +profibrotic | ||
9 | +92a | ||
10 | +29a | ||
11 | +matricellular | ||
12 | +155 | ||
13 | +13 | ||
14 | +b1 | ||
15 | +etiopathology | ||
16 | +nexin | ||
17 | +avb6 | ||
18 | +21 | ||
19 | +19 | ||
20 | +acetylglucosaminidase | ||
21 | +326 | ||
22 | +dermatan | ||
23 | +1b | ||
24 | +p38 | ||
25 | +47 | ||
26 | +68 | ||
27 | +3p | ||
28 | +dedifferentiating | ||
29 | +alpha3beta1 | ||
30 | +chymase | ||
31 | +2alpha | ||
32 | +90 | ||
33 | +7d | ||
34 | +5100 | ||
35 | +18a | ||
36 | +1a | ||
37 | +alpha3 | ||
38 | +150 | ||
39 | +1343 | ||
40 | +140 | ||
41 | +541 | ||
42 | +fibrogenesis | ||
43 | +221 | ||
44 | +mitophagy | ||
45 | +b4 | ||
46 | +p63 | ||
47 | +10 | ||
48 | +5p | ||
49 | +haptotactic | ||
50 | +nonkinase | ||
51 | +farnesoid | ||
52 | +lymphopoietin | ||
53 | +7a | ||
54 | +18 | ||
55 | +338 | ||
56 | +1beta | ||
57 | +424 | ||
58 | +199a | ||
59 | +154 | ||
60 | +smad2 | ||
61 | +17 | ||
62 | +185 | ||
63 | +bronchoalveolar | ||
64 | +101 | ||
65 | +profibrogenic | ||
66 | +153 | ||
67 | +gambogic | ||
68 | +ubiquitinating | ||
69 | +p110y | ||
70 | +200 | ||
71 | +323a | ||
72 | +196a | ||
73 | +dysregulates | ||
74 | +fibrogenic | ||
75 | +salvianolic | ||
76 | +# missing MI weights: | ||
77 | +Tumor | ||
78 | +sTNFR | ||
79 | +Compromised | ||
80 | +Modulating | ||
81 | +Renin | ||
82 | +Invasive | ||
83 | +Cytokine | ||
84 | +D1 | ||
85 | +C3aR | ||
86 | +Prostaglandin | ||
87 | +Diagnostic | ||
88 | +HLA | ||
89 | +Interstitial | ||
90 | +Transforming | ||
91 | +Phenotypes | ||
92 | +Up | ||
93 | +Clinical | ||
94 | +TGFb | ||
95 | +Interplay | ||
96 | +Animal | ||
97 | +Decoction | ||
98 | +Foxp3high | ||
99 | +Leucine | ||
100 | +CCL2 | ||
101 | +SDKP | ||
102 | +IL | ||
103 | +Correct | ||
104 | +Elk1 | ||
105 | +Participation | ||
106 | +Azithromycin | ||
107 | +Amplification | ||
108 | +1A | ||
109 | +Endothelin | ||
110 | +Kinase | ||
111 | +Possible | ||
112 | +Associated | ||
113 | +MicroRNA | ||
114 | +sL1 | ||
115 | +Action | ||
116 | +For | ||
117 | +Development | ||
118 | +Establishment | ||
119 | +Extracellular | ||
120 | +Epstein | ||
121 | +MUM | ||
122 | +Serine | ||
123 | +Interactions | ||
124 | +Macrophage | ||
125 | +Pseudomonas | ||
126 | +Cytokines | ||
127 | +IGFBP | ||
128 | +Calu | ||
129 | +PI3K | ||
130 | +B4 | ||
131 | +Chinese | ||
132 | +Cigarette | ||
133 | +BALF | ||
134 | +NK | ||
135 | +alphaEbeta7 | ||
136 | +microRNAs | ||
137 | +Gene | ||
138 | +True | ||
139 | +Pneumonia | ||
140 | +CUX1 | ||
141 | +miRNA | ||
142 | +Ubiquitin | ||
143 | +Wnt | ||
144 | +CDCP1 | ||
145 | +Barr | ||
146 | +ORP150 | ||
147 | +Curcumin | ||
148 | +aB | ||
149 | +Myofibroblast | ||
150 | +JNK | ||
151 | +Aortic | ||
152 | +L5 | ||
153 | +Hsp90 | ||
154 | +Smoke | ||
155 | +IFN | ||
156 | +Reactive | ||
157 | +Significance | ||
158 | +aVb6 | ||
159 | +Th2 | ||
160 | +Constitutive | ||
161 | +Melatonin | ||
162 | +Mediates | ||
163 | +AP | ||
164 | +Lindau | ||
165 | +ALK5 | ||
166 | +AKT2 | ||
167 | +EMT | ||
168 | +Reversion | ||
169 | +Patients | ||
170 | +Solution | ||
171 | +Focal | ||
172 | +Immunoglobulin | ||
173 | +RhoA | ||
174 | +Neutrophil | ||
175 | +BMPR2 | ||
176 | +CXCL9 | ||
177 | +Lower | ||
178 | +Olodaterol | ||
179 | +Lavage | ||
180 | +Induces | ||
181 | +Induced | ||
182 | +Beyond | ||
183 | +MMP | ||
184 | +Fstl1 | ||
185 | +cAMP | ||
186 | +Aging | ||
187 | +SPARC | ||
188 | +Activity | ||
189 | +ECM | ||
190 | +Human | ||
191 | +Fibrotic | ||
192 | +Repression | ||
193 | +Prostatic | ||
194 | +NOX4 | ||
195 | +Against | ||
196 | +Discovery | ||
197 | +Resveratrol | ||
198 | +Interleukin | ||
199 | +Deleted | ||
200 | +Differentiation | ||
201 | +Spiruchostatin | ||
202 | +Insulin | ||
203 | +PXS64 | ||
204 | +MCTC | ||
205 | +HP | ||
206 | +Triptolide | ||
207 | +Values | ||
208 | +Differential | ||
209 | +Lysyl | ||
210 | +FGF | ||
211 | +Promote | ||
212 | +Yin | ||
213 | +Tissue | ||
214 | +BMP | ||
215 | +CAM | ||
216 | +Discoidin | ||
217 | +Overproduction | ||
218 | +Wilms | ||
219 | +Regulates | ||
220 | +Reduced | ||
221 | +Mesenchymal | ||
222 | +Inflammation | ||
223 | +PINK1 | ||
224 | +Release | ||
225 | +E2 | ||
226 | +CD44V6 | ||
227 | +TGFbeta1 | ||
228 | +Rg1 | ||
229 | +H441 | ||
230 | +kDa | ||
231 | +M2 | ||
232 | +Expression | ||
233 | +Data | ||
234 | +Contribution | ||
235 | +Negative | ||
236 | +Nintedanib | ||
237 | +Pulmonary | ||
238 | +Emphysema | ||
239 | +Intrinsic | ||
240 | +17A | ||
241 | +Marks | ||
242 | +26S | ||
243 | +RXFP1 | ||
244 | +Neovessel | ||
245 | +Genetic | ||
246 | +Protective | ||
247 | +Tensin | ||
248 | +Inducer | ||
249 | +Like | ||
250 | +Determining | ||
251 | +Increased | ||
252 | +Microsatellite | ||
253 | +Sphingosine | ||
254 | +TRPV4 | ||
255 | +Club | ||
256 | +Methylation | ||
257 | +Phenoconversion | ||
258 | +Serpin | ||
259 | +Activated | ||
260 | +Muc5ac | ||
261 | +MyD88 | ||
262 | +Glucagon | ||
263 | +IPF | ||
264 | +SIRT6 | ||
265 | +Rapamycin | ||
266 | +Essential | ||
267 | +TNFalpha | ||
268 | +Corilagin | ||
269 | +Sorafenib | ||
270 | +Epithelial | ||
271 | +T869C | ||
272 | +Induction | ||
273 | +Long | ||
274 | +Wt1 | ||
275 | +Molecules | ||
276 | +TGFBR2 | ||
277 | +P110 | ||
278 | +Nuclear | ||
279 | +Old | ||
280 | +Targeting | ||
281 | +WNT7B | ||
282 | +Thy | ||
283 | +Potential | ||
284 | +TGF | ||
285 | +Tubastatin | ||
286 | +Semaphorin | ||
287 | +Attenuating | ||
288 | +Smad | ||
289 | +Pigment | ||
290 | +Homolog | ||
291 | +Binding | ||
292 | +microRNA | ||
293 | +C57BL | ||
294 | +Regulating | ||
295 | +Implications | ||
296 | +Hydrogen | ||
297 | +BARD1 | ||
298 | +A549 | ||
299 | +Homeostasis | ||
300 | +Selectivity | ||
301 | +Medical | ||
302 | +Model | ||
303 | +Cytoskeletal | ||
304 | +Differing | ||
305 | +BMP3 | ||
306 | +Enhances | ||
307 | +NADPH | ||
308 | +Fibrogenesis | ||
309 | +Defect | ||
310 | +Two | ||
311 | +FAK | ||
312 | +RNA | ||
313 | +Quantifying | ||
314 | +Epigenetic | ||
315 | +Profibrotic | ||
316 | +Ambroxol | ||
317 | +Trigger | ||
318 | +Titration | ||
319 | +Transcription | ||
320 | +Regulation | ||
321 | +Mitochondrial | ||
322 | +H1N1 | ||
323 | +Recent | ||
324 | +BMPER | ||
325 | +PPARs | ||
326 | +VCAM | ||
327 | +Microsomal | ||
328 | +Hippel | ||
329 | +Renshen | ||
330 | +Absence | ||
331 | +Anchorage | ||
332 | +Applying | ||
333 | +Free | ||
334 | +OSF | ||
335 | +PGE | ||
336 | +Tannic | ||
337 | +Plasminogen | ||
338 | +TGFBR | ||
339 | +Channel | ||
340 | +Age | ||
341 | +Cell | ||
342 | +Connective | ||
343 | +Proteasomal | ||
344 | +RAGE | ||
345 | +Bach1 | ||
346 | +Pirfenidone | ||
347 | +Outcomes | ||
348 | +GATA | ||
349 | +Small | ||
350 | +Autoimmunity | ||
351 | +III | ||
352 | +VEGF | ||
353 | +Control | ||
354 | +HSP27 | ||
355 | +Cartilage | ||
356 | +Periostin | ||
357 | +Idiopathic | ||
358 | +COL1A1 | ||
359 | +CBP | ||
360 | +Bronchoalveolar | ||
361 | +Crosstalk | ||
362 | +Amplified | ||
363 | +Evidence | ||
364 | +Simvastatin | ||
365 | +Sphingolipids | ||
366 | +Mechanisms | ||
367 | +JAK2 | ||
368 | +Rats | ||
369 | +Mice | ||
370 | +Protease | ||
371 | +From | ||
372 | +LPA1 | ||
373 | +Collagen | ||
374 | +Carbon | ||
375 | +Molecular | ||
376 | +Stat3 | ||
377 | +Genomewide | ||
378 | +Stem | ||
379 | +S1P | ||
380 | +Novel | ||
381 | +EBV | ||
382 | +Serum | ||
383 | +Abrogation | ||
384 | +That | ||
385 | +Pingfei | ||
386 | +Stromal | ||
387 | +Current | ||
388 | +Molecule | ||
389 | +MAP3K19 | ||
390 | +Decisive | ||
391 | +Protein | ||
392 | +Fluid | ||
393 | +HDAC4 | ||
394 | +Angiotensin | ||
395 | +SOCS1 | ||
396 | +Different | ||
397 | +Membrane | ||
398 | +Domain | ||
399 | +Secretory | ||
400 | +Signalling | ||
401 | +NCI | ||
402 | +Bax | ||
403 | +ADAM | ||
404 | +Are | ||
405 | +Beta1 | ||
406 | +Activation | ||
407 | +Problem | ||
408 | +Prognostic | ||
409 | +II | ||
410 | +Sputum | ||
411 | +Phosphatase | ||
412 | +Inhibition | ||
413 | +Profile | ||
414 | +Dogs | ||
415 | +HRCT | ||
416 | +lncRNA | ||
417 | +Storage | ||
418 | +Nitrated | ||
419 | +Box | ||
420 | +Forkhead | ||
421 | +CREB | ||
422 | +Sirtuin | ||
423 | +Cryptogenic | ||
424 | +Decreased | ||
425 | +Inhibits | ||
426 | +Formation | ||
427 | +MK2 | ||
428 | +Comparison | ||
429 | +Mediated | ||
430 | +Latent | ||
431 | +Recombinant | ||
432 | +Microencapsulation | ||
433 | +PHGDH | ||
434 | +Organizing | ||
435 | +Dysfunction | ||
436 | +Way | ||
437 | +Using | ||
438 | +Peripheral | ||
439 | +Markers | ||
440 | +MiR | ||
441 | +Anti | ||
442 | +Studies | ||
443 | +May | ||
444 | +Significant | ||
445 | +Morphogenic | ||
446 | +Low | ||
447 | +Lactic | ||
448 | +Overexpression | ||
449 | +Protects | ||
450 | +Arsenic | ||
451 | +Caveolin | ||
452 | +pH | ||
453 | +Inhibit | ||
454 | +Proteasome | ||
455 | +MicroRNAs | ||
456 | +Toll | ||
457 | +Herpes | ||
458 | +CTGF | ||
459 | +Normal | ||
460 | +Defective | ||
461 | +CD44 | ||
462 | +Large | ||
463 | +Ligands | ||
464 | +Axis | ||
465 | +NH2 | ||
466 | +Progression | ||
467 | +Smad3 | ||
468 | +Phenotype | ||
469 | +Ets | ||
470 | +Identification | ||
471 | +kB | ||
472 | +Role | ||
473 | +Relation | ||
474 | +Mode | ||
475 | +Developmental | ||
476 | +Fibrosis | ||
477 | +Stanniocalcin | ||
478 | +WNT10A | ||
479 | +Integrated | ||
480 | +Syndecan | ||
481 | +Metalloproteinase | ||
482 | +TOB2 | ||
483 | +USP11 | ||
484 | +WISP1 | ||
485 | +Dysregulated | ||
486 | +Th17 | ||
487 | +Progressive | ||
488 | +Key | ||
489 | +Subpleural | ||
490 | +Mast | ||
491 | +Rho | ||
492 | +Growth | ||
493 | +Upregulation | ||
494 | +Alleviates | ||
495 | +Re | ||
496 | +Preventive | ||
497 | +ITGB6 | ||
498 | +Fibroblasts | ||
499 | +ATG4B | ||
500 | +Comparative | ||
501 | +Cthrc1 | ||
502 | +mRNA | ||
503 | +Peptide | ||
504 | +SNAI | ||
505 | +BM | ||
506 | +ATPase | ||
507 | +AKT | ||
508 | +Fibroblastic | ||
509 | +Matriptase | ||
510 | +Sub | ||
511 | +Sustained | ||
512 | +Pleiotropic | ||
513 | +New | ||
514 | +Regulator | ||
515 | +Receptor | ||
516 | +Therapeutic | ||
517 | +Vimentin | ||
518 | +IGF | ||
519 | +Cells | ||
520 | +LIGHT | ||
521 | +Production | ||
522 | +D2 | ||
523 | +Dehydroepiandrosterone | ||
524 | +Lin28B | ||
525 | +Antifibrotic | ||
526 | +Raised | ||
527 | +Proliferation | ||
528 | +Dependent | ||
529 | +COL1 | ||
530 | +Lysocardiolipin | ||
531 | +Epithelium | ||
532 | +STAT3 | ||
533 | +Prevents | ||
534 | +Th1 | ||
535 | +NF | ||
536 | +CCN5 | ||
537 | +Snail | ||
538 | +Myogenic | ||
539 | +CD4 | ||
540 | +Akt | ||
541 | +TGFb1 | ||
542 | +Accelerated | ||
543 | +PDGF | ||
544 | +Intratracheal | ||
545 | +TGFB1 | ||
546 | +Cysteine | ||
547 | +Oxidant | ||
548 | +Effect | ||
549 | +Reprogramming | ||
550 | +IIP | ||
551 | +MS80 | ||
552 | +FOXF1 | ||
553 | +Promotes | ||
554 | +Assessment | ||
555 | +BLM | ||
556 | +CC16 | ||
557 | +BAL | ||
558 | +CD248 | ||
559 | +Ginsenoside | ||
560 | +Secreted | ||
561 | +Association | ||
562 | +IQ | ||
563 | +mTORC2 | ||
564 | +Established | ||
565 | +The | ||
566 | +Combined | ||
567 | +Jun | ||
568 | +UIP | ||
569 | +Sulf2 | ||
570 | +Thalidomide | ||
571 | +Bioenergetics | ||
572 | +TNF | ||
573 | +CCN2 | ||
574 | +NEU1 | ||
575 | +Attenuates | ||
576 | +HMGA2 | ||
577 | +Group | ||
578 | +Conversion | ||
579 | +Predisposition | ||
580 | +Transglutaminase | ||
581 | +Pathway | ||
582 | +Reviews | ||
583 | +Treg | ||
584 | +DDR2 | ||
585 | +Autophagy | ||
586 | +Hyper | ||
587 | +Bile | ||
588 | +Sunitinib | ||
589 | +Stiffening | ||
590 | +Signal | ||
591 | +Resolution | ||
592 | +De | ||
593 | +Type | ||
594 | +Factor | ||
595 | +Smad2 | ||
596 | +Single | ||
597 | +PPAR | ||
598 | +WNT5A | ||
599 | +Novo | ||
600 | +An | ||
601 | +EGFR | ||
602 | +Cub | ||
603 | +GLI | ||
604 | +HSP47 | ||
605 | +Early | ||
606 | +ERK1 | ||
607 | +TGFbeta | ||
608 | +Deficiency | ||
609 | +hydroxytryptamine2A | ||
610 | +BAX | ||
611 | +Inhibitory | ||
612 | +Integrin | ||
613 | +Suppression | ||
614 | +Shikonin | ||
615 | +SMAD3 | ||
616 | +Effects | ||
617 | +Metformin | ||
618 | +F1 | ||
619 | +MAPK | ||
620 | +Modulation | ||
621 | +Bleomycin | ||
622 | +Injury | ||
623 | +Elevated | ||
624 | +Cellular | ||
625 | +Radioligand | ||
626 | +Citrus | ||
627 | +TIAM1 | ||
628 | +Subjects | ||
629 | +Lung | ||
630 | +ARPC2 | ||
631 | +H19 | ||
632 | +EZH2 | ||
633 | +Pathways | ||
634 | +Is | ||
635 | +Microarray | ||
636 | +Fas | ||
637 | +CCN1 | ||
638 | +Ac | ||
639 | +miRNAs | ||
640 | +Myofibroblasts | ||
641 | +FFPE | ||
642 | +Inhibitor | ||
643 | +During | ||
644 | +Matrix | ||
645 | +Nrf2 | ||
646 | +Immunomodulation | ||
647 | +C5aR | ||
648 | +Gremlin | ||
649 | +High | ||
650 | +Concentration | ||
651 | +Evaluation | ||
652 | +Roles | ||
653 | +Number | ||
654 | +Bone | ||
655 | +ACLP | ||
656 | +Hypertension | ||
657 | +Lipogenic | ||
658 | +Uncoupling | ||
659 | +Signaling | ||
660 | +Lrp5 | ||
661 | +Berberine | ||
662 | +A4 | ||
663 | +CD11c | ||
664 | +miR | ||
665 | +Chop | ||
666 | +Galectin | ||
667 | +Alveolar | ||
668 | +Transition | ||
669 | +Plasma | ||
670 | +Impacts | ||
671 | +Smad4 | ||
672 | +Its | ||
673 | +Pathogenesis | ||
674 | +Inappropriate | ||
675 | +Investigation | ||
676 | +Beta | ||
677 | +Ca | ||
678 | +ERK | ||
679 | +Deregulation | ||
680 | +MSCs | ||
681 | +PTEN | ||
682 | +Lipoxin | ||
683 | +Nitric | ||
684 | +C1q | ||
685 | +KCa3 | ||
686 | +kappaB | ||
687 | +Involvement | ||
688 | +MCP | ||
689 | +Pleural | ||
690 | +EMMPRIN | ||
691 | +Smooth | ||
692 | +Synthesis | ||
693 | +Blockade | ||
694 | +Compared | ||
695 | +Transgelin |
scripts/__pycache__/wisse.cpython-36.pyc
0 → 100644
No preview for this file type
1 | +# missing word embeddings: | ||
2 | +profibrogenic | ||
3 | +199a | ||
4 | +p38 | ||
5 | +beta1 | ||
6 | +68 | ||
7 | +etiopathology | ||
8 | +1343 | ||
9 | +lymphopoietin | ||
10 | +29c | ||
11 | +185 | ||
12 | +5p | ||
13 | +17 | ||
14 | +dermatan | ||
15 | +1a | ||
16 | +13 | ||
17 | +424 | ||
18 | +101 | ||
19 | +p63 | ||
20 | +140 | ||
21 | +b1 | ||
22 | +fibrogenic | ||
23 | +gambogic | ||
24 | +nonkinase | ||
25 | +21 | ||
26 | +alpha3 | ||
27 | +154 | ||
28 | +2alpha | ||
29 | +chymase | ||
30 | +18a | ||
31 | +196a | ||
32 | +5100 | ||
33 | +smad2 | ||
34 | +7d | ||
35 | +541 | ||
36 | +1b | ||
37 | +acetylglucosaminidase | ||
38 | +326 | ||
39 | +47 | ||
40 | +dysregulates | ||
41 | +92a | ||
42 | +200 | ||
43 | +29a | ||
44 | +90 | ||
45 | +31 | ||
46 | +mitophagy | ||
47 | +b4 | ||
48 | +3p | ||
49 | +nexin | ||
50 | +dedifferentiating | ||
51 | +155 | ||
52 | +150 | ||
53 | +ubiquitinating | ||
54 | +10 | ||
55 | +486 | ||
56 | +19 | ||
57 | +avb6 | ||
58 | +fibrogenesis | ||
59 | +farnesoid | ||
60 | +haptotactic | ||
61 | +alpha3beta1 | ||
62 | +14 | ||
63 | +323a | ||
64 | +matricellular | ||
65 | +7a | ||
66 | +profibrotic | ||
67 | +bronchoalveolar | ||
68 | +26a | ||
69 | +18 | ||
70 | +salvianolic | ||
71 | +338 | ||
72 | +1beta | ||
73 | +p110y | ||
74 | +221 | ||
75 | +153 | ||
76 | +# missing MI weights: | ||
77 | +Compared | ||
78 | +Are | ||
79 | +True | ||
80 | +Ambroxol | ||
81 | +Diagnostic | ||
82 | +Alveolar | ||
83 | +Smad2 | ||
84 | +Neovessel | ||
85 | +RXFP1 | ||
86 | +Normal | ||
87 | +Shikonin | ||
88 | +Spiruchostatin | ||
89 | +ORP150 | ||
90 | +Tubastatin | ||
91 | +That | ||
92 | +Bone | ||
93 | +WISP1 | ||
94 | +Wt1 | ||
95 | +Smad4 | ||
96 | +ECM | ||
97 | +Syndecan | ||
98 | +Radioligand | ||
99 | +BAX | ||
100 | +De | ||
101 | +FAK | ||
102 | +Prevents | ||
103 | +Endothelin | ||
104 | +kB | ||
105 | +Promote | ||
106 | +Reversion | ||
107 | +Determining | ||
108 | +Cytokines | ||
109 | +Glucagon | ||
110 | +Pathways | ||
111 | +Myogenic | ||
112 | +SOCS1 | ||
113 | +Investigation | ||
114 | +Regulating | ||
115 | +Targeting | ||
116 | +Decoction | ||
117 | +Stromal | ||
118 | +PPARs | ||
119 | +HP | ||
120 | +Focal | ||
121 | +Transgelin | ||
122 | +Association | ||
123 | +Effects | ||
124 | +EGFR | ||
125 | +Gene | ||
126 | +Human | ||
127 | +Metalloproteinase | ||
128 | +Lower | ||
129 | +Rg1 | ||
130 | +Binding | ||
131 | +Therapeutic | ||
132 | +Mesenchymal | ||
133 | +CD248 | ||
134 | +Formation | ||
135 | +Cysteine | ||
136 | +Caveolin | ||
137 | +Type | ||
138 | +Signaling | ||
139 | +Molecular | ||
140 | +Alleviates | ||
141 | +Early | ||
142 | +Transforming | ||
143 | +Potential | ||
144 | +COL1A1 | ||
145 | +Plasminogen | ||
146 | +Factor | ||
147 | +Semaphorin | ||
148 | +CC16 | ||
149 | +Integrated | ||
150 | +Like | ||
151 | +Stat3 | ||
152 | +Tissue | ||
153 | +Signalling | ||
154 | +Phenotype | ||
155 | +TGFBR2 | ||
156 | +Homolog | ||
157 | +III | ||
158 | +MiR | ||
159 | +Lactic | ||
160 | +Pulmonary | ||
161 | +Fibroblastic | ||
162 | +Defect | ||
163 | +Molecules | ||
164 | +Yin | ||
165 | +MCP | ||
166 | +MicroRNA | ||
167 | +LIGHT | ||
168 | +Beyond | ||
169 | +Recombinant | ||
170 | +Compromised | ||
171 | +Ginsenoside | ||
172 | +P110 | ||
173 | +Production | ||
174 | +Lipogenic | ||
175 | +HRCT | ||
176 | +Its | ||
177 | +Implications | ||
178 | +Problem | ||
179 | +NH2 | ||
180 | +Fibrogenesis | ||
181 | +TOB2 | ||
182 | +SMAD3 | ||
183 | +Lin28B | ||
184 | +Significance | ||
185 | +Differential | ||
186 | +Cytokine | ||
187 | +Progressive | ||
188 | +Solution | ||
189 | +Identification | ||
190 | +Peptide | ||
191 | +Synthesis | ||
192 | +Protein | ||
193 | +Macrophage | ||
194 | +PDGF | ||
195 | +Repression | ||
196 | +CREB | ||
197 | +Cellular | ||
198 | +Plasma | ||
199 | +A4 | ||
200 | +Latent | ||
201 | +Wnt | ||
202 | +Proteasome | ||
203 | +Kinase | ||
204 | +Proteasomal | ||
205 | +Pathway | ||
206 | +Sirtuin | ||
207 | +MSCs | ||
208 | +D2 | ||
209 | +Absence | ||
210 | +Cells | ||
211 | +Thalidomide | ||
212 | +Regulation | ||
213 | +Hippel | ||
214 | +pH | ||
215 | +Chinese | ||
216 | +Th17 | ||
217 | +Uncoupling | ||
218 | +Periostin | ||
219 | +Promotes | ||
220 | +Amplification | ||
221 | +Smad | ||
222 | +Profibrotic | ||
223 | +Patients | ||
224 | +Subpleural | ||
225 | +Cytoskeletal | ||
226 | +Progression | ||
227 | +Lavage | ||
228 | +Angiotensin | ||
229 | +Domain | ||
230 | +Peripheral | ||
231 | +Inhibitor | ||
232 | +Associated | ||
233 | +Involvement | ||
234 | +Serum | ||
235 | +Toll | ||
236 | +Activation | ||
237 | +SPARC | ||
238 | +Attenuating | ||
239 | +Resveratrol | ||
240 | +PI3K | ||
241 | +Induced | ||
242 | +Matrix | ||
243 | +Leucine | ||
244 | +BALF | ||
245 | +Defective | ||
246 | +Negative | ||
247 | +JNK | ||
248 | +Receptor | ||
249 | +Reprogramming | ||
250 | +CD11c | ||
251 | +Nuclear | ||
252 | +hydroxytryptamine2A | ||
253 | +Is | ||
254 | +Sustained | ||
255 | +Essential | ||
256 | +Beta | ||
257 | +CBP | ||
258 | +miRNA | ||
259 | +Pathogenesis | ||
260 | +Aging | ||
261 | +Neutrophil | ||
262 | +Nitrated | ||
263 | +Resolution | ||
264 | +Signal | ||
265 | +Low | ||
266 | +New | ||
267 | +Contribution | ||
268 | +Homeostasis | ||
269 | +HLA | ||
270 | +Sub | ||
271 | +26S | ||
272 | +HMGA2 | ||
273 | +Treg | ||
274 | +Significant | ||
275 | +Blockade | ||
276 | +HSP27 | ||
277 | +Clinical | ||
278 | +Th1 | ||
279 | +Triptolide | ||
280 | +Dependent | ||
281 | +Inflammation | ||
282 | +VEGF | ||
283 | +PINK1 | ||
284 | +H1N1 | ||
285 | +E2 | ||
286 | +Discovery | ||
287 | +Interplay | ||
288 | +Secretory | ||
289 | +CCL2 | ||
290 | +Dehydroepiandrosterone | ||
291 | +Modulation | ||
292 | +Mechanisms | ||
293 | +MMP | ||
294 | +Mediated | ||
295 | +H19 | ||
296 | +Morphogenic | ||
297 | +Ac | ||
298 | +Corilagin | ||
299 | +Tannic | ||
300 | +Hypertension | ||
301 | +WNT10A | ||
302 | +MCTC | ||
303 | +Possible | ||
304 | +Studies | ||
305 | +Pigment | ||
306 | +Wilms | ||
307 | +Hydrogen | ||
308 | +Azithromycin | ||
309 | +Number | ||
310 | +Transglutaminase | ||
311 | +Outcomes | ||
312 | +NCI | ||
313 | +RNA | ||
314 | +WNT5A | ||
315 | +Mice | ||
316 | +Methylation | ||
317 | +Novel | ||
318 | +Nitric | ||
319 | +Cell | ||
320 | +HSP47 | ||
321 | +TRPV4 | ||
322 | +Protease | ||
323 | +Release | ||
324 | +For | ||
325 | +Ligands | ||
326 | +TGF | ||
327 | +Epithelium | ||
328 | +Aortic | ||
329 | +NADPH | ||
330 | +Herpes | ||
331 | +Bach1 | ||
332 | +Ca | ||
333 | +Dysregulated | ||
334 | +Membrane | ||
335 | +TGFBR | ||
336 | +Connective | ||
337 | +Decreased | ||
338 | +TIAM1 | ||
339 | +Serpin | ||
340 | +Fstl1 | ||
341 | +CCN1 | ||
342 | +BLM | ||
343 | +Sphingosine | ||
344 | +C57BL | ||
345 | +Data | ||
346 | +Dogs | ||
347 | +Organizing | ||
348 | +IGF | ||
349 | +COL1 | ||
350 | +CD44V6 | ||
351 | +BMPER | ||
352 | +ARPC2 | ||
353 | +Galectin | ||
354 | +Lindau | ||
355 | +Inappropriate | ||
356 | +Microencapsulation | ||
357 | +Oxidant | ||
358 | +M2 | ||
359 | +Renshen | ||
360 | +Sputum | ||
361 | +Snail | ||
362 | +Inducer | ||
363 | +Prognostic | ||
364 | +Storage | ||
365 | +MAPK | ||
366 | +Citrus | ||
367 | +PPAR | ||
368 | +Collagen | ||
369 | +Matriptase | ||
370 | +Arsenic | ||
371 | +Long | ||
372 | +F1 | ||
373 | +Deleted | ||
374 | +Genomewide | ||
375 | +PXS64 | ||
376 | +Lysyl | ||
377 | +ERK | ||
378 | +Calu | ||
379 | +MyD88 | ||
380 | +aB | ||
381 | +Activity | ||
382 | +Applying | ||
383 | +Secreted | ||
384 | +Control | ||
385 | +BM | ||
386 | +Mitochondrial | ||
387 | +Age | ||
388 | +EZH2 | ||
389 | +Overproduction | ||
390 | +Way | ||
391 | +The | ||
392 | +DDR2 | ||
393 | +1A | ||
394 | +Rho | ||
395 | +Bronchoalveolar | ||
396 | +TGFbeta1 | ||
397 | +Akt | ||
398 | +ERK1 | ||
399 | +Novo | ||
400 | +Curcumin | ||
401 | +FGF | ||
402 | +C5aR | ||
403 | +17A | ||
404 | +Lysocardiolipin | ||
405 | +Protects | ||
406 | +Predisposition | ||
407 | +Thy | ||
408 | +C1q | ||
409 | +Nintedanib | ||
410 | +High | ||
411 | +KCa3 | ||
412 | +Olodaterol | ||
413 | +Reviews | ||
414 | +Proliferation | ||
415 | +Immunomodulation | ||
416 | +Attenuates | ||
417 | +Gremlin | ||
418 | +Cthrc1 | ||
419 | +Vimentin | ||
420 | +Elk1 | ||
421 | +Lipoxin | ||
422 | +IQ | ||
423 | +Roles | ||
424 | +BAL | ||
425 | +Relation | ||
426 | +Autophagy | ||
427 | +IGFBP | ||
428 | +Inhibition | ||
429 | +BMP | ||
430 | +Anchorage | ||
431 | +ITGB6 | ||
432 | +Mode | ||
433 | +Modulating | ||
434 | +miRNAs | ||
435 | +Inhibit | ||
436 | +PHGDH | ||
437 | +Up | ||
438 | +Phosphatase | ||
439 | +TGFbeta | ||
440 | +C3aR | ||
441 | +Pseudomonas | ||
442 | +Comparative | ||
443 | +Reduced | ||
444 | +Crosstalk | ||
445 | +Conversion | ||
446 | +Injury | ||
447 | +Phenotypes | ||
448 | +CD4 | ||
449 | +MicroRNAs | ||
450 | +Regulates | ||
451 | +TNFalpha | ||
452 | +Pirfenidone | ||
453 | +Raised | ||
454 | +Old | ||
455 | +Cartilage | ||
456 | +Prostaglandin | ||
457 | +BMP3 | ||
458 | +BARD1 | ||
459 | +Deficiency | ||
460 | +RhoA | ||
461 | +AKT2 | ||
462 | +NF | ||
463 | +Cigarette | ||
464 | +GATA | ||
465 | +MAP3K19 | ||
466 | +sTNFR | ||
467 | +NK | ||
468 | +Different | ||
469 | +Subjects | ||
470 | +Autoimmunity | ||
471 | +Mast | ||
472 | +Single | ||
473 | +Microsomal | ||
474 | +WNT7B | ||
475 | +MK2 | ||
476 | +TGFb1 | ||
477 | +CCN2 | ||
478 | +Growth | ||
479 | +Prostatic | ||
480 | +PGE | ||
481 | +Abrogation | ||
482 | +Stem | ||
483 | +EBV | ||
484 | +Microsatellite | ||
485 | +Nrf2 | ||
486 | +Epstein | ||
487 | +Club | ||
488 | +TGFB1 | ||
489 | +ATG4B | ||
490 | +Differentiation | ||
491 | +EMMPRIN | ||
492 | +Smad3 | ||
493 | +Genetic | ||
494 | +Sorafenib | ||
495 | +IFN | ||
496 | +Impacts | ||
497 | +Key | ||
498 | +Activated | ||
499 | +AKT | ||
500 | +Th2 | ||
501 | +PTEN | ||
502 | +USP11 | ||
503 | +IL | ||
504 | +Effect | ||
505 | +HDAC4 | ||
506 | +Free | ||
507 | +Sunitinib | ||
508 | +Established | ||
509 | +Fluid | ||
510 | +Decisive | ||
511 | +Inhibits | ||
512 | +Marks | ||
513 | +mTORC2 | ||
514 | +Trigger | ||
515 | +Concentration | ||
516 | +Intratracheal | ||
517 | +Participation | ||
518 | +Against | ||
519 | +Expression | ||
520 | +kappaB | ||
521 | +Role | ||
522 | +Rats | ||
523 | +Intrinsic | ||
524 | +Epigenetic | ||
525 | +Smooth | ||
526 | +NOX4 | ||
527 | +Tumor | ||
528 | +Rapamycin | ||
529 | +microRNA | ||
530 | +Overexpression | ||
531 | +Current | ||
532 | +Muc5ac | ||
533 | +Combined | ||
534 | +II | ||
535 | +D1 | ||
536 | +Accelerated | ||
537 | +Regulator | ||
538 | +Pleural | ||
539 | +Invasive | ||
540 | +alphaEbeta7 | ||
541 | +From | ||
542 | +MUM | ||
543 | +Immunoglobulin | ||
544 | +Beta1 | ||
545 | +Small | ||
546 | +Sphingolipids | ||
547 | +Stiffening | ||
548 | +FFPE | ||
549 | +miR | ||
550 | +LPA1 | ||
551 | +B4 | ||
552 | +Two | ||
553 | +Extracellular | ||
554 | +Enhances | ||
555 | +Evaluation | ||
556 | +Recent | ||
557 | +Elevated | ||
558 | +Re | ||
559 | +IIP | ||
560 | +CD44 | ||
561 | +Interactions | ||
562 | +CXCL9 | ||
563 | +Protective | ||
564 | +Ets | ||
565 | +Preventive | ||
566 | +Establishment | ||
567 | +ALK5 | ||
568 | +Increased | ||
569 | +Values | ||
570 | +kDa | ||
571 | +Large | ||
572 | +May | ||
573 | +Transcription | ||
574 | +ACLP | ||
575 | +Cryptogenic | ||
576 | +Ubiquitin | ||
577 | +GLI | ||
578 | +L5 | ||
579 | +Discoidin | ||
580 | +Bleomycin | ||
581 | +Carbon | ||
582 | +Renin | ||
583 | +CUX1 | ||
584 | +Correct | ||
585 | +Constitutive | ||
586 | +SNAI | ||
587 | +Bile | ||
588 | +Assessment | ||
589 | +Fibrotic | ||
590 | +Differing | ||
591 | +Development | ||
592 | +Channel | ||
593 | +Simvastatin | ||
594 | +CAM | ||
595 | +Fibroblasts | ||
596 | +Melatonin | ||
597 | +SIRT6 | ||
598 | +STAT3 | ||
599 | +Tensin | ||
600 | +Pingfei | ||
601 | +Stanniocalcin | ||
602 | +Bax | ||
603 | +Group | ||
604 | +mRNA | ||
605 | +Selectivity | ||
606 | +Emphysema | ||
607 | +Barr | ||
608 | +Berberine | ||
609 | +Metformin | ||
610 | +Hsp90 | ||
611 | +CDCP1 | ||
612 | +T869C | ||
613 | +EMT | ||
614 | +ADAM | ||
615 | +Cub | ||
616 | +Pneumonia | ||
617 | +Induces | ||
618 | +FOXF1 | ||
619 | +Upregulation | ||
620 | +H441 | ||
621 | +RAGE | ||
622 | +Myofibroblasts | ||
623 | +JAK2 | ||
624 | +Interstitial | ||
625 | +Amplified | ||
626 | +Fibrosis | ||
627 | +Microarray | ||
628 | +Developmental | ||
629 | +CTGF | ||
630 | +Serine | ||
631 | +Integrin | ||
632 | +AP | ||
633 | +Fas | ||
634 | +During | ||
635 | +CCN5 | ||
636 | +Insulin | ||
637 | +Pleiotropic | ||
638 | +TGFb | ||
639 | +Evidence | ||
640 | +Phenoconversion | ||
641 | +Comparison | ||
642 | +Smoke | ||
643 | +Box | ||
644 | +microRNAs | ||
645 | +Anti | ||
646 | +Suppression | ||
647 | +A549 | ||
648 | +Chop | ||
649 | +Jun | ||
650 | +Myofibroblast | ||
651 | +Dysfunction | ||
652 | +Axis | ||
653 | +IPF | ||
654 | +MS80 | ||
655 | +S1P | ||
656 | +Inhibitory | ||
657 | +Interleukin | ||
658 | +Action | ||
659 | +Bioenergetics | ||
660 | +Transition | ||
661 | +Hyper | ||
662 | +Lrp5 | ||
663 | +Model | ||
664 | +cAMP | ||
665 | +Medical | ||
666 | +SDKP | ||
667 | +UIP | ||
668 | +Animal | ||
669 | +Forkhead | ||
670 | +lncRNA | ||
671 | +Lung | ||
672 | +Antifibrotic | ||
673 | +Induction | ||
674 | +Titration | ||
675 | +Epithelial | ||
676 | +OSF | ||
677 | +ATPase | ||
678 | +Reactive | ||
679 | +TNF | ||
680 | +aVb6 | ||
681 | +Molecule | ||
682 | +NEU1 | ||
683 | +Deregulation | ||
684 | +Idiopathic | ||
685 | +An | ||
686 | +Using | ||
687 | +Quantifying | ||
688 | +BMPR2 | ||
689 | +Foxp3high | ||
690 | +sL1 | ||
691 | +Profile | ||
692 | +Sulf2 | ||
693 | +Mediates | ||
694 | +Markers | ||
695 | +VCAM |
scripts/wisse.py
0 → 100644
1 | +#!/usr/bin/python | ||
2 | +# -*- coding: latin-1 -*- | ||
3 | +# Python2.7 | ||
4 | + | ||
5 | +import numpy as np | ||
6 | +import logging | ||
7 | +import os | ||
8 | +from functools import partial | ||
9 | +from pdb import set_trace as st | ||
10 | +logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', | ||
11 | + level=logging.INFO) | ||
12 | + | ||
13 | + | ||
class wisse(object):
    """Combine the word embeddings of a sentence into a single vector.

    Both the TFIDFVectorizer and the word embedding model must be pretrained,
    either from the local sentence corpus or from model persistence.

    Parameters
    ----------
    embeddings : mapping-like
        Indexed as ``embeddings[word]``; a ``KeyError`` marks an unknown word.
    vectorizer : object
        Must provide ``build_tokenizer()``.  When weights are requested it is
        also used through ``transform()``, ``vocabulary_`` and ``idf_``.
    tf_tfidf : bool
        Truthy: weights are read from ``vectorizer.transform`` applied to the
        sentence.  Falsy: weights are read from ``vectorizer.idf_``.
    combiner : str
        A value starting with ``"avg"`` averages the word vectors; anything
        else sums them.
    """
    def __init__(self, embeddings, vectorizer, tf_tfidf, combiner = "sum"):
        self.tokenize = vectorizer.build_tokenizer()
        self.tfidf = vectorizer
        self.embedding = embeddings
        self.pred_tfidf = tf_tfidf
        if combiner.startswith("avg"):
            self.comb = partial(np.mean, axis = 0)
        else:
            self.comb = partial(np.sum, axis = 0)


    def fit(self, X, y = None): # Scikit-learn template
        """Remember the sentence list ``X`` (ignored unless it is a list)."""
        if isinstance(X, list):
            self.sentences = X

        return self


    def transform(self, X):
        """Embed ``X``.

        A list is stored and ``self`` is returned, so the caller iterates the
        object to obtain one result per sentence.  A ``str`` is embedded
        immediately through ``infer_sentence``.  Any other type yields None.
        """
        if isinstance(X, list):
            return self.fit(X)

        elif isinstance(X, str):
            return self.infer_sentence(X)


    def fit_transform(self, X, y=None):
        """Same as ``transform``; kept for the scikit-learn interface."""
        return self.transform(X)


    def infer_sentence(self, sent):
        """Embed one sentence.

        Returns None when the sentence has no tokens or when none of its
        tokens has a weight.  Otherwise returns the tuple
        ``(missing_cbow, missing_bow, vector)`` where ``missing_cbow`` lists
        weighted words without an embedding, ``missing_bow`` lists tokens
        without a weight, and ``vector`` is the combined embedding.
        """
        tokens = self.tokenize(sent)
        if not tokens:
            return None

        # The weights are kept on the instance, as the original code did.
        self.weights, unweighted = self.infer_tfidf_weights(tokens)
        missing_bow = list(unweighted)
        if not self.weights:
            return None

        missing_cbow = []
        vectors = []
        for w in self.weights:
            try:
                vectors.append(self.embedding[w])
            except KeyError:
                missing_cbow.append(w)
            except IndexError:
                # Skipped silently, without being reported as missing.
                continue

        # NOTE(review): the per-word weights are computed above but are not
        # applied to the vectors before combining; confirm whether a weighted
        # combination was intended.
        # NOTE(review): if no word had an embedding, the combiner receives an
        # empty array (sum -> 0.0, mean -> nan) instead of a vector.
        sentence = np.array(vectors)

        return missing_cbow, missing_bow, self.comb(sentence)


    def infer_tfidf_weights(self, sentence):
        """Return ``(weights, missing)`` for a tokenized sentence.

        ``weights`` maps each known token to its weight and ``missing`` lists
        the tokens absent from the vectorizer vocabulary.  Without a
        vectorizer every token gets weight 1.0.
        """
        existent = {}
        missing = []

        if not self.tfidf:
            for word in sentence:
                existent[word] = 1.0

            return existent, missing

        vocabulary = self.tfidf.vocabulary_
        if self.pred_tfidf:
            unseen = self.tfidf.transform([" ".join(sentence)]).toarray()
            for word in sentence:
                try:
                    existent[word] = unseen[0][vocabulary[word]]
                except KeyError:
                    missing.append(word)
        else:
            # Fixed: this branch used an undefined global `vectorizer`
            # (NameError); the vectorizer is stored as self.tfidf.
            idf = self.tfidf.idf_
            for word in sentence:
                try:
                    weight = idf[vocabulary[word]]
                except KeyError:
                    missing.append(word)
                    continue
                # IDF values of 2 or less are replaced by 0.01.
                existent[word] = weight if weight > 2 else 0.01

        return existent, missing


    def __iter__(self):
        """Yield the ``transform`` result of every stored sentence."""
        for s in self.sentences:
            yield self.transform(s)
112 | + | ||
113 | + | ||
def save_dense(directory, filename, array):
    """Save ``array`` with ``numpy.save`` as ``<directory>/<filename>``.

    Nothing is written when ``filename`` is not purely alphabetic.
    Always returns None.
    """
    base = os.path.normpath(directory) + '/'
    # Only alphabetic names are stored; everything else is skipped silently.
    if not filename.isalpha():
        return None
    np.save(base + filename, array)
123 | + | ||
def load_dense(filename):
    """Return whatever ``numpy.load`` reads from ``filename``."""
    dense = np.load(filename)
    return dense
126 | + | ||
127 | + | ||
def load_sparse_bsr(filename):
    """Rebuild a SciPy BSR matrix from an ``.npz`` archive.

    The archive must hold the arrays ``data``, ``indices``, ``indptr`` and
    ``shape``, which is the layout written by ``save_sparse_bsr``.
    ``filename`` is passed to ``numpy.load`` unchanged.
    """
    # Imported here because the module never imported bsr_matrix, so the
    # original call raised NameError.
    from scipy.sparse import bsr_matrix

    # The context manager closes the archive handle once the arrays are read.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        shape = tuple(loader['shape'])
    return bsr_matrix((data, indices, indptr), shape=shape)
132 | + | ||
133 | + | ||
def save_sparse_bsr(directory, filename, array):
    """Save a sparse matrix in BSR form as ``<directory>/<filename>.npz``.

    ``array`` must provide ``tobsr()``.  The archive stores the arrays
    ``data``, ``indices``, ``indptr`` and ``shape``.  Nothing is written when
    ``filename`` is not purely alphabetic.  Always returns None.
    """
    # note that .npz extension is added automatically
    directory = os.path.normpath(directory) + '/'
    # Fixed: the original tested an undefined name `word` (NameError); the
    # check applies to `filename`, as in save_dense.
    if not filename.isalpha():
        return None
    array = array.tobsr()
    np.savez(directory + filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)
143 | + | ||
144 | + | ||
class vector_space(object):
    """Word -> vector lookup backed by one array file per word.

    ``directory`` is either a plain directory or a ``.tar.gz`` archive.
    Every entry found there is indexed in ``self.words`` under its base
    name with the expected extension (``.npz`` when ``sparse`` else
    ``.npy``) removed. ``vs[word]`` loads the matching file on demand and
    raises ``KeyError`` for words that were not indexed.
    """

    def __init__(self, directory, sparse = False):
        self.sparse = sparse
        ext = ".npz" if sparse else ".npy"
        self._tar = directory.endswith(".tar.gz")

        index = {}
        if self._tar:
            import tarfile
            # The archive stays open; members are extracted lazily on lookup.
            self.tar = tarfile.open(directory)
            for member_name in self.tar.getnames():
                key = os.path.basename(member_name).replace(ext, '')
                index[key] = member_name
        else:
            directory = os.path.normpath(directory) + '/'
            for entry in os.listdir(directory):
                index[entry.replace(ext, '')] = directory + entry
        self.words = index


    def __getitem__(self, item):
        location = self.words[item]
        if self._tar:
            # Inside an archive the loaders receive a file object, not a path.
            source = self.tar.extractfile(self.tar.getmember(location))
        else:
            source = location

        if self.sparse:
            return load_sparse_bsr(source)
        return load_dense(source)
182 | + | ||
183 | + | ||
def keyed2indexed(keyed_model, output_dir = "word_embeddings/", parallel = True, n_jobs = -1):
    """Dump every vector of ``keyed_model`` to its own file via ``save_dense``.

    ``keyed_model`` must expose a ``vocab`` mapping and support
    ``keyed_model[word]``. ``output_dir`` is created when it does not exist.
    With ``parallel`` set, the writes are dispatched through joblib using
    ``n_jobs`` workers; otherwise they run sequentially. Returns ``None``.
    """
    output_dir = os.path.normpath(output_dir) + '/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if not parallel:
        for word, _ in keyed_model.vocab.items():
            save_dense(output_dir, word, keyed_model[word])
        return

    # joblib is imported only when the parallel path is actually requested.
    from joblib import Parallel, delayed

    jobs = (delayed(save_dense)(output_dir, word, keyed_model[word])
            for word, _ in keyed_model.vocab.items())
    Parallel(n_jobs = n_jobs, verbose = 10)(jobs)
197 | + | ||
198 | + | ||
class streamer(object):
    """Re-iterable stream over the lines of a text file.

    Each iteration re-opens ``file_name`` and yields every line with
    surrounding whitespace stripped, so the same instance can be consumed
    more than once.
    """

    def __init__(self, file_name):
        self.file_name = file_name

    def __iter__(self):
        # ``with`` closes the handle when the generator is exhausted or
        # discarded; the bare ``open`` used before was never closed.
        with open(self.file_name) as handle:
            for line in handle:
                yield line.strip()
scripts/wisse.pyc
0 → 100644
No preview for this file type
scripts/wisse_example.py
0 → 100644
1 | +#!/usr/bin/python | ||
2 | +# -*- coding: latin-1 -*- | ||
3 | +# Python2.7 | ||
4 | +from gensim.models.keyedvectors import KeyedVectors as vDB | ||
5 | +from sklearn.feature_extraction.text import TfidfVectorizer | ||
6 | +import numpy as np | ||
7 | +#import numexpr as ne | ||
8 | +import argparse | ||
9 | +#import _pickle as pickle | ||
10 | +#import cPickle as pickle | ||
11 | +import logging | ||
12 | +import os | ||
13 | +from functools import partial | ||
14 | +import wisse | ||
15 | + | ||
16 | + | ||
load_vectors = vDB.load_word2vec_format

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)


if __name__ == "__main__":
    # Needed only by the command-line entry point. ``pickle`` was previously
    # used without being imported at all.
    import pickle
    import sys

    parser = argparse.ArgumentParser(description="""This use example shows sentence
        embedding by using WISSE. The input is a text file which has a sentece in
        each of its rows. The output file has two tab-separated columns: the index
        line of the sentece in the input file and the sentence vector representation
        .""")
    parser.add_argument("--idfmodel", help = """Input file containing IDF
                                        pre-trained weights. If not provided,
                                        all word vector weights will be set to
                                        1.0. If 'local' tf-idf weights will be
                                        computed locally from the input file
                                        (pickled sklearn object).""",
                                        default = None)
    parser.add_argument("--embedmodel", help = """Input file containing word
                                            embeddings model (binary and text
                                            are allowed).""", required = True)
    parser.add_argument("--output", help = """Output file containing the sentence
                                            embeddings.""", default = "")
    parser.add_argument("--input", help = """Input file containing a sentence
                                           by row.""", required = True)
    parser.add_argument("--comb", help = """Desired word vector combination for
                                        sentence representation {sum, avg}.
                                        (default = 'sum')""", default = "sum")
    parser.add_argument("--suffix", nargs = '?', help = """A suffix to be added
                                        to the output file (default = '')""",
                                        default = "", required = False)
    parser.add_argument("--tfidf", help="""To predict TFIDF complete weights
                                        ('tfidf') or use only partial IDFs
                                        ('idf'). (default = 'tfidf')""",
                                        default = "tfidf")
    parser.add_argument("--localw", help = """TFIDF word vector weights
                                    computed locally from the input file of
                                    sentences {freq, binary, sublinear}
                                    (default='none').""", default = "none")
    parser.add_argument("--stop", help = """Toggles stripping stop words in
                                    locally computed word vector weights.""",
                                    action = "store_true")
    parser.add_argument("--format", help = """The format of the embedding model
                                     file: {binary, text, wisse}.
                                    default = 'binary'""", default = "binary")
    args = parser.parse_args()

    # ---- Validate the command line; failures exit with a non-zero status ----
    if not args.format.startswith("wisse"):
        if not os.path.isfile(args.embedmodel):
            logging.error("""Embedding model file does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
            sys.exit(1)
    elif not os.path.exists(args.embedmodel):
        logging.error("""Embedding model directory does not exist (EXIT):
                \n%s\n ...""" % args.embedmodel)
        sys.exit(1)

    # --idfmodel is optional (default None): without it no weighting model is
    # loaded at all, so every check below has to tolerate None.
    has_idf = args.idfmodel is not None
    use_local_idf = has_idf and args.idfmodel.startswith("local")

    if has_idf and not use_local_idf and not os.path.isfile(args.idfmodel):
        logging.error("""IDF model file does not exist (EXIT):
                \n%s\n ...""" % args.idfmodel)
        sys.exit(1)
    if not os.path.isfile(args.input):
        logging.error("""Input file does not exist (EXIT):
                \n%s\n ...""" % args.input)
        sys.exit(1)

    # These names feed the default output name, so they must exist before it
    # is built (they used to be assigned only after it was needed).
    embedding_name = os.path.basename(args.embedmodel).split(".")[0]
    tfidf_name = (os.path.basename(args.idfmodel).split(".")[0]
                  if has_idf else "noidf")

    if args.output != "":
        output_dir = os.path.dirname(args.output)
        if output_dir != "" and not os.path.exists(output_dir):
            logging.error("""Output directory does not exist (EXIT):
                    \n%s\n ...""" % args.output)
            sys.exit(1)
        output_name = args.output
    else:
        # A bare ``--suffix`` (nargs='?') yields None; treat it as empty.
        suffix = "_".join([embedding_name,
                           args.comb,
                           args.tfidf,
                           "local" if use_local_idf else tfidf_name,
                           args.suffix or ""]).strip("_")
        output_name = args.input + ".output_" + suffix

    # 'tfidf...' -> full TF-IDF weights; 'idf...' or anything else -> IDF only.
    pred_tfidf = args.tfidf.startswith("tfidf")

    vectorizer = TfidfVectorizer(min_df = 1,
                encoding = "latin-1",
                decode_error = "replace",
                lowercase = True,
                binary = True if args.localw.startswith("bin") else False,
                sublinear_tf = True if args.localw.startswith("subl") else False,
                stop_words = "english" if args.stop else None)

    sentences = wisse.streamer(args.input)

    if use_local_idf:
        logging.info("Fitting local TFIDF weights from: %s ..." % args.input)
        tfidf = vectorizer.fit(sentences)

    elif has_idf and os.path.isfile(args.idfmodel):
        logging.info("Loading global TFIDF weights from: %s ..." % args.idfmodel)
        # NOTE: unpickling executes code embedded in the file; only load
        # models obtained from a trusted source.
        with open(args.idfmodel, 'rb') as f:
            tfidf = pickle.load(f)#, encoding = 'latin-1')

    else:
        # NOTE(review): confirm wisse.wisse accepts vectorizer=False.
        tfidf = False

    try:
        if args.format.startswith("bin"):
            embedding = load_vectors(args.embedmodel, binary = True,
                                                        encoding = "latin-1")
        elif args.format.startswith("tex"):
            embedding = load_vectors(args.embedmodel, binary = False,
                                                        encoding = "latin-1")
        else:
            embedding = wisse.vector_space(args.embedmodel, sparse = False)

    except Exception:
        # logging.exception keeps the traceback, so the real cause is visible;
        # a bare ``except:`` would also have trapped KeyboardInterrupt.
        logging.exception(
            """Error while loading word embedding model. Verify if the file
            is broken (EXIT)...\n%s\n""" % args.embedmodel)
        sys.exit(1)

    missing_bow = []    # Stores missing words in the TFIDF model
    missing_cbow = []   # Stores missing words in the W2V model
    sidx = 0 # The index of the sentence according to the input file
    logging.info("\n\nEmbedding sentences and saving then to a the output file..\n%s\n" % output_name)

    # Built once instead of once per sentence, and now honouring --comb and
    # --tfidf instead of the hard-coded 'sum' / True.
    series = wisse.wisse(embeddings = embedding, vectorizer = tfidf,
                         tf_tfidf = pred_tfidf, combiner = args.comb)

    with open(output_name, "w") as fo:
        for sent in sentences:
            sidx += 1
            try:
                mc, mb, vector = series.transform(sent)
            except TypeError:
                # Best effort: a sentence that cannot be embedded is skipped,
                # but its line index stays reserved in the numbering.
                logging.warning("Skipping sentence %d: it could not be embedded." % sidx)
                continue

        # At this point you can use the embedding 'vector' for any application as it
        # is a numpy array. Also you can simply save the vectors in text format as
        # follows:
            missing_cbow += mc
            missing_bow += mb
            fo.write("%d\t%s\n" % (sidx, np.array2string(vector,
                                formatter = {'float_kind':lambda x: "%.6f" % x},
                                max_line_width = 20000).strip(']').strip('[') ))

    missing_name = (os.path.basename(args.input).split(".")[0] + "_" +
                                                    embedding_name + "_" +
                                                    tfidf_name + ".missing")
    logging.info("\n\nSaving missing vocabulary to %s ..\n\n" % missing_name)

    with open(missing_name, "w") as f:
        f.write("# missing word embeddings:\n")
        for w in set(missing_cbow):
            f.write("%s\n" % w)

        f.write("# missing MI weights:\n")
        for w in set(missing_bow):
            f.write("%s\n" % w)

    logging.info("FINISHED! \n")
-
Please register or login to post a comment