Conditional Random Fields

Carlos-Francisco Méndez-Cruz
Commit a7cccc5f56f9ac764c09b92d6405e221332b3f31 a7cccc5f 1 parent 89694655
Showing 3 changed files with 1080 additions and 356 deletions
data-sets/genes.txt
tagging_Sklearn_crfsuite.py
training-validation-v1.py
--- a/data-sets/genes.txt 0 → 100644
View file @a7cccc5
+++ b/data-sets/genes.txt 0 → 100644
View file @a7cccc5
+ TGF-beta1
+ Insulin-like growth factor-1
+ CD86
+ HS6ST2
+ Snail1/2
+ interferon-gamma
+ Focal adhesion kinase
+ protease-activated receptor-1
+ mPAP
+ nuclear factor erythroid 2-related factor 2
+ NF-kB
+ Fos
+ HDAC6
+ CD90
+ interleukin-12p40
+ Mitogen-activated protein kinase-activated protein kinase-2
+ collagen-1, ET-1
+ smooth muscle a-actin
+ caspase-3
+ Angiotensin II
+ IL-23
+ HDAC
+ matriptase
+ CD124
+ Keap1
+ transforming growth factor-beta-1
+ TGF-a
+ Cysteine-rich protein 1
+ glycogen synthase kinase-3beta
+ Cartilage oligomeric matrix protein
+ TGFb2
+ miR-338*
+ C3aR
+ E  -cadherin
+ TGF beta 1
+ miR-200b
+ pVHL
+ Activin
+ BMP-8B
+ Foxp-3
+ HAI-1
+ IL-1b
+ WNT5A
+ AQP5
+ MT1
+ stromal-cell-derived factor-1
+ tgfb2
+ monocyte chemotactic protein-1
+ SpA
+ IL-1 beta, -2, -4, -5, -6, -8, -10, -17
+ insulin-like growth factor-I
+ ECSOD
+ SPARC
+ E-CAD
+ TGF-b(1)
+ il17a
+ COL1A2
+ TGF-b-1
+ Atg4b
+ ET-B
+ KGF
+ NOX-4
+ col1a2
+ pten
+ miR-26a
+ S1PL
+ alpha-smooth muscle actin
+ Glut-1 and glucokinase
+ TGFbeta1
+ Transforming growth factor beta1
+ CCl4
+ transforming growth factor  -b1
+ serum amyloid P
+ miR-338
+ Bone morphogenetic protein 3
+ COL3A1
+ ALK5
+ PSPH
+ HO-1
+ histone deacetylase 4
+ tsp1
+ FGF-10
+ interferon (IFN)-gamma
+ E-Cad
+ Protease nexin-1
+ ILK
+ TGF-b receptor II
+ TGFB
+ insulin promoting factor-1
+ EGFR
+ LXRa
+ Interleukin 17A
+ MiR-185
+ Semaphorin-7A
+ SPHK1
+ transforming growth factor-b(1)
+ K-ras
+ p27
+ pai1
+ Mmp19
+ Col1A1
+ Follistatin-like 1
+ Serine Protease
+ Smad3 and 4
+ Bone morphogenetic protein
+ PDGF-B
+ BMP-4
+ min(-1
+ ORP150
+ bmp1
+ RAGE
+ SGPL1
+ SOCS-1
+ tagln2
+ MMP-12
+ DRB1
+ MyD88
+ STC1
+ 150-kDa oxygen-regulated protein
+ MMP-9
+ IP10
+ terminal deoxynucleotidyl transferase
+ TGF-beta 1
+ NADPH oxidase-4
+ transforming growth factor-beta
+ Prostaglandin A(1)
+ p70
+ IgG1
+ N-acetyl-l-cysteine
+ Tgfbr1/2
+ HGFA
+ DNMT1
+ nuclear factor kappa B
+ plasminogen activator inhibitor 1
+ S1P
+ angiopoietin-2
+ KCa3.1
+ ENA-78
+ WNT3a
+ miR-134
+ P53
+ matrix metalloproteinase (MMP)  -2
+ vimentin
+ MMP-2
+ endoglin
+ tumor necrosis factor superfamily protein 14
+ Transforming Growth Factor Beta 1
+ Cebpb
+ Mknk2
+ SDF-1-TR1
+ Endothelin-1
+ HFL-1
+ MIG
+ TGF-beta receptors type I and II (T beta R-I and T beta R-II
+ IL-8
+ YY-1
+ MMP1
+ interferon gamma (IFN-y
+ MIP-1 alpha
+ IL-4RA
+ HS6ST1
+ BMPR2
+ sL1
+ transforming growth factor-alpha
+ Transgelin
+ TGF-beta 3
+ lox
+ CTGF
+ MCP-1
+ tumor necrosis factor
+ MT3
+ cytosolic phospholipase A(2)
+ Thymic stromal lymphopoietin
+ phosphoglycerate dehydrogenase
+ Bax
+ Caveolin-1 (cav-1
+ UCHL5
+ TIMP-1
+ JunD
+ Transforming growth factor (TGF)-b1
+ MicroRNA  (miR)-221
+ miR-424
+ YAP
+ Aortic carboxypeptidase-like protein
+ Microsomal prostaglandin E synthase-1
+ b2 -adrenoceptors
+ YKL-40
+ VE-cadherin
+ transforming growth factor-beta(1)
+ PML
+ CXCL12
+ VEGF-C
+ LMP1
+ miR-30a
+ insulin-like growth factor)-1
+ histone deacetylase
+ b-catenin
+ RANTES
+ latent membrane protein (LMP) 1
+ Itgb6
+ CXCL1
+ VEGFR-3
+ glucokinase
+ TNFSF14
+ matrix metalloproteinase-3
+ TNFa
+ renin
+ VCAM1
+ GATA-6
+ Transforming growth factor-b(1
+ angiotensinogen
+ Smad7
+ cC1q-R
+ IL-1R
+ tgfbr1/2
+ FoxP3
+ a-SMA
+ CCL12
+ CCN2
+ PDE1A
+ PAR-2
+ serum albumin (HSA)-thioredoxin 1
+ tissue inhibitor of metalloproteinase (TIMP)-1
+ HIF-1a
+ FRNK
+ TGF-alpha
+ miR-200c
+ poly-ADP ribose polymerase
+ Spiruchostatin A
+ Sgpl1
+ FXa
+ Extracellular superoxide dismutase
+ Tgf-b
+ Tb4
+ CUX1
+ gC1q-R
+ FGF10
+ Tpo
+ AKT
+ insulin-like growth factor-1
+ IL-4R alpha
+ IFN-gamma
+ JAK2
+ MMP-7
+ smad3
+ NOX4
+ Bach1
+ caspase-9
+ transforming growth factor b1
+ interleukin-6
+ Serpin B3
+ Pentraxin-2
+ T-cell lymphoma invasion and metastasis 1
+ GR
+ prostaglandin F (PGF) receptor
+ serine protease
+ MMP-19
+ SMAD 3
+ MMP7
+ p62
+ connective tissue growth factor
+ Renin
+ discoidin domain receptor 2
+ mothers against decapentaplegic homolog 3
+ IL-1RA
+ Trx
+ HAI-2
+ WNT1-inducible signaling pathway protein 1
+ Ubiquitin carboxyl-terminal hydrolase-L5
+ CC16
+ interleukin-10
+ LTBP-1
+ EDA
+ BMP-5
+ miR-154
+ CD80
+ p110
+ LTBP1 and 2
+ periostin
+ EP2
+ TGF-beta
+ C3a
+ H2O2 and tumor necrosis factor alpha
+ CD1
+ Mir-154
+ cyclooxygenase-2
+ LTBP] 1, 2, and 4
+ matrix metalloproteinase-14(+)/matrix metalloproteinase-2(+) myofibroblasts
+ Chop
+ TGF-beta(1)
+ MK2
+ SMO
+ PPARy
+ insulin-like growth factor binding protein-3
+ TGF-beta(1), collagen type Ialpha1
+ hyaluronan synthase 2
+ endothelin type A receptors
+ TNF-alpha
+ TNC
+ transforming growth factor (TGF)-beta 1
+ Enhancer of zeste homolog 2
+ Snail
+ IL-1Ra
+ MMP-1
+ interleukin-8
+ PGA(1)
+ lymphotoxin beta receptor
+ TGF-beta 2
+ Lefty A
+ Sulf1
+ serine hydroxymethyltransferase 2
+ IFN-y
+ LOX
+ Transforming growth factor (TGF)-b
+ TGF- b
+ NOS
+ Smad 7
+ hypoxia-inducible factor 1a
+ Transglutaminase 2
+ mTORC2
+ C5a
+ annexin V
+ thyroid transcription factor (TTF)-1
+ CXCL9
+ transforming growth factor-beta 1
+ Vascular endothelial growth factor
+ beclin-1
+ extracellular signal-regulated kinase (ERK)1/2
+ TGF-{beta}1
+ Caspase-3
+ CCN5
+ IL-17A
+ ARPC2
+ matrix metalloproteinase 9
+ p16
+ glucagon like peptide-1
+ L1-CAM
+ TGF  -b
+ fibroblast growth factor-1
+ TGFbeta
+ IGFBP-1
+ ubiquitin carboxyl-terminal hydrolase-L5
+ SMAD3
+ glucocorticoid receptor
+ Transforming growth factor b1
+ TNF)-alpha
+ IL18
+ TOB2
+ TbRII
+ WNT7B
+ SMAD-3
+ HLA-A, -B, -DRB1, tumor necrosis factor alpha
+ TF
+ miR-3107
+ zonula occludens-1
+ Nuclear factor-erythroid-related factor 2
+ Sulf2
+ ADAMTS9
+ Surfactant Protein-C
+ TNFalpha
+ IL10
+ actin related protein 2/3 complex, subunit 2
+ secreted protein acidic and rich in cysteine
+ Krebs Von Den Lungen-6
+ Il-1b
+ HNP-1
+ Fstl1
+ miR-382
+ matrix metalloproteinase (MMP)-2 and -9
+ SphK1/2
+ IQGAP1
+ SNAI2
+ Rictor
+ SHH
+ ACTA2
+ SIRT1
+ Sema 7a-CD4
+ WNT10A
+ Insulin-like growth factor binding protein-3
+ FSP-1
+ poly(ADP-ribose) polymerase
+ LRP5
+ MMP-3
+ Interleukin (IL) 8
+ Wilms' tumor 1
+ Fibrillin-2
+ tnf-alpha
+ aSMA
+ IL-9
+ HLA-A
+ cartilage oligomeric matrix protein
+ thrombin
+ tumor necrosis factor alpha
+ beta-catenin
+ FAK
+ Th1
+ YY1
+ NFkB
+ Lox
+ Caveolin-1
+ Membrane-type (MT)-MMPs
+ Galectin-3
+ smoothened
+ Smad3
+ claudins-1 and -3
+ ERK1/2
+ Bone Morphogenic Protein Receptor 2
+ acyl-CoA oxidase 1
+ serpine1
+ VASH-2
+ miR-326
+ TGFB1
+ phosphoinositide 3-kinase
+ bone morphogenetic protein
+ interleukin (IL)-13
+ c-Myc
+ TGF-b3
+ NFATc2
+ TIMP-2
+ SMAD2
+ CD25
+ Smad2/3
+ V-ATPase
+ LMP-1
+ C1q receptor
+ glutathione peroxidase 1
+ C5a receptor
+ IL-1 alpha, -1R, -1RA, -2, -4, -4R alpha, -6, -10
+ platelet-derived growth factor isoforms (PDGF) A and B
+ IL-1-beta
+ Transforming growth factor-beta1
+ galectin-3
+ PAR1
+ SIRT7
+ p65
+ Transforming growth factor beta
+ cPLA(2)
+ desmin
+ Histone deacetylase 6
+ EMT
+ transforming growth factor (TGF)-beta1
+ IGFBP-1 and -2
+ TGFBR-2
+ transforming growth factor beta
+ HSP90
+ miR-29b
+ CD248
+ PPARbeta
+ follistatin
+ TGF-beta(1
+ Janus kinase type 2
+ A-myb
+ nuclear factor E2-related factor 2
+ Heat shock protein (HSP) 47
+ VCAM-1
+ mmu-miR-326
+ PARP
+ LXA4 receptor
+ G-CSF
+ transforming-growth factor beta 1
+ Matriptase
+ MiR-5100
+ IL-6
+ VEGFR
+ CXCL-9
+ Rpn6
+ IL-10
+ alpha1 type I collagen
+ Smad4
+ matrix metalloproteinase-9
+ PHLPP
+ Tumor necrosis factor-alpha
+ thyroid transcription factor-1
+ insulin
+ Ang-2
+ basic FGF
+ tagln
+ TGFbeta(1)
+ b-FGF
+ miR-210
+ Lrp5 and 6
+ PDGF-b
+ FN1
+ HMGA2
+ LYCAT
+ Tumor necrosis factor a
+ IL-2
+ IL1-b
+ PAI-1
+ VEGFR-2
+ igf1
+ Ho-1
+ aquaporin-5
+ VEGF receptor-2
+ COMP
+ c-jun
+ mir-155
+ megakaryoblastic leukemia 1
+ Kca3.1
+ tissue inhibitors of metalloproteinases-1
+ Secreted protein acidic and rich in cysteine
+ CD-1
+ Cyclin D1
+ tenascin C
+ phosphoserine aminotransferase 1
+ Lin28B
+ Gremlin
+ tropomodulin 3
+ PIAS4
+ interleukin 10
+ epidermal growth factor receptor
+ c-IAP2
+ fibroblast growth factor receptor 2
+ CRP1
+ Collagen Triple Helix Repeat-Containing-1
+ transforming growth factor-b1
+ PTX-2
+ CD11b
+ IL-4 R alpha
+ TG2
+ IGFBP-2
+ cytochrome b
+ BLTR
+ lysyl oxidase
+ alpha smooth muscle actin
+ UCH37
+ Receptor for advanced glycation end products
+ IL-1 beta
+ miR-376c
+ miR-153
+ Smad2/3/4
+ LEF/TCF
+ thymosin b4
+ plasminogen activator inhibitor-1
+ beta-galactosidase
+ Stanniocalcin-1
+ THP-1
+ Egr-1
+ beta-gal
+ PDGFR
+ Transforming growth factor b-1
+ transforming growth factor beta 1
+ miR-410
+ TGF-b(1
+ focal adhesion kinase
+ STAT3
+ Prostaglandin F(2alpha) receptor
+ Nox-4
+ Toll-like receptor 9
+ CCL2
+ GM-CSF
+ folate receptor beta
+ Elk1
+ interleukin (IL)-1beta
+ mTOR
+ vascular cell adhesion molecule 1
+ E-cadherin
+ PPARgamma
+ Serpine1
+ PAI1
+ TIMP
+ SFTPC
+ VEGF and IL-12
+ LTBP 4
+ Nuclear factor erythroid 2-related factor 2
+ Jun NH2-terminal kinase
+ FAK(Y397
+ IL-18
+ Transforming growth factor-b
+ il-1b
+ SphK
+ DDR2
+ FOXF1
+ TIMP1
+ SHMT2
+ SOD3
+ TGFb1
+ FN
+ TIMP2
+ FRbeta
+ Interleukin 4
+ E-cad
+ p38
+ VEGF-D
+ Periostin
+ Sp1
+ CC1q-R
+ KL-6
+ ADAM19
+ miR-185
+ USP11
+ IL8
+ Akt2
+ BMPER
+ IFN-gammaR
+ Akt
+ IL-1
+ hepatocyte growth factor
+ MAPKAPK2
+ uncoupling protein 2
+ thrombospondin-1
+ serum response factor
+ CD55
+ Gpx1
+ Id3
+ PAR-1
+ keratinocyte growth factor
+ TIGAR
+ NADPH oxidase 4
+ integrin-linked kinase
+ interleukin-1 receptor antagonist protein
+ PHGDH
+ mPGES-1
+ matrix metalloproteinase 14
+ STIP1
+ CCN1
+ angiopoietin-1
+ CD44
+ TGF-b1
+ PN-1
+ BMP endothelial cell precursor-derived regulator
+ MFG-E8
+ PPAR
+ protein kinase B
+ IGFBP-3
+ EMMPRIN
+ cyclosporine A
+ semaphorin-7A
+ SNAI1
+ Pink1
+ PINK1
+ Bone morphogenetic protein-4
+ CBP
+ IL-17
+ AT1
+ TGFBR2
+ N-acetyl-L-cysteine
+ endothelin-1
+ smad-2
+ Interleukin (IL)-6
+ ET-1
+ AP-1
+ HDAC4
+ c-Fos
+ HSP27
+ WISP1
+ Transforming growth factor beta 1
+ jag1
+ Nrf2
+ cyclooxygenase 2
+ smad6/7
+ WNT5a
+ mir-154
+ SP-D
+ Matrix metalloproteinase (MMP)-19
+ Vasohibin-2
+ caspase 3
+ Smad1/5
+ miR-200a
+ TNF-a
+ IGFBP-3 and -5
+ p53
+ Serpin B4
+ Transcription factor GATA-6
+ ACLP
+ transgelin
+ NADPH Oxidase 4
+ ZO-1
+ Cthrc1
+ VEGF-A
+ Plasminogen activator inhibitor 1
+ p300
+ extent, type B receptors
+ il12p40
+ miR-29c
+ IL-1beta
+ interleukin (IL)-17
+ transforming growth factor b(1)
+ LTB(4) receptor
+ BMP
+ extracellular signal--regulated kinase
+ interleukin-1 beta
+ TLR4
+ AGT
+ PP1
+ IGF-1
+ Thymosin b4
+ SOCS1
+ SMAD)2
+ E prostanoid receptor 2
+ b2 -AR
+ microRNA (miR)-155
+ peroxisome proliferator-activated receptor-y
+ Discoidin Domain Receptor 2
+ smad2/3
+ gp130
+ miR-31
+ MKL1
+ PPARalpha
+ TTF-1
+ Erk1/2
+ ERK
+ RXFP1
+ interleukin-18
+ protease nexin-1
+ Syndecan-2
+ RhoA
+ CD34
+ N  -cadherin
+ Rta
+ PI3K
+ fibroblast specific protein-1
+ IGFBP-5
+ PDGB
+ gremlin
+ HMG-CoA) reductase
+ Yin Yang 1
+ interleukin-1
+ p38 mitogen-activated protein kinase
+ Vi
+ CD11c
+ IL-4
+ NEU1
+ VEGF
+ CD46
+ protease-activated receptor (PAR)-2
+ C/EBP homologous protein
+ ATG4B
+ IKKa
+ AKT2
+ calnexin
+ CXCR3
+ peroxisome proliferator-activated receptor y
+ fibroblast growth factor-2
+ TGF-beta receptor II
+ CsA
+ miR  -221
+ BAX inhibitor-1
+ miR-5100
+ Ang-1
+ PEX13p
+ SDC2
+ PARK2
+ 5-HTR(1A/B) and 5-HTR(2B
+ fibronectin
+ interleukin (IL)-8
+ BMP-7
+ EP1
+ CDCP1
+ protease-activated receptor-2
+ CD8
+ CD206
+ TGF-beta receptors (T beta R-I and T beta R-II
+ HGF
+ c-Jun NH-terminal kinase
+ Col3a1
+ IRAP
+ Bcl-2
+ GLP-1
+ N-cadherin
+ Sema 7a
+ SDF-1
+ Wnt
+ GLP-1 receptor
+ sphingosine kinase 1
+ Smad2
+ transforming growth factor b-1
+ p63
+ TLR9
+ IL-13
+ X-linked inhibitor of apoptosis
+ CD19
+ syndecan-2
+ EGR1
+ STUB1
+ Lysocardiolipin acyltransferase
+ IL8, -6, and -1B
+ Wnt1-inducible signaling protein 1
+ TGF-b
+ tenascin
+ hypoxia-inducible factor-1a
+ IP-10
+ XIAP
+ transforming growth factor beta1
+ caveolin-1
+ endothelial nitric oxide synthase
+ IGF-2
+ CCR2
+ inducible nitric oxide synthase
+ bone morphogenetic protein 7
+ platelet-derived growth factor, insulin-like growth factor type I, and transforming growth factor beta
+ Itgb1/6
+ HIF-1
+ SRF
+ miR-29a
+ MPP-9
+ miR-155
+ PDGF-A and -B
+ FoxO3a
+ Cub domain containing protein 1
+ Muc5ac
+ Yin yang 1
+ Transforming growth factor b
+ Ltbp1
+ NOX-2
+ tissue inhibitor of metalloproteinase (TIMP)-1, -2, -3, and -4
+ Nox1
+ X-box-binding protein 1
+ miR-21
+ Wnt7B
+ HSP90b
+ PPAR-a
+ leucine-rich alpha-2 glycoprotein
+ TNF alpha
+ estrogen receptor 1
+ TSLP
+ signal transducer and activator of transcription 3
+ IL-8 and b-FGF
+ matrix metalloproteinase-7
+ mitogen-activated protein kinase-activated protein kinase-2
+ Smad-3
+ matrix metalloproteinase (MMP)-9
+ beta 1 integrin
+ interleukin (IL)-6
+ basic-fibroblast growth factor
+ gastrin
+ Pdgfb
+ Itga2/3
+ HLF
+ snail
+ TGFb(1)
+ Smurf2
+ STAT1
+ tissue factor
+ Glucagon like peptide-1
+ NAC
+ Lrp5
+ transforming growth factor b expression
+ insulin-like growth factor (IGF)-I
+ superoxide dismutase 3
+ vascular endothelial growth factor receptor
+ Wt1
+ as c
+ Transforming growth factor (TGF)-beta1
+ IGF-I
+ UCP2
+ Protease activated receptor-1
+ G1 and G2
+ transforming growth factor beta-1
+ FHIT
+ Wnt5A
+ TGF beta1
+ MRTF-A
+ platelet-derived growth factor receptor
+ SphK1
+ extracellular superoxide dismutase
+ Acta2
+ Toll-like receptor 4
+ ICAT
+ CXCL10
+ alpha-SMA
+ Bax inhibitor-1
+ keratin 6 and 14
+ AT2
+ MT1- and MT2-MMPs
+ NOX1
+ beta 2
+ PAI-1-siRNA
+ fibrillin-2
+ col3a1
+ IPF-1, insulin, and glucokinase
+ cyclin D1
+ COX-2
+ CD4
+ MT2
+ Transforming growth factor-beta
+ matrix metalloprotease-1
+ Thy-1
+ ATG7
+ neuraminidase 1
+ Mkl1
+ LPA1
+ Ost-4
+ MMP  -9
+ HIF1a
+ Semaphorin 7a
+ EP3
+ Transforming growth factor-b1
+ PSAT1
+ High mobility group AT-hook 2
+ jagged 1
+ n-cadherin
+ Janus kinase 2
+ let-7d
+ Fas ligand
+ integrin alpha v
+ MK2(-/-) MEF
+ interleukin-1beta
+ p21
+ Col1a2
+ MT3-MMP
+ PDGF-A
+ JNK
+ Transforming Growth Factor- b
+ PP2A
+ miR
+ claudins-1, -3, and -5
+ BARD1
+ relaxin/insulin-like family peptide receptor 1
+ MMP2
+ ATG5
+ MEK
+ CAV1
+ SIRT3
+ ANGII
+ activin
+ p38 MAPK
+ interleukin-1 (IL-1)Ra
+ hsa-miR-326
+ MAP3K19
+ surfactant protein C
+ Nox4
+ collagen (Col)1a1
+ SAP
+ miR-9-5p
+ interleukin (IL)-1b
+ p21(waf1
+ MicroRNA-29c
+ H19
+ Protease-activated receptor-1
+ ALXR
+ miR-487b
+ TGF beta
+ Connective tissue growth factor
+ matrix metalloproteinase-14
+ SERPINE1
+ mir-21
+ CC chemokine receptor 2
+ PTEN
+ IL-1 alpha
+ IPF-1
+ c-IAP)1
+ Il-17a
+ Pigment epithelium-derived factor
+ fibroblast growth factor 10
+ connective-tissue growth factor
+ BMP3
+ transforming growth factor-beta1
+ Annexin V
+ HS6ST1/2
+ fibroblast growth factor-10
+ BI-1
+ lymphotactin
+ tenascin-C
+ miR-455
+ MT1-MMP
+ transforming growth factor-b
+ 3-hydroxy-3-methylglutaryl CoenzymeA (HMG CoA) reductase
+ SMAD2/3
+ MiR-338
+ MMP-2 and -9
+ LTBP)-1
+ suppressor of cytokine signaling 1
+ neutrophil peptide (HNP)-1
+ SphK2
+ S1P lyase
+ ltbp1/2
+ iNOS
+ TGFb(1
+ ACE
+ BCL-2
+ Oct-4
+ SMA
+ MMP-9 and tissue inhibitor of metalloproteinase-1
+ Hepatocyte growth factor
+ Ets1
+ beta-actin
+ VASH-1
+ CD117
+ THBS1
+ HO1
+ Hsp90
+ Extracellular Matrix Metalloproteinase Inducer
+ COL1A1
+ AKT1
+ FGF-1
+ interleukin 6
+ caspase-3/7
+ IL6
+ receptor for advanced glycation end products
+ EP4
+ TGFb
+ HSP47
+ miR-140
+ heat shock protein (HSP)90
+ insulin-like growth factor binding proteins (IGFBP)-3 and -5
+ EZH2
+ Toll-Like Receptor 9
+ Col3A1
+ Transforming Growth Factor- Beta1
+ Osteopontin
+ hFL1
+ CXCR4
+ MMP19
+ IL-33
+ miR-17   92
+ fibrillin-1
+ ET-A
+ HDAC10
+ ALK-5
+ IL-31
+ beclin 1
+ c-Jun
+ Sema7a
+ MT5-MMP
+ PI3
+ ITGB6
+ TIAM1
+ angiotensin II
+ LRG
+ IL-1alpha
+ TbetaRII
+ transforming growth factor b
+ FIEL1
+ C5aR
+ PEDF
+ C1q
+ IL-1ra
+ tissue inhibitor of metalloproteinase
+ ErbB2
+ TGF  -b1
+ PTEN-induced putative kinase 1
+ AAG
+ CD103
+ a-smooth muscle actin
+ cytokeratin 19
+ CREB)-binding protein
+ p110a
+ Ets Domain-containing Protein Elk1
+ insulin-like growth factor I
+ Cytokeratin-8
+ TIMP3
+ BMP-15
+ LC3
+ Vimentin
+ mTORC1
+ SIRT6
+ heme oxygenase-1
+ Transforming growth factor-beta 1
+ miR-127
+ MIP-1 beta
+ TRPV4
+ Neutrophil elastase
+ ANG converting enzyme
+ ERK-1
+ bFGF
+ tumor necrosis factor-alpha
+ Serpin B3/B4
+ focal adhesion kinase-related nonkinase
+ Stat3
+ miR-1343
+ SMAD7
+ Endosialin
+ FGF-2
+ miR-101
+ L1CAM
+ thymic stromal lymphopoietin
+ vascular endothelial growth factor
+ PEX13
+ heat shock protein (HSP) 47
+ transient receptor potential vanilloid 4
+ monocyte chemoattractant protein 1
+ SPP1
+ CD68
+ TGF- b1
+ T beta RII
+ TGFb-1
+ Forkhead Box F1
--- a/tagging_Sklearn_crfsuite.py deleted 100644 → 0
View file @8969465
+++ b/tagging_Sklearn_crfsuite.py deleted 100644 → 0
View file @8969465
- # -*- coding: UTF-8 -*-
- 
- import os
- from itertools import chain
- from optparse import OptionParser
- from time import time
- from collections import Counter
- 
- import nltk
- import sklearn
- import scipy.stats
- import sys
- 
- from sklearn.externals import joblib
- from sklearn.metrics import make_scorer
- from sklearn.cross_validation import cross_val_score
- from sklearn.grid_search import RandomizedSearchCV
- 
- import sklearn_crfsuite
- from sklearn_crfsuite import scorers
- from sklearn_crfsuite import metrics
- 
- from nltk.corpus import stopwords
- from trainingTesting_Sklearn_crfsuite import word2features
- from trainingTesting_Sklearn_crfsuite import sent2features
- # from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
- # from trainingTesting_Sklearn_crfsuite import hasDigit
- 
- # Objective
- # Tagging transformed file with CRF model with sklearn-crfsuite.
- #
- # Input parameters
- # --inputPath=PATH    Path of transformed files x|y|z
- # --modelPath        Path to CRF model
- # --modelName    Model name
- # --outputPath=PATH    Output path to place output files
- # --filteringStopWords   Filtering stop words
- # --filterSymbols      Filtering punctuation marks
- 
- # Output
- # 1) Tagged files in transformed format
- 
- # Examples
- # Sentences
- # C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
- # C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
- 
- #################################
- #           FUNCTIONS           #
- #################################
- # def hasDigit(text):
- #     has = False
- #     if len(text) < 3:
- #         return False
- #     myRegex = nltk.re.compile('[0-9]')
- #     if myRegex.search(text) != None:
- #         has = True
- #     return has
- #
- #
- # def hasNonAlphaNum(text):
- #     has = False
- #     if len(text) < 3:
- #         return False
- #     myRegex = nltk.re.compile('\W')
- #     if myRegex.search(text) != None:
- #         has = True
- #     return has
- 
- # IMPORTED FROM TRAINING SCRIPT
- # def word2features(sent, i):
- #     # print "i: " + str(i)
- #     # print "sent[i]" + sent[i]
- #     listElem = sent[i].split('|')
- #     word = listElem[0]
- #     lemma = listElem[1]
- #     postag = listElem[2]
- #
- #     features = {
- #         # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
- #         # Suffixes
- #         'word[-3:]': word[-3:],
- #         'word[-2:]': word[-2:],
- #         'word[-1:]': word[-1:],
- #         'word.isupper()': word.isupper(),
- #         'word.istitle()': word.istitle(),
- #         'word.hasDigit()': hasDigit(word),
- #         'word.hasNonAlphaNum': hasNonAlphaNum(word),
- #         # 'word.isdigit()': word.isdigit(),
- #         'word': word,
- #         'lemma': lemma,
- #         'lemma[-3:]': lemma[-3:],
- #         'lemma[-2:]': lemma[-2:],
- #         'lemma[-1:]': lemma[-1:],
- #         'postag': postag,
- #         # Prefixes
- #         'postag[:2]': postag[:2],
- #         'postag[:1]': postag[:1],
- #     }
- #     if i > 0:
- #         listElem = sent[i - 1].split('|')
- #         word1 = listElem[0]
- #         lemma1 = listElem[1]
- #         postag1 = listElem[2]
- #         features.update({
- #             '-1:word.lower()': word1.lower(),
- #             '-1:word.istitle()': word1.istitle(),
- #             '-1:word.isupper()': word1.isupper(),
- #             '-1:word.hasDigit()': hasDigit(word1),
- #             '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
- #             '-1:word': word1,
- #             '-1:lemma': lemma1,
- #             '-1:postag': postag1,
- #             '-1:postag[:2]': postag1[:2],
- #             '-1:postag[:1]': postag1[:1],
- #         })
- #     # else:
- #     #    features['BOS'] = True
- #
- #     if i < len(sent) - 1:
- #         listElem = sent[i + 1].split('|')
- #         word1 = listElem[0]
- #         lemma1 = listElem[1]
- #         postag1 = listElem[2]
- #         features.update({
- #             '+1:word.lower()': word1.lower(),
- #             '+1:word.istitle()': word1.istitle(),
- #             '+1:word.isupper()': word1.isupper(),
- #             '+1:word.hasDigit()': hasDigit(word1),
- #             '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
- #             '+1:word': word1,
- #             '+1:lemma': lemma1,
- #             '+1:postag': postag1,
- #             '+1:postag[:2]': postag1[:2],
- #             '+1:postag[:1]': postag1[:1],
- #         })
- #     # else:
- #     #    features['EOS'] = True
- #     if i > 1:
- #         listElem = sent[i - 2].split('|')
- #         word2 = listElem[0]
- #         lemma2 = listElem[1]
- #         postag2 = listElem[2]
- #         features.update({
- #             '-2:word.lower()': word2.lower(),
- #             '-2:word.istitle()': word2.istitle(),
- #             '-2:word.isupper()': word2.isupper(),
- #             '-2:word.hasDigit()': hasDigit(word2),
- #             '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
- #             '-2:word': word2,
- #             '-2:lemma': lemma2,
- #             '-2:postag': postag2,
- #             '-2:postag[:2]': postag2[:2],
- #             '-2:postag[:1]': postag2[:1],
- #         })
- #
- #     if i < len(sent) - 2:
- #         listElem = sent[i + 2].split('|')
- #         word2 = listElem[0]
- #         lemma2 = listElem[1]
- #         postag2 = listElem[2]
- #         features.update({
- #             '+2:word.lower()': word2.lower(),
- #             '+2:word.istitle()': word2.istitle(),
- #             '+2:word.isupper()': word2.isupper(),
- #             '+2:word.hasDigit()': hasDigit(word2),
- #             '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
- #             '+2:word': word2,
- #             '+2:lemma': lemma2,
- #             '+2:postag': postag2,
- #             '+2:postag[:2]': postag2[:2],
- #             '+2:postag[:1]': postag2[:1],
- #         })
- #
- #     trigrams = False
- #     if trigrams:
- #         if i > 2:
- #             listElem = sent[i - 3].split('|')
- #             word3 = listElem[0]
- #             lemma3 = listElem[1]
- #             postag3 = listElem[2]
- #             features.update({
- #                 '-3:word.lower()': word3.lower(),
- #                 '-3:word.istitle()': word3.istitle(),
- #                 '-3:word.isupper()': word3.isupper(),
- #                 '-3:word.hasDigit()': hasDigit(word3),
- #                 '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
- #                 '-3:word': word3,
- #                 '-3:lemma': lemma3,
- #                 '-3:postag': postag3,
- #                 '-3:postag[:2]': postag3[:2],
- #                 '-3:postag[:1]': postag3[:1],
- #             })
- #
- #         if i < len(sent) - 3:
- #             listElem = sent[i + 3].split('|')
- #             word3 = listElem[0]
- #             lemma3 = listElem[1]
- #             postag3 = listElem[2]
- #             features.update({
- #                 '+3:word.lower()': word3.lower(),
- #                 '+3:word.istitle()': word3.istitle(),
- #                 '+3:word.isupper()': word3.isupper(),
- #                 '+3:word.hasDigit()': hasDigit(word3),
- #                 '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
- #                 '+3:word': word3,
- #                 '+3:lemma': lemma3,
- #                 '+3:postag': postag3,
- #                 '+3:postag[:2]': postag3[:2],
- #                 '+3:postag[:1]': postag3[:1],
- #             })
- #
- #     return features
- 
- 
- # def sent2features(sent):
- #     return [word2features(sent, i) for i in range(len(sent))]
- 
- 
- __author__ = 'CMendezC'
- 
- ##########################################
- #               MAIN PROGRAM             #
- ##########################################
- 
- if __name__ == "__main__":
-     # Defining parameters
-     parser = OptionParser()
-     parser.add_option("--inputPath", dest="inputPath",
-                       help="Path of training data set", metavar="PATH")
-     parser.add_option("--outputPath", dest="outputPath",
-                       help="Output path to place output files",
-                       metavar="PATH")
-     parser.add_option("--modelPath", dest="modelPath",
-                       help="Path to read CRF model",
-                       metavar="PATH")
-     parser.add_option("--modelName", dest="modelName",
-                       help="Model name", metavar="TEXT")
-     parser.add_option("--filterStopWords", default=False,
-                       action="store_true", dest="filterStopWords",
-                       help="Filtering stop words")
-     parser.add_option("--filterSymbols", default=False,
-                       action="store_true", dest="filterSymbols",
-                       help="Filtering punctuation marks")
- 
-     (options, args) = parser.parse_args()
-     if len(args) > 0:
-         parser.error("Any parameter given.")
-         sys.exit(1)
- 
-     print('-------------------------------- PARAMETERS --------------------------------')
-     print("Path to read input files: " + options.inputPath)
-     print("Mode name: " + str(options.modelName))
-     print("Model path: " + options.modelPath)
-     print("Path to place output files: " + options.outputPath)
-     print("Filtering stop words: " + str(options.filterStopWords))
-     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
-                '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
-     # symbols = [sym.decode('utf-8') for sym in ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
-     #            '}', '[', ']', '*', '%', '$', '#', '&', '°']]
-     # symbols = [u'.', u',', u':', u';', u'?', u'!', u'\'', u'"', u'<', u'>', u'(', u')', u'-', u'_', u'/', u'\\', u'¿', u'¡', u'+', u'{',
-     #             u'}', u'[', u']', u'*', u'%', u'$', u'#', u'&', u'°', u'`']
-     print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
- 
-     print('-------------------------------- PROCESSING --------------------------------')
- 
-     stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
- 
-     # Read CRF model
-     t0 = time()
-     print('Reading CRF model...')
-     crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
-     print("Reading CRF model done in: %fs" % (time() - t0))
- 
-     print('Processing corpus...')
-     t0 = time()
-     labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
-     # Walk directory to read files
-     for path, dirs, files in os.walk(options.inputPath):
-         # For each file in dir
-         for file in files:
-             print("   Preprocessing file..." + str(file))
-             sentencesInputData = []
-             sentencesOutputData = []
-             with open(os.path.join(options.inputPath, file), "r") as iFile:
-                 lines = iFile.readlines()
-                 for line in lines:
-                     listLine = []
-                     # line = line.decode("utf-8")
-                     for token in line.strip('\n').split():
-                         if options.filterStopWords:
-                             listToken = token.split('|')
-                             lemma = listToken[1]
-                             # Original if lemma in stopwords.words('english'):
-                             if lemma in stopwords:
-                                 continue
-                         if options.filterSymbols:
-                             listToken = token.split('|')
-                             lemma = listToken[1]
-                             if lemma in symbols:
-                                 if lemma == ',':
-                                     print "Coma , identificada"
-                                 continue
-                         listLine.append(token)
-                     sentencesInputData.append(listLine)
-                 print "   Sentences input data: " + str(len(sentencesInputData))
-                 # print sentencesInputData[0]
-                 # print(sent2features(sentencesInputData[0])[0])
-                 # print(sent2labels(sentencesInputData[0]))
-                 X_input = [sent2features(s) for s in sentencesInputData]
-                 print(sent2features(sentencesInputData[0])[0])
-                 # y_test = [sent2labels(s) for s in sentencesInputData]
-                 # Predicting tags
-                 t1 = time()
-                 print "   Predicting tags with model"
-                 y_pred = crf.predict(X_input)
-                 print y_pred[0]
-                 print("      Prediction done in: %fs" % (time() - t1))
-                 # Tagging with CRF model
-                 print "   Tagging file"
-                 for line, tagLine in zip(lines, y_pred):
-                     outputLine = ''
-                     idx_tagLine = 0
-                     line = line.strip('\n')
-                     print "\nLine: " + str(line)
-                     print "CRF tagged line: " + str(tagLine)
-                     for token in line.split():
-                         listToken = token.split('|')
-                         word = listToken[0]
-                         lemma = listToken[1]
-                         tag = listToken[2]
-                         if options.filterStopWords:
-                             if lemma in stopwords:
-                                 outputLine += token + ' '
-                                 continue
-                         if options.filterSymbols:
-                             if lemma in symbols:
-                                 if lemma == ',':
-                                     print "Coma , identificada"
-                                 outputLine += token + ' '
-                                 continue
-                         CRFtag = tagLine[idx_tagLine]
-                         if (tag not in labels) and (CRFtag != 'O'):
-                             print "*** CRF change token {} to {}".format(token, CRFtag)
-                             outputLine += word + '|' + lemma + '|' + CRFtag + ' '
-                         else:
-                             outputLine += word + '|' + lemma + '|' + tag + ' '
-                         idx_tagLine += 1
-                     sentencesOutputData.append(outputLine.rstrip())
-             with open(os.path.join(options.outputPath, file), "w") as oFile:
-                 for line in sentencesOutputData:
-                     oFile.write(line + '\n')
- 
-     print("Processing corpus done in: %fs" % (time() - t0))
--- a/training-validation-v1.py
View file @a7cccc5
+++ b/training-validation-v1.py
View file @a7cccc5
@@ -198,8 +198,8 @@ if __name__ == "__main__":
 
     print("Reading corpus done in: %fs" % (time() - t0))
 
-     print(sent2features(sentencesTrainingData[0])[0])
-     print(sent2features(sentencesTestData[0])[0])
+     #print(sent2features(sentencesTrainingData[0])[0])
+     #print(sent2features(sentencesTestData[0])[0])
     t0 = time()
 
     X_train = [sent2features(s) for s in sentencesTrainingData]