Larisa Morales Soto

Merge branch 'master' of pakal.ccg.unam.mx:larisams/gene-disease-embeddings

1 +#Cluster: 1
2 +3_s 3
3 +17_s 17
4 +33_s 33
5 +42_s 42
6 +43_s 43
7 +59_s 59
8 +66_s 66
9 +82_s 82
10 +84_s 84
11 +100_s 100
12 +123_s 123
13 +124_s 124
14 +132_s 131
15 +136_s 135
16 +137_s 136
17 +148_s 147
18 +188_s 186
19 +196_s 194
20 +216_s 214
21 +219_s 217
22 +226_s 224
23 +227_s 225
24 +229_s 227
25 +230_s 228
26 +234_s 232
27 +238_s 236
28 +244_s 242
29 +245_s 243
30 +269_s 266
31 +275_s 272
32 +281_s 278
33 +287_s 284
34 +290_s 287
35 +301_s 297
36 +302_s 298
37 +303_s 299
38 +315_s 311
39 +332_s 328
40 +350_s 346
41 +361_s 357
42 +366_s 362
43 +379_s 375
44 +415_s 411
45 +421_s 417
46 +444_s 440
47 +445_s 441
48 +446_s 442
49 +453_s 449
50 +#Cluster: 2
51 +6_s 6
52 +18_s 18
53 +24_s 24
54 +25_s 25
55 +29_s 29
56 +37_s 37
57 +38_s 38
58 +44_s 44
59 +47_s 47
60 +55_s 55
61 +65_s 65
62 +69_s 69
63 +70_s 70
64 +71_s 71
65 +72_s 72
66 +79_s 79
67 +81_s 81
68 +86_s 86
69 +92_s 92
70 +94_s 94
71 +101_s 101
72 +102_s 102
73 +104_s 104
74 +110_s 110
75 +112_s 112
76 +115_s 115
77 +118_s 118
78 +120_s 120
79 +127_s 126
80 +128_s 127
81 +130_s 129
82 +144_s 143
83 +146_s 145
84 +156_s 155
85 +161_s 160
86 +164_s 163
87 +166_s 165
88 +169_s 168
89 +179_s 177
90 +182_s 180
91 +186_s 184
92 +189_s 187
93 +192_s 190
94 +195_s 193
95 +204_s 202
96 +205_s 203
97 +208_s 206
98 +217_s 215
99 +221_s 219
100 +222_s 220
101 +223_s 221
102 +225_s 223
103 +233_s 231
104 +236_s 234
105 +241_s 239
106 +242_s 240
107 +247_s 245
108 +260_s 257
109 +262_s 259
110 +265_s 262
111 +272_s 269
112 +278_s 275
113 +279_s 276
114 +280_s 277
115 +284_s 281
116 +289_s 286
117 +297_s 293
118 +300_s 296
119 +307_s 303
120 +308_s 304
121 +310_s 306
122 +316_s 312
123 +318_s 314
124 +320_s 316
125 +326_s 322
126 +330_s 326
127 +335_s 331
128 +336_s 332
129 +340_s 336
130 +348_s 344
131 +352_s 348
132 +353_s 349
133 +355_s 351
134 +357_s 353
135 +371_s 367
136 +383_s 379
137 +384_s 380
138 +392_s 388
139 +399_s 395
140 +402_s 398
141 +408_s 404
142 +411_s 407
143 +413_s 409
144 +414_s 410
145 +418_s 414
146 +419_s 415
147 +424_s 420
148 +428_s 424
149 +429_s 425
150 +433_s 429
151 +438_s 434
152 +440_s 436
153 +447_s 443
154 +450_s 446
155 +#Cluster: 3
156 +1_s 1
157 +5_s 5
158 +7_s 7
159 +8_s 8
160 +10_s 10
161 +12_s 12
162 +13_s 13
163 +14_s 14
164 +16_s 16
165 +20_s 20
166 +22_s 22
167 +23_s 23
168 +28_s 28
169 +31_s 31
170 +34_s 34
171 +48_s 48
172 +49_s 49
173 +54_s 54
174 +58_s 58
175 +64_s 64
176 +67_s 67
177 +74_s 74
178 +75_s 75
179 +80_s 80
180 +83_s 83
181 +87_s 87
182 +88_s 88
183 +89_s 89
184 +90_s 90
185 +91_s 91
186 +93_s 93
187 +98_s 98
188 +106_s 106
189 +109_s 109
190 +117_s 117
191 +138_s 137
192 +141_s 140
193 +142_s 141
194 +147_s 146
195 +149_s 148
196 +150_s 149
197 +159_s 158
198 +162_s 161
199 +163_s 162
200 +165_s 164
201 +171_s 169
202 +174_s 172
203 +177_s 175
204 +184_s 182
205 +190_s 188
206 +197_s 195
207 +198_s 196
208 +199_s 197
209 +203_s 201
210 +210_s 208
211 +214_s 212
212 +215_s 213
213 +220_s 218
214 +237_s 235
215 +239_s 237
216 +240_s 238
217 +243_s 241
218 +246_s 244
219 +248_s 246
220 +250_s 248
221 +251_s 249
222 +253_s 250
223 +254_s 251
224 +255_s 252
225 +256_s 253
226 +264_s 261
227 +268_s 265
228 +273_s 270
229 +286_s 283
230 +291_s 288
231 +293_s 289
232 +295_s 291
233 +298_s 294
234 +299_s 295
235 +305_s 301
236 +313_s 309
237 +314_s 310
238 +317_s 313
239 +322_s 318
240 +325_s 321
241 +328_s 324
242 +333_s 329
243 +343_s 339
244 +345_s 341
245 +347_s 343
246 +349_s 345
247 +354_s 350
248 +356_s 352
249 +360_s 356
250 +362_s 358
251 +363_s 359
252 +364_s 360
253 +365_s 361
254 +367_s 363
255 +368_s 364
256 +372_s 368
257 +374_s 370
258 +375_s 371
259 +376_s 372
260 +377_s 373
261 +378_s 374
262 +380_s 376
263 +387_s 383
264 +388_s 384
265 +390_s 386
266 +391_s 387
267 +398_s 394
268 +401_s 397
269 +403_s 399
270 +407_s 403
271 +409_s 405
272 +416_s 412
273 +420_s 416
274 +431_s 427
275 +437_s 433
276 +442_s 438
277 +443_s 439
278 +448_s 444
279 +451_s 447
280 +452_s 448
281 +#Cluster: 4
282 +9_s 9
283 +19_s 19
284 +21_s 21
285 +26_s 26
286 +32_s 32
287 +39_s 39
288 +52_s 52
289 +73_s 73
290 +77_s 77
291 +95_s 95
292 +107_s 107
293 +113_s 113
294 +125_s 125
295 +129_s 128
296 +131_s 130
297 +145_s 144
298 +153_s 152
299 +158_s 157
300 +172_s 170
301 +175_s 173
302 +180_s 178
303 +200_s 198
304 +202_s 200
305 +207_s 205
306 +224_s 222
307 +228_s 226
308 +231_s 229
309 +259_s 256
310 +263_s 260
311 +274_s 271
312 +283_s 280
313 +288_s 285
314 +306_s 302
315 +321_s 317
316 +323_s 319
317 +334_s 330
318 +359_s 355
319 +382_s 378
320 +395_s 391
321 +417_s 413
322 +434_s 430
323 +454_s 450
324 +#Cluster: 5
325 +2_s 2
326 +4_s 4
327 +11_s 11
328 +15_s 15
329 +27_s 27
330 +30_s 30
331 +35_s 35
332 +36_s 36
333 +40_s 40
334 +41_s 41
335 +45_s 45
336 +46_s 46
337 +50_s 50
338 +51_s 51
339 +53_s 53
340 +56_s 56
341 +57_s 57
342 +60_s 60
343 +61_s 61
344 +62_s 62
345 +63_s 63
346 +68_s 68
347 +76_s 76
348 +78_s 78
349 +85_s 85
350 +96_s 96
351 +97_s 97
352 +99_s 99
353 +103_s 103
354 +105_s 105
355 +108_s 108
356 +111_s 111
357 +114_s 114
358 +116_s 116
359 +119_s 119
360 +121_s 121
361 +122_s 122
362 +133_s 132
363 +134_s 133
364 +135_s 134
365 +139_s 138
366 +140_s 139
367 +143_s 142
368 +151_s 150
369 +152_s 151
370 +154_s 153
371 +155_s 154
372 +157_s 156
373 +160_s 159
374 +167_s 166
375 +168_s 167
376 +173_s 171
377 +176_s 174
378 +178_s 176
379 +181_s 179
380 +183_s 181
381 +185_s 183
382 +187_s 185
383 +191_s 189
384 +193_s 191
385 +194_s 192
386 +201_s 199
387 +206_s 204
388 +209_s 207
389 +211_s 209
390 +212_s 210
391 +213_s 211
392 +218_s 216
393 +232_s 230
394 +235_s 233
395 +249_s 247
396 +257_s 254
397 +258_s 255
398 +261_s 258
399 +266_s 263
400 +267_s 264
401 +270_s 267
402 +271_s 268
403 +276_s 273
404 +277_s 274
405 +282_s 279
406 +285_s 282
407 +294_s 290
408 +296_s 292
409 +304_s 300
410 +309_s 305
411 +311_s 307
412 +312_s 308
413 +319_s 315
414 +324_s 320
415 +327_s 323
416 +329_s 325
417 +331_s 327
418 +337_s 333
419 +338_s 334
420 +339_s 335
421 +341_s 337
422 +342_s 338
423 +344_s 340
424 +346_s 342
425 +351_s 347
426 +358_s 354
427 +369_s 365
428 +370_s 366
429 +373_s 369
430 +381_s 377
431 +385_s 381
432 +386_s 382
433 +389_s 385
434 +393_s 389
435 +394_s 390
436 +396_s 392
437 +397_s 393
438 +400_s 396
439 +404_s 400
440 +405_s 401
441 +406_s 402
442 +410_s 406
443 +412_s 408
444 +422_s 418
445 +423_s 419
446 +425_s 421
447 +426_s 422
448 +427_s 423
449 +430_s 426
450 +432_s 428
451 +435_s 431
452 +436_s 432
453 +439_s 435
454 +441_s 437
455 +449_s 445
456 +455_s 451
1 +#Cluster: 1
2 +3_s 3
3 +17_s 17
4 +33_s 33
5 +42_s 42
6 +43_s 43
7 +59_s 59
8 +66_s 66
9 +82_s 82
10 +84_s 84
11 +100_s 100
12 +123_s 123
13 +124_s 124
14 +132_s 131
15 +136_s 135
16 +137_s 136
17 +148_s 147
18 +188_s 186
19 +196_s 194
20 +216_s 214
21 +219_s 217
22 +226_s 224
23 +227_s 225
24 +229_s 227
25 +230_s 228
26 +234_s 232
27 +238_s 236
28 +244_s 242
29 +245_s 243
30 +269_s 266
31 +275_s 272
32 +281_s 278
33 +287_s 284
34 +290_s 287
35 +301_s 297
36 +302_s 298
37 +303_s 299
38 +315_s 311
39 +332_s 328
40 +350_s 346
41 +361_s 357
42 +366_s 362
43 +379_s 375
44 +415_s 411
45 +421_s 417
46 +444_s 440
47 +445_s 441
48 +446_s 442
49 +453_s 449
50 +#Cluster: 2
51 +6_s 6
52 +18_s 18
53 +24_s 24
54 +25_s 25
55 +29_s 29
56 +37_s 37
57 +38_s 38
58 +44_s 44
59 +47_s 47
60 +55_s 55
61 +65_s 65
62 +69_s 69
63 +70_s 70
64 +71_s 71
65 +72_s 72
66 +79_s 79
67 +81_s 81
68 +86_s 86
69 +92_s 92
70 +94_s 94
71 +101_s 101
72 +102_s 102
73 +104_s 104
74 +110_s 110
75 +112_s 112
76 +115_s 115
77 +118_s 118
78 +120_s 120
79 +127_s 126
80 +128_s 127
81 +130_s 129
82 +144_s 143
83 +146_s 145
84 +156_s 155
85 +161_s 160
86 +164_s 163
87 +166_s 165
88 +169_s 168
89 +179_s 177
90 +182_s 180
91 +186_s 184
92 +189_s 187
93 +192_s 190
94 +195_s 193
95 +204_s 202
96 +205_s 203
97 +208_s 206
98 +217_s 215
99 +221_s 219
100 +222_s 220
101 +223_s 221
102 +225_s 223
103 +233_s 231
104 +236_s 234
105 +241_s 239
106 +242_s 240
107 +247_s 245
108 +260_s 257
109 +262_s 259
110 +265_s 262
111 +272_s 269
112 +278_s 275
113 +279_s 276
114 +280_s 277
115 +284_s 281
116 +289_s 286
117 +297_s 293
118 +300_s 296
119 +307_s 303
120 +308_s 304
121 +310_s 306
122 +316_s 312
123 +318_s 314
124 +320_s 316
125 +326_s 322
126 +330_s 326
127 +335_s 331
128 +336_s 332
129 +340_s 336
130 +348_s 344
131 +352_s 348
132 +353_s 349
133 +355_s 351
134 +357_s 353
135 +371_s 367
136 +383_s 379
137 +384_s 380
138 +392_s 388
139 +399_s 395
140 +402_s 398
141 +408_s 404
142 +411_s 407
143 +413_s 409
144 +414_s 410
145 +418_s 414
146 +419_s 415
147 +424_s 420
148 +428_s 424
149 +429_s 425
150 +433_s 429
151 +438_s 434
152 +440_s 436
153 +447_s 443
154 +450_s 446
155 +#Cluster: 3
156 +1_s 1
157 +5_s 5
158 +7_s 7
159 +8_s 8
160 +10_s 10
161 +12_s 12
162 +13_s 13
163 +14_s 14
164 +16_s 16
165 +20_s 20
166 +22_s 22
167 +23_s 23
168 +28_s 28
169 +31_s 31
170 +34_s 34
171 +48_s 48
172 +49_s 49
173 +54_s 54
174 +58_s 58
175 +64_s 64
176 +67_s 67
177 +74_s 74
178 +75_s 75
179 +80_s 80
180 +83_s 83
181 +87_s 87
182 +88_s 88
183 +89_s 89
184 +90_s 90
185 +91_s 91
186 +93_s 93
187 +98_s 98
188 +106_s 106
189 +109_s 109
190 +117_s 117
191 +138_s 137
192 +141_s 140
193 +142_s 141
194 +147_s 146
195 +149_s 148
196 +150_s 149
197 +159_s 158
198 +162_s 161
199 +163_s 162
200 +165_s 164
201 +171_s 169
202 +174_s 172
203 +177_s 175
204 +184_s 182
205 +190_s 188
206 +197_s 195
207 +198_s 196
208 +199_s 197
209 +203_s 201
210 +210_s 208
211 +214_s 212
212 +215_s 213
213 +220_s 218
214 +237_s 235
215 +239_s 237
216 +240_s 238
217 +243_s 241
218 +246_s 244
219 +248_s 246
220 +250_s 248
221 +251_s 249
222 +253_s 250
223 +254_s 251
224 +255_s 252
225 +256_s 253
226 +264_s 261
227 +268_s 265
228 +273_s 270
229 +286_s 283
230 +291_s 288
231 +293_s 289
232 +295_s 291
233 +298_s 294
234 +299_s 295
235 +305_s 301
236 +313_s 309
237 +314_s 310
238 +317_s 313
239 +322_s 318
240 +325_s 321
241 +328_s 324
242 +333_s 329
243 +343_s 339
244 +345_s 341
245 +347_s 343
246 +349_s 345
247 +354_s 350
248 +356_s 352
249 +360_s 356
250 +362_s 358
251 +363_s 359
252 +364_s 360
253 +365_s 361
254 +367_s 363
255 +368_s 364
256 +372_s 368
257 +374_s 370
258 +375_s 371
259 +376_s 372
260 +377_s 373
261 +378_s 374
262 +380_s 376
263 +387_s 383
264 +388_s 384
265 +390_s 386
266 +391_s 387
267 +398_s 394
268 +401_s 397
269 +403_s 399
270 +407_s 403
271 +409_s 405
272 +416_s 412
273 +420_s 416
274 +431_s 427
275 +437_s 433
276 +442_s 438
277 +443_s 439
278 +448_s 444
279 +451_s 447
280 +452_s 448
281 +#Cluster: 4
282 +9_s 9
283 +19_s 19
284 +21_s 21
285 +26_s 26
286 +32_s 32
287 +39_s 39
288 +52_s 52
289 +73_s 73
290 +77_s 77
291 +95_s 95
292 +107_s 107
293 +113_s 113
294 +125_s 125
295 +129_s 128
296 +131_s 130
297 +145_s 144
298 +153_s 152
299 +158_s 157
300 +172_s 170
301 +175_s 173
302 +180_s 178
303 +200_s 198
304 +202_s 200
305 +207_s 205
306 +224_s 222
307 +228_s 226
308 +231_s 229
309 +259_s 256
310 +263_s 260
311 +274_s 271
312 +283_s 280
313 +288_s 285
314 +306_s 302
315 +321_s 317
316 +323_s 319
317 +334_s 330
318 +359_s 355
319 +382_s 378
320 +395_s 391
321 +417_s 413
322 +434_s 430
323 +454_s 450
324 +#Cluster: 5
325 +11_s 11
326 +27_s 27
327 +30_s 30
328 +36_s 36
329 +40_s 40
330 +46_s 46
331 +53_s 53
332 +57_s 57
333 +60_s 60
334 +62_s 62
335 +63_s 63
336 +78_s 78
337 +96_s 96
338 +99_s 99
339 +114_s 114
340 +122_s 122
341 +134_s 133
342 +135_s 134
343 +154_s 153
344 +155_s 154
345 +160_s 159
346 +167_s 166
347 +168_s 167
348 +173_s 171
349 +187_s 185
350 +194_s 192
351 +206_s 204
352 +211_s 209
353 +213_s 211
354 +249_s 247
355 +257_s 254
356 +258_s 255
357 +267_s 264
358 +271_s 268
359 +276_s 273
360 +282_s 279
361 +285_s 282
362 +296_s 292
363 +319_s 315
364 +346_s 342
365 +351_s 347
366 +358_s 354
367 +385_s 381
368 +386_s 382
369 +389_s 385
370 +396_s 392
371 +405_s 401
372 +423_s 419
373 +427_s 423
374 +430_s 426
375 +449_s 445
376 +455_s 451
377 +#Cluster: 6
378 +2_s 2
379 +4_s 4
380 +15_s 15
381 +35_s 35
382 +41_s 41
383 +45_s 45
384 +50_s 50
385 +51_s 51
386 +56_s 56
387 +61_s 61
388 +68_s 68
389 +76_s 76
390 +85_s 85
391 +97_s 97
392 +103_s 103
393 +105_s 105
394 +108_s 108
395 +111_s 111
396 +116_s 116
397 +119_s 119
398 +121_s 121
399 +133_s 132
400 +139_s 138
401 +140_s 139
402 +143_s 142
403 +151_s 150
404 +152_s 151
405 +157_s 156
406 +176_s 174
407 +178_s 176
408 +181_s 179
409 +183_s 181
410 +185_s 183
411 +191_s 189
412 +193_s 191
413 +201_s 199
414 +209_s 207
415 +212_s 210
416 +218_s 216
417 +232_s 230
418 +235_s 233
419 +261_s 258
420 +266_s 263
421 +270_s 267
422 +277_s 274
423 +294_s 290
424 +304_s 300
425 +309_s 305
426 +311_s 307
427 +312_s 308
428 +324_s 320
429 +327_s 323
430 +329_s 325
431 +331_s 327
432 +337_s 333
433 +338_s 334
434 +339_s 335
435 +341_s 337
436 +342_s 338
437 +344_s 340
438 +369_s 365
439 +370_s 366
440 +373_s 369
441 +381_s 377
442 +393_s 389
443 +394_s 390
444 +397_s 393
445 +400_s 396
446 +404_s 400
447 +406_s 402
448 +410_s 406
449 +412_s 408
450 +422_s 418
451 +425_s 421
452 +426_s 422
453 +432_s 428
454 +435_s 431
455 +436_s 432
456 +439_s 435
457 +441_s 437
...@@ -66,3 +66,21 @@ print_cluster(cls4, "SentenceMembership_4clusters.txt") ...@@ -66,3 +66,21 @@ print_cluster(cls4, "SentenceMembership_4clusters.txt")
66 66
67 67
68 68
69 +####
70 +# 5 clusters: draw the dendrogram of senclus into a PNG, outline the k = 5 cut, then hand the resulting clusters to print_cluster()
71 +png("Dendogram_5clusters.png", height = 600, width = 975)  # open a 975 x 600 PNG device; file name (spelled "Dendogram") is kept as-is because it is the output path
72 +plot(senclus, hang=-1)  # negative hang: all leaf labels are drawn hanging down from height 0
73 +cls5 <- rect.hclust(senclus, k=5, border = 3:4)  # draws boxes on the open plot and returns a list with the members of each of the 5 clusters
74 +dev.off()  # close the device so the PNG is written out
75 +
76 +print_cluster(cls5, "SentenceMembership_5clusters.txt")  # print_cluster is defined outside this hunk; presumably writes the membership listing to this file - confirm
77 +
78 +
79 +###
80 +# 6 clusters: same steps as the 5-cluster section, cutting the same senclus tree at k = 6
81 +png("Dendogram_6clusters.png", height = 600, width = 975)  # same device size as the 5-cluster plot
82 +plot(senclus, hang=-1)  # the tree must be re-plotted because rect.hclust draws on the current plot
83 +cls6 <- rect.hclust(senclus, k=6, border = 3:4)  # border colours 3 and 4 are recycled across the 6 boxes
84 +dev.off()  # close the device so the PNG is written out
85 +print_cluster(cls6, "SentenceMembership_6clusters.txt")  # presumably writes the 6-cluster membership listing - confirm against print_cluster's definition
86 +
......
...@@ -5,3 +5,8 @@ grep -E "\|t\|" ../corpora/full-sentences.txt | perl -ne 'if(/(\d+\|t\|)(.*)/){p ...@@ -5,3 +5,8 @@ grep -E "\|t\|" ../corpora/full-sentences.txt | perl -ne 'if(/(\d+\|t\|)(.*)/){p
5 # Run wisse 5 # Run wisse
6 6
7 python3.4 wisse_example.py --input ../corpora/articles-titles.txt --idfmodel local --embedmodel /export/space1/users/compu2/bionlp/word-embeddings/w2v/almac/ignacio/data/word2vec/indexed_w2v_En_vector_space_H300 --localw binary --output ../embeddings/articles-titles.vec --format wisse & 7 python3.4 wisse_example.py --input ../corpora/articles-titles.txt --idfmodel local --embedmodel /export/space1/users/compu2/bionlp/word-embeddings/w2v/almac/ignacio/data/word2vec/indexed_w2v_En_vector_space_H300 --localw binary --output ../embeddings/articles-titles.vec --format wisse &
8 +
9 +# Within cluster sentence analysis
10 +
11 +grep -c -E '<g>.*<d>|<d>.*<g>' cluster1_tagged.txt cluster2_tagged.txt cluster3_tagged.txt cluster4_tagged.txt
12 +wc -l cluster1_tagged.txt cluster2_tagged.txt cluster3_tagged.txt cluster4_tagged.txt
......