Larisa Morales Soto

Merge branch 'master' of pakal.ccg.unam.mx:larisams/gene-disease-embeddings

......@@ -14,4 +14,8 @@ The main goal of this project is to obtain sentence embeddings of 450 abstract t
### Programs and commands used
```Shell
\scripts
```
### Results from clustering
```Shell
\results
```
\ No newline at end of file
......
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
#Cluster: 1
1_s 1
3_s 3
5_s 5
6_s 6
7_s 7
8_s 8
10_s 10
12_s 12
13_s 13
14_s 14
16_s 16
17_s 17
18_s 18
20_s 20
22_s 22
23_s 23
24_s 24
25_s 25
28_s 28
29_s 29
31_s 31
33_s 33
34_s 34
37_s 37
38_s 38
42_s 42
43_s 43
44_s 44
47_s 47
48_s 48
49_s 49
54_s 54
55_s 55
58_s 58
59_s 59
64_s 64
65_s 65
66_s 66
67_s 67
69_s 69
70_s 70
71_s 71
72_s 72
74_s 74
75_s 75
79_s 79
80_s 80
81_s 81
82_s 82
83_s 83
84_s 84
86_s 86
87_s 87
88_s 88
89_s 89
90_s 90
91_s 91
92_s 92
93_s 93
94_s 94
98_s 98
100_s 100
101_s 101
102_s 102
104_s 104
106_s 106
109_s 109
110_s 110
112_s 112
115_s 115
117_s 117
118_s 118
120_s 120
123_s 123
124_s 124
127_s 126
128_s 127
130_s 129
132_s 131
136_s 135
137_s 136
138_s 137
141_s 140
142_s 141
144_s 143
146_s 145
147_s 146
148_s 147
149_s 148
150_s 149
156_s 155
159_s 158
161_s 160
162_s 161
163_s 162
164_s 163
165_s 164
166_s 165
169_s 168
171_s 169
174_s 172
177_s 175
179_s 177
182_s 180
184_s 182
186_s 184
188_s 186
189_s 187
190_s 188
192_s 190
195_s 193
196_s 194
197_s 195
198_s 196
199_s 197
203_s 201
204_s 202
205_s 203
208_s 206
210_s 208
214_s 212
215_s 213
216_s 214
217_s 215
219_s 217
220_s 218
221_s 219
222_s 220
223_s 221
225_s 223
226_s 224
227_s 225
229_s 227
230_s 228
233_s 231
234_s 232
236_s 234
237_s 235
238_s 236
239_s 237
240_s 238
241_s 239
242_s 240
243_s 241
244_s 242
245_s 243
246_s 244
247_s 245
248_s 246
250_s 248
251_s 249
253_s 250
254_s 251
255_s 252
256_s 253
260_s 257
262_s 259
264_s 261
265_s 262
268_s 265
269_s 266
272_s 269
273_s 270
275_s 272
278_s 275
279_s 276
280_s 277
281_s 278
284_s 281
286_s 283
287_s 284
289_s 286
290_s 287
291_s 288
293_s 289
295_s 291
297_s 293
298_s 294
299_s 295
300_s 296
301_s 297
302_s 298
303_s 299
305_s 301
307_s 303
308_s 304
310_s 306
313_s 309
314_s 310
315_s 311
316_s 312
317_s 313
318_s 314
320_s 316
322_s 318
325_s 321
326_s 322
328_s 324
330_s 326
332_s 328
333_s 329
335_s 331
336_s 332
340_s 336
343_s 339
345_s 341
347_s 343
348_s 344
349_s 345
350_s 346
352_s 348
353_s 349
354_s 350
355_s 351
356_s 352
357_s 353
360_s 356
361_s 357
362_s 358
363_s 359
364_s 360
365_s 361
366_s 362
367_s 363
368_s 364
371_s 367
372_s 368
374_s 370
375_s 371
376_s 372
377_s 373
378_s 374
379_s 375
380_s 376
383_s 379
384_s 380
387_s 383
388_s 384
390_s 386
391_s 387
392_s 388
398_s 394
399_s 395
401_s 397
402_s 398
403_s 399
407_s 403
408_s 404
409_s 405
411_s 407
413_s 409
414_s 410
415_s 411
416_s 412
418_s 414
419_s 415
420_s 416
421_s 417
424_s 420
428_s 424
429_s 425
431_s 427
433_s 429
437_s 433
438_s 434
440_s 436
442_s 438
443_s 439
444_s 440
445_s 441
446_s 442
447_s 443
448_s 444
450_s 446
451_s 447
452_s 448
453_s 449
#Cluster: 2
2_s 2
4_s 4
9_s 9
11_s 11
15_s 15
19_s 19
21_s 21
26_s 26
27_s 27
30_s 30
32_s 32
35_s 35
36_s 36
39_s 39
40_s 40
41_s 41
45_s 45
46_s 46
50_s 50
51_s 51
52_s 52
53_s 53
56_s 56
57_s 57
60_s 60
61_s 61
62_s 62
63_s 63
68_s 68
73_s 73
76_s 76
77_s 77
78_s 78
85_s 85
95_s 95
96_s 96
97_s 97
99_s 99
103_s 103
105_s 105
107_s 107
108_s 108
111_s 111
113_s 113
114_s 114
116_s 116
119_s 119
121_s 121
122_s 122
125_s 125
129_s 128
131_s 130
133_s 132
134_s 133
135_s 134
139_s 138
140_s 139
143_s 142
145_s 144
151_s 150
152_s 151
153_s 152
154_s 153
155_s 154
157_s 156
158_s 157
160_s 159
167_s 166
168_s 167
172_s 170
173_s 171
175_s 173
176_s 174
178_s 176
180_s 178
181_s 179
183_s 181
185_s 183
187_s 185
191_s 189
193_s 191
194_s 192
200_s 198
201_s 199
202_s 200
206_s 204
207_s 205
209_s 207
211_s 209
212_s 210
213_s 211
218_s 216
224_s 222
228_s 226
231_s 229
232_s 230
235_s 233
249_s 247
257_s 254
258_s 255
259_s 256
261_s 258
263_s 260
266_s 263
267_s 264
270_s 267
271_s 268
274_s 271
276_s 273
277_s 274
282_s 279
283_s 280
285_s 282
288_s 285
294_s 290
296_s 292
304_s 300
306_s 302
309_s 305
311_s 307
312_s 308
319_s 315
321_s 317
323_s 319
324_s 320
327_s 323
329_s 325
331_s 327
334_s 330
337_s 333
338_s 334
339_s 335
341_s 337
342_s 338
344_s 340
346_s 342
351_s 347
358_s 354
359_s 355
369_s 365
370_s 366
373_s 369
381_s 377
382_s 378
385_s 381
386_s 382
389_s 385
393_s 389
394_s 390
395_s 391
396_s 392
397_s 393
400_s 396
404_s 400
405_s 401
406_s 402
410_s 406
412_s 408
417_s 413
422_s 418
423_s 419
425_s 421
426_s 422
427_s 423
430_s 426
432_s 428
434_s 430
435_s 431
436_s 432
439_s 435
441_s 437
449_s 445
454_s 450
455_s 451
\ No newline at end of file
#Cluster: 1
1_s 1
3_s 3
5_s 5
6_s 6
7_s 7
8_s 8
10_s 10
12_s 12
13_s 13
14_s 14
16_s 16
17_s 17
18_s 18
20_s 20
22_s 22
23_s 23
24_s 24
25_s 25
28_s 28
29_s 29
31_s 31
33_s 33
34_s 34
37_s 37
38_s 38
42_s 42
43_s 43
44_s 44
47_s 47
48_s 48
49_s 49
54_s 54
55_s 55
58_s 58
59_s 59
64_s 64
65_s 65
66_s 66
67_s 67
69_s 69
70_s 70
71_s 71
72_s 72
74_s 74
75_s 75
79_s 79
80_s 80
81_s 81
82_s 82
83_s 83
84_s 84
86_s 86
87_s 87
88_s 88
89_s 89
90_s 90
91_s 91
92_s 92
93_s 93
94_s 94
98_s 98
100_s 100
101_s 101
102_s 102
104_s 104
106_s 106
109_s 109
110_s 110
112_s 112
115_s 115
117_s 117
118_s 118
120_s 120
123_s 123
124_s 124
127_s 126
128_s 127
130_s 129
132_s 131
136_s 135
137_s 136
138_s 137
141_s 140
142_s 141
144_s 143
146_s 145
147_s 146
148_s 147
149_s 148
150_s 149
156_s 155
159_s 158
161_s 160
162_s 161
163_s 162
164_s 163
165_s 164
166_s 165
169_s 168
171_s 169
174_s 172
177_s 175
179_s 177
182_s 180
184_s 182
186_s 184
188_s 186
189_s 187
190_s 188
192_s 190
195_s 193
196_s 194
197_s 195
198_s 196
199_s 197
203_s 201
204_s 202
205_s 203
208_s 206
210_s 208
214_s 212
215_s 213
216_s 214
217_s 215
219_s 217
220_s 218
221_s 219
222_s 220
223_s 221
225_s 223
226_s 224
227_s 225
229_s 227
230_s 228
233_s 231
234_s 232
236_s 234
237_s 235
238_s 236
239_s 237
240_s 238
241_s 239
242_s 240
243_s 241
244_s 242
245_s 243
246_s 244
247_s 245
248_s 246
250_s 248
251_s 249
253_s 250
254_s 251
255_s 252
256_s 253
260_s 257
262_s 259
264_s 261
265_s 262
268_s 265
269_s 266
272_s 269
273_s 270
275_s 272
278_s 275
279_s 276
280_s 277
281_s 278
284_s 281
286_s 283
287_s 284
289_s 286
290_s 287
291_s 288
293_s 289
295_s 291
297_s 293
298_s 294
299_s 295
300_s 296
301_s 297
302_s 298
303_s 299
305_s 301
307_s 303
308_s 304
310_s 306
313_s 309
314_s 310
315_s 311
316_s 312
317_s 313
318_s 314
320_s 316
322_s 318
325_s 321
326_s 322
328_s 324
330_s 326
332_s 328
333_s 329
335_s 331
336_s 332
340_s 336
343_s 339
345_s 341
347_s 343
348_s 344
349_s 345
350_s 346
352_s 348
353_s 349
354_s 350
355_s 351
356_s 352
357_s 353
360_s 356
361_s 357
362_s 358
363_s 359
364_s 360
365_s 361
366_s 362
367_s 363
368_s 364
371_s 367
372_s 368
374_s 370
375_s 371
376_s 372
377_s 373
378_s 374
379_s 375
380_s 376
383_s 379
384_s 380
387_s 383
388_s 384
390_s 386
391_s 387
392_s 388
398_s 394
399_s 395
401_s 397
402_s 398
403_s 399
407_s 403
408_s 404
409_s 405
411_s 407
413_s 409
414_s 410
415_s 411
416_s 412
418_s 414
419_s 415
420_s 416
421_s 417
424_s 420
428_s 424
429_s 425
431_s 427
433_s 429
437_s 433
438_s 434
440_s 436
442_s 438
443_s 439
444_s 440
445_s 441
446_s 442
447_s 443
448_s 444
450_s 446
451_s 447
452_s 448
453_s 449
#Cluster: 2
9_s 9
19_s 19
21_s 21
26_s 26
32_s 32
39_s 39
52_s 52
73_s 73
77_s 77
95_s 95
107_s 107
113_s 113
125_s 125
129_s 128
131_s 130
145_s 144
153_s 152
158_s 157
172_s 170
175_s 173
180_s 178
200_s 198
202_s 200
207_s 205
224_s 222
228_s 226
231_s 229
259_s 256
263_s 260
274_s 271
283_s 280
288_s 285
306_s 302
321_s 317
323_s 319
334_s 330
359_s 355
382_s 378
395_s 391
417_s 413
434_s 430
454_s 450
#Cluster: 3
2_s 2
4_s 4
11_s 11
15_s 15
27_s 27
30_s 30
35_s 35
36_s 36
40_s 40
41_s 41
45_s 45
46_s 46
50_s 50
51_s 51
53_s 53
56_s 56
57_s 57
60_s 60
61_s 61
62_s 62
63_s 63
68_s 68
76_s 76
78_s 78
85_s 85
96_s 96
97_s 97
99_s 99
103_s 103
105_s 105
108_s 108
111_s 111
114_s 114
116_s 116
119_s 119
121_s 121
122_s 122
133_s 132
134_s 133
135_s 134
139_s 138
140_s 139
143_s 142
151_s 150
152_s 151
154_s 153
155_s 154
157_s 156
160_s 159
167_s 166
168_s 167
173_s 171
176_s 174
178_s 176
181_s 179
183_s 181
185_s 183
187_s 185
191_s 189
193_s 191
194_s 192
201_s 199
206_s 204
209_s 207
211_s 209
212_s 210
213_s 211
218_s 216
232_s 230
235_s 233
249_s 247
257_s 254
258_s 255
261_s 258
266_s 263
267_s 264
270_s 267
271_s 268
276_s 273
277_s 274
282_s 279
285_s 282
294_s 290
296_s 292
304_s 300
309_s 305
311_s 307
312_s 308
319_s 315
324_s 320
327_s 323
329_s 325
331_s 327
337_s 333
338_s 334
339_s 335
341_s 337
342_s 338
344_s 340
346_s 342
351_s 347
358_s 354
369_s 365
370_s 366
373_s 369
381_s 377
385_s 381
386_s 382
389_s 385
393_s 389
394_s 390
396_s 392
397_s 393
400_s 396
404_s 400
405_s 401
406_s 402
410_s 406
412_s 408
422_s 418
423_s 419
425_s 421
426_s 422
427_s 423
430_s 426
432_s 428
435_s 431
436_s 432
439_s 435
441_s 437
449_s 445
455_s 451
\ No newline at end of file
#Cluster: 1
3_s 3
17_s 17
33_s 33
42_s 42
43_s 43
59_s 59
66_s 66
82_s 82
84_s 84
100_s 100
123_s 123
124_s 124
132_s 131
136_s 135
137_s 136
148_s 147
188_s 186
196_s 194
216_s 214
219_s 217
226_s 224
227_s 225
229_s 227
230_s 228
234_s 232
238_s 236
244_s 242
245_s 243
269_s 266
275_s 272
281_s 278
287_s 284
290_s 287
301_s 297
302_s 298
303_s 299
315_s 311
332_s 328
350_s 346
361_s 357
366_s 362
379_s 375
415_s 411
421_s 417
444_s 440
445_s 441
446_s 442
453_s 449
#Cluster: 2
1_s 1
5_s 5
6_s 6
7_s 7
8_s 8
10_s 10
12_s 12
13_s 13
14_s 14
16_s 16
18_s 18
20_s 20
22_s 22
23_s 23
24_s 24
25_s 25
28_s 28
29_s 29
31_s 31
34_s 34
37_s 37
38_s 38
44_s 44
47_s 47
48_s 48
49_s 49
54_s 54
55_s 55
58_s 58
64_s 64
65_s 65
67_s 67
69_s 69
70_s 70
71_s 71
72_s 72
74_s 74
75_s 75
79_s 79
80_s 80
81_s 81
83_s 83
86_s 86
87_s 87
88_s 88
89_s 89
90_s 90
91_s 91
92_s 92
93_s 93
94_s 94
98_s 98
101_s 101
102_s 102
104_s 104
106_s 106
109_s 109
110_s 110
112_s 112
115_s 115
117_s 117
118_s 118
120_s 120
127_s 126
128_s 127
130_s 129
138_s 137
141_s 140
142_s 141
144_s 143
146_s 145
147_s 146
149_s 148
150_s 149
156_s 155
159_s 158
161_s 160
162_s 161
163_s 162
164_s 163
165_s 164
166_s 165
169_s 168
171_s 169
174_s 172
177_s 175
179_s 177
182_s 180
184_s 182
186_s 184
189_s 187
190_s 188
192_s 190
195_s 193
197_s 195
198_s 196
199_s 197
203_s 201
204_s 202
205_s 203
208_s 206
210_s 208
214_s 212
215_s 213
217_s 215
220_s 218
221_s 219
222_s 220
223_s 221
225_s 223
233_s 231
236_s 234
237_s 235
239_s 237
240_s 238
241_s 239
242_s 240
243_s 241
246_s 244
247_s 245
248_s 246
250_s 248
251_s 249
253_s 250
254_s 251
255_s 252
256_s 253
260_s 257
262_s 259
264_s 261
265_s 262
268_s 265
272_s 269
273_s 270
278_s 275
279_s 276
280_s 277
284_s 281
286_s 283
289_s 286
291_s 288
293_s 289
295_s 291
297_s 293
298_s 294
299_s 295
300_s 296
305_s 301
307_s 303
308_s 304
310_s 306
313_s 309
314_s 310
316_s 312
317_s 313
318_s 314
320_s 316
322_s 318
325_s 321
326_s 322
328_s 324
330_s 326
333_s 329
335_s 331
336_s 332
340_s 336
343_s 339
345_s 341
347_s 343
348_s 344
349_s 345
352_s 348
353_s 349
354_s 350
355_s 351
356_s 352
357_s 353
360_s 356
362_s 358
363_s 359
364_s 360
365_s 361
367_s 363
368_s 364
371_s 367
372_s 368
374_s 370
375_s 371
376_s 372
377_s 373
378_s 374
380_s 376
383_s 379
384_s 380
387_s 383
388_s 384
390_s 386
391_s 387
392_s 388
398_s 394
399_s 395
401_s 397
402_s 398
403_s 399
407_s 403
408_s 404
409_s 405
411_s 407
413_s 409
414_s 410
416_s 412
418_s 414
419_s 415
420_s 416
424_s 420
428_s 424
429_s 425
431_s 427
433_s 429
437_s 433
438_s 434
440_s 436
442_s 438
443_s 439
447_s 443
448_s 444
450_s 446
451_s 447
452_s 448
#Cluster: 3
9_s 9
19_s 19
21_s 21
26_s 26
32_s 32
39_s 39
52_s 52
73_s 73
77_s 77
95_s 95
107_s 107
113_s 113
125_s 125
129_s 128
131_s 130
145_s 144
153_s 152
158_s 157
172_s 170
175_s 173
180_s 178
200_s 198
202_s 200
207_s 205
224_s 222
228_s 226
231_s 229
259_s 256
263_s 260
274_s 271
283_s 280
288_s 285
306_s 302
321_s 317
323_s 319
334_s 330
359_s 355
382_s 378
395_s 391
417_s 413
434_s 430
454_s 450
#Cluster: 4
2_s 2
4_s 4
11_s 11
15_s 15
27_s 27
30_s 30
35_s 35
36_s 36
40_s 40
41_s 41
45_s 45
46_s 46
50_s 50
51_s 51
53_s 53
56_s 56
57_s 57
60_s 60
61_s 61
62_s 62
63_s 63
68_s 68
76_s 76
78_s 78
85_s 85
96_s 96
97_s 97
99_s 99
103_s 103
105_s 105
108_s 108
111_s 111
114_s 114
116_s 116
119_s 119
121_s 121
122_s 122
133_s 132
134_s 133
135_s 134
139_s 138
140_s 139
143_s 142
151_s 150
152_s 151
154_s 153
155_s 154
157_s 156
160_s 159
167_s 166
168_s 167
173_s 171
176_s 174
178_s 176
181_s 179
183_s 181
185_s 183
187_s 185
191_s 189
193_s 191
194_s 192
201_s 199
206_s 204
209_s 207
211_s 209
212_s 210
213_s 211
218_s 216
232_s 230
235_s 233
249_s 247
257_s 254
258_s 255
261_s 258
266_s 263
267_s 264
270_s 267
271_s 268
276_s 273
277_s 274
282_s 279
285_s 282
294_s 290
296_s 292
304_s 300
309_s 305
311_s 307
312_s 308
319_s 315
324_s 320
327_s 323
329_s 325
331_s 327
337_s 333
338_s 334
339_s 335
341_s 337
342_s 338
344_s 340
346_s 342
351_s 347
358_s 354
369_s 365
370_s 366
373_s 369
381_s 377
385_s 381
386_s 382
389_s 385
393_s 389
394_s 390
396_s 392
397_s 393
400_s 396
404_s 400
405_s 401
406_s 402
410_s 406
412_s 408
422_s 418
423_s 419
425_s 421
426_s 422
427_s 423
430_s 426
432_s 428
435_s 431
436_s 432
439_s 435
441_s 437
449_s 445
455_s 451
\ No newline at end of file
library(methods)
library(cluster)
# Append every cluster in `obj` to the text file `filename`.
#
# obj:      list of clusters (the script passes the value returned by
#           rect.hclust()); each element is written with write.table(),
#           row names included.
# filename: path of the output file. It is opened in append mode, so any
#           content already in the file is kept.
#
# Every cluster is preceded by a "Cluster: <index>" header entry.
# An empty `obj` writes nothing and does not create the file.
print_cluster <- function(obj, filename) {
  # seq_along() yields an empty sequence for an empty list, whereas
  # 1:length(obj) would iterate over c(1, 0) and fail on obj[[1]].
  for (cl in seq_along(obj)) {
    # Header entry for this cluster.
    write.table(
      paste("\nCluster: ", cl, "\n"),
      file = filename, append = TRUE, quote = FALSE,
      row.names = TRUE, col.names = FALSE
    )
    # Members of this cluster, one per line.
    write.table(
      obj[[cl]],
      file = filename, append = TRUE, quote = FALSE,
      row.names = TRUE, col.names = FALSE, sep = " "
    )
  }
}
###############################################################################
# Receive arguments: the first command-line argument is the vector file.
arg <- commandArgs(trailingOnly = TRUE)
if (length(arg) == 0) {
  stop("Must supply input file.", call. = FALSE)
}
################################ Run analysis #################################
# The first column is read as character and used as row names; the next
# 299 columns are declared numeric. Fields are separated by single spaces.
vecs <- read.table(
  arg[1],
  header = FALSE, row.names = 1, sep = " ",
  colClasses = c("character", rep("numeric", 299))
)
# Hierarchical clustering on the distance matrix of the rows of `vecs`.
senclus <- hclust(dist(vecs), method = "ward.D")
print("agglomerative coefficient: ")
print(coef.hclust(senclus))
# Save the image of the original dendrogram.
png("Dendogram_ward.png", height = 608, width = 975)
plot(senclus, hang = -1)
dev.off()
###
# Cut `tree` into `k` clusters, save the annotated dendrogram as
# "Dendogram_<k>clusters.png" and write the cluster membership to
# "SentenceMembership_<k>clusters.txt".
#
# tree: the hclust object to partition.
# k:    number of clusters to cut the tree into.
#
# Returns (invisibly) the list produced by rect.hclust().
partition_dendrogram <- function(tree, k) {
  png(paste0("Dendogram_", k, "clusters.png"), height = 608, width = 975)
  # Close the PNG device even when plotting fails, so it is never leaked.
  clusters <- tryCatch(
    {
      plot(tree, hang = -1)
      rect.hclust(tree, k = k, border = 3:4)
    },
    finally = dev.off()
  )
  membership_file <- paste0("SentenceMembership_", k, "clusters.txt")
  # print_cluster() appends to its target; drop the file left by a previous
  # run so memberships are not duplicated.
  unlink(membership_file)
  print_cluster(clusters, membership_file)
  invisible(clusters)
}

# Partition into two clusters
cls2 <- partition_dendrogram(senclus, 2)
#######
# Partition into three clusters
cls3 <- partition_dendrogram(senclus, 3)
#####
# Partition into four clusters
cls4 <- partition_dendrogram(senclus, 4)
from optparse import OptionParser
def reformat_vectors(infile, outfile):
    """Rewrite a tab-separated vector file as a space-separated one.

    Every non-blank input line must hold an identifier, a tab, and the
    vector values. The output line is the identifier with the suffix
    ``_s``, a single space, and the values field unchanged.

    :param infile: path of the vector file to read.
    :param outfile: path of the file to write (overwritten if it exists).
    :raises ValueError: if a non-blank line contains no tab separator.
    """
    # Both files are closed by the context manager, even on error.
    with open(infile) as vectors, open(outfile, 'w') as newfile:
        for number, line in enumerate(vectors, 1):
            # Isolate the article number from its values.
            elements = line.rstrip().split('\t')
            if elements == ['']:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if len(elements) < 2:
                raise ValueError(
                    "line %d of %s has no tab separator" % (number, infile))
            # A letter is appended to ease later identification.
            index = elements[0] + '_s'
            # Assemble the new line.
            newfile.write(' '.join([index, elements[1]]) + '\n')


def main():
    """Parse the command line and run the conversion."""
    # Receive input and output paths.
    parser = OptionParser()
    parser.add_option("-i", dest="inF",help="Input vector file. Sentence is separated by tabs from values which are sparated by simple space", metavar="PATH")
    parser.add_option("-o", dest="otF",help="output file name", metavar="PATH")
    (options, args) = parser.parse_args()
    # parser.error() prints the message and exits, so no explicit exit
    # call is needed afterwards.
    if not options.inF or not options.otF:
        parser.error("Please indicate an input file (-i) and an output file (-o)")
    if len(args) > 0:
        parser.error("Unexpected positional arguments: " + ' '.join(args))
    reformat_vectors(options.inF, options.otF)


if __name__ == "__main__":
    main()
\ No newline at end of file