Showing
10 changed files
with
565 additions
and
0 deletions
corpora/article-titles-mod.txt
0 → 100644
This diff could not be displayed because it is too large.
results/Dendogram_2clusters.png
0 → 100644

55 KB
results/Dendogram_3clusters.png
0 → 100644

55.2 KB
results/Dendogram_4clusters.png
0 → 100644

55.4 KB
results/Dendogram_ward.png
0 → 100644

27.8 KB
results/SentenceMembership_2clusters.txt
0 → 100644
This diff is collapsed. Click to expand it.
results/SentenceMembership_3clusters.txt
0 → 100644
This diff is collapsed. Click to expand it.
results/SentenceMembership_4clusters.txt
0 → 100644
1 | +1 | ||
2 | +Cluster: 1 | ||
3 | + | ||
4 | +3_s 3 | ||
5 | +17_s 17 | ||
6 | +33_s 33 | ||
7 | +42_s 42 | ||
8 | +43_s 43 | ||
9 | +59_s 59 | ||
10 | +66_s 66 | ||
11 | +82_s 82 | ||
12 | +84_s 84 | ||
13 | +100_s 100 | ||
14 | +123_s 123 | ||
15 | +124_s 124 | ||
16 | +132_s 131 | ||
17 | +136_s 135 | ||
18 | +137_s 136 | ||
19 | +148_s 147 | ||
20 | +188_s 186 | ||
21 | +196_s 194 | ||
22 | +216_s 214 | ||
23 | +219_s 217 | ||
24 | +226_s 224 | ||
25 | +227_s 225 | ||
26 | +229_s 227 | ||
27 | +230_s 228 | ||
28 | +234_s 232 | ||
29 | +238_s 236 | ||
30 | +244_s 242 | ||
31 | +245_s 243 | ||
32 | +269_s 266 | ||
33 | +275_s 272 | ||
34 | +281_s 278 | ||
35 | +287_s 284 | ||
36 | +290_s 287 | ||
37 | +301_s 297 | ||
38 | +302_s 298 | ||
39 | +303_s 299 | ||
40 | +315_s 311 | ||
41 | +332_s 328 | ||
42 | +350_s 346 | ||
43 | +361_s 357 | ||
44 | +366_s 362 | ||
45 | +379_s 375 | ||
46 | +415_s 411 | ||
47 | +421_s 417 | ||
48 | +444_s 440 | ||
49 | +445_s 441 | ||
50 | +446_s 442 | ||
51 | +453_s 449 | ||
52 | +1 | ||
53 | +Cluster: 2 | ||
54 | + | ||
55 | +1_s 1 | ||
56 | +5_s 5 | ||
57 | +6_s 6 | ||
58 | +7_s 7 | ||
59 | +8_s 8 | ||
60 | +10_s 10 | ||
61 | +12_s 12 | ||
62 | +13_s 13 | ||
63 | +14_s 14 | ||
64 | +16_s 16 | ||
65 | +18_s 18 | ||
66 | +20_s 20 | ||
67 | +22_s 22 | ||
68 | +23_s 23 | ||
69 | +24_s 24 | ||
70 | +25_s 25 | ||
71 | +28_s 28 | ||
72 | +29_s 29 | ||
73 | +31_s 31 | ||
74 | +34_s 34 | ||
75 | +37_s 37 | ||
76 | +38_s 38 | ||
77 | +44_s 44 | ||
78 | +47_s 47 | ||
79 | +48_s 48 | ||
80 | +49_s 49 | ||
81 | +54_s 54 | ||
82 | +55_s 55 | ||
83 | +58_s 58 | ||
84 | +64_s 64 | ||
85 | +65_s 65 | ||
86 | +67_s 67 | ||
87 | +69_s 69 | ||
88 | +70_s 70 | ||
89 | +71_s 71 | ||
90 | +72_s 72 | ||
91 | +74_s 74 | ||
92 | +75_s 75 | ||
93 | +79_s 79 | ||
94 | +80_s 80 | ||
95 | +81_s 81 | ||
96 | +83_s 83 | ||
97 | +86_s 86 | ||
98 | +87_s 87 | ||
99 | +88_s 88 | ||
100 | +89_s 89 | ||
101 | +90_s 90 | ||
102 | +91_s 91 | ||
103 | +92_s 92 | ||
104 | +93_s 93 | ||
105 | +94_s 94 | ||
106 | +98_s 98 | ||
107 | +101_s 101 | ||
108 | +102_s 102 | ||
109 | +104_s 104 | ||
110 | +106_s 106 | ||
111 | +109_s 109 | ||
112 | +110_s 110 | ||
113 | +112_s 112 | ||
114 | +115_s 115 | ||
115 | +117_s 117 | ||
116 | +118_s 118 | ||
117 | +120_s 120 | ||
118 | +127_s 126 | ||
119 | +128_s 127 | ||
120 | +130_s 129 | ||
121 | +138_s 137 | ||
122 | +141_s 140 | ||
123 | +142_s 141 | ||
124 | +144_s 143 | ||
125 | +146_s 145 | ||
126 | +147_s 146 | ||
127 | +149_s 148 | ||
128 | +150_s 149 | ||
129 | +156_s 155 | ||
130 | +159_s 158 | ||
131 | +161_s 160 | ||
132 | +162_s 161 | ||
133 | +163_s 162 | ||
134 | +164_s 163 | ||
135 | +165_s 164 | ||
136 | +166_s 165 | ||
137 | +169_s 168 | ||
138 | +171_s 169 | ||
139 | +174_s 172 | ||
140 | +177_s 175 | ||
141 | +179_s 177 | ||
142 | +182_s 180 | ||
143 | +184_s 182 | ||
144 | +186_s 184 | ||
145 | +189_s 187 | ||
146 | +190_s 188 | ||
147 | +192_s 190 | ||
148 | +195_s 193 | ||
149 | +197_s 195 | ||
150 | +198_s 196 | ||
151 | +199_s 197 | ||
152 | +203_s 201 | ||
153 | +204_s 202 | ||
154 | +205_s 203 | ||
155 | +208_s 206 | ||
156 | +210_s 208 | ||
157 | +214_s 212 | ||
158 | +215_s 213 | ||
159 | +217_s 215 | ||
160 | +220_s 218 | ||
161 | +221_s 219 | ||
162 | +222_s 220 | ||
163 | +223_s 221 | ||
164 | +225_s 223 | ||
165 | +233_s 231 | ||
166 | +236_s 234 | ||
167 | +237_s 235 | ||
168 | +239_s 237 | ||
169 | +240_s 238 | ||
170 | +241_s 239 | ||
171 | +242_s 240 | ||
172 | +243_s 241 | ||
173 | +246_s 244 | ||
174 | +247_s 245 | ||
175 | +248_s 246 | ||
176 | +250_s 248 | ||
177 | +251_s 249 | ||
178 | +253_s 250 | ||
179 | +254_s 251 | ||
180 | +255_s 252 | ||
181 | +256_s 253 | ||
182 | +260_s 257 | ||
183 | +262_s 259 | ||
184 | +264_s 261 | ||
185 | +265_s 262 | ||
186 | +268_s 265 | ||
187 | +272_s 269 | ||
188 | +273_s 270 | ||
189 | +278_s 275 | ||
190 | +279_s 276 | ||
191 | +280_s 277 | ||
192 | +284_s 281 | ||
193 | +286_s 283 | ||
194 | +289_s 286 | ||
195 | +291_s 288 | ||
196 | +293_s 289 | ||
197 | +295_s 291 | ||
198 | +297_s 293 | ||
199 | +298_s 294 | ||
200 | +299_s 295 | ||
201 | +300_s 296 | ||
202 | +305_s 301 | ||
203 | +307_s 303 | ||
204 | +308_s 304 | ||
205 | +310_s 306 | ||
206 | +313_s 309 | ||
207 | +314_s 310 | ||
208 | +316_s 312 | ||
209 | +317_s 313 | ||
210 | +318_s 314 | ||
211 | +320_s 316 | ||
212 | +322_s 318 | ||
213 | +325_s 321 | ||
214 | +326_s 322 | ||
215 | +328_s 324 | ||
216 | +330_s 326 | ||
217 | +333_s 329 | ||
218 | +335_s 331 | ||
219 | +336_s 332 | ||
220 | +340_s 336 | ||
221 | +343_s 339 | ||
222 | +345_s 341 | ||
223 | +347_s 343 | ||
224 | +348_s 344 | ||
225 | +349_s 345 | ||
226 | +352_s 348 | ||
227 | +353_s 349 | ||
228 | +354_s 350 | ||
229 | +355_s 351 | ||
230 | +356_s 352 | ||
231 | +357_s 353 | ||
232 | +360_s 356 | ||
233 | +362_s 358 | ||
234 | +363_s 359 | ||
235 | +364_s 360 | ||
236 | +365_s 361 | ||
237 | +367_s 363 | ||
238 | +368_s 364 | ||
239 | +371_s 367 | ||
240 | +372_s 368 | ||
241 | +374_s 370 | ||
242 | +375_s 371 | ||
243 | +376_s 372 | ||
244 | +377_s 373 | ||
245 | +378_s 374 | ||
246 | +380_s 376 | ||
247 | +383_s 379 | ||
248 | +384_s 380 | ||
249 | +387_s 383 | ||
250 | +388_s 384 | ||
251 | +390_s 386 | ||
252 | +391_s 387 | ||
253 | +392_s 388 | ||
254 | +398_s 394 | ||
255 | +399_s 395 | ||
256 | +401_s 397 | ||
257 | +402_s 398 | ||
258 | +403_s 399 | ||
259 | +407_s 403 | ||
260 | +408_s 404 | ||
261 | +409_s 405 | ||
262 | +411_s 407 | ||
263 | +413_s 409 | ||
264 | +414_s 410 | ||
265 | +416_s 412 | ||
266 | +418_s 414 | ||
267 | +419_s 415 | ||
268 | +420_s 416 | ||
269 | +424_s 420 | ||
270 | +428_s 424 | ||
271 | +429_s 425 | ||
272 | +431_s 427 | ||
273 | +433_s 429 | ||
274 | +437_s 433 | ||
275 | +438_s 434 | ||
276 | +440_s 436 | ||
277 | +442_s 438 | ||
278 | +443_s 439 | ||
279 | +447_s 443 | ||
280 | +448_s 444 | ||
281 | +450_s 446 | ||
282 | +451_s 447 | ||
283 | +452_s 448 | ||
284 | +1 | ||
285 | +Cluster: 3 | ||
286 | + | ||
287 | +9_s 9 | ||
288 | +19_s 19 | ||
289 | +21_s 21 | ||
290 | +26_s 26 | ||
291 | +32_s 32 | ||
292 | +39_s 39 | ||
293 | +52_s 52 | ||
294 | +73_s 73 | ||
295 | +77_s 77 | ||
296 | +95_s 95 | ||
297 | +107_s 107 | ||
298 | +113_s 113 | ||
299 | +125_s 125 | ||
300 | +129_s 128 | ||
301 | +131_s 130 | ||
302 | +145_s 144 | ||
303 | +153_s 152 | ||
304 | +158_s 157 | ||
305 | +172_s 170 | ||
306 | +175_s 173 | ||
307 | +180_s 178 | ||
308 | +200_s 198 | ||
309 | +202_s 200 | ||
310 | +207_s 205 | ||
311 | +224_s 222 | ||
312 | +228_s 226 | ||
313 | +231_s 229 | ||
314 | +259_s 256 | ||
315 | +263_s 260 | ||
316 | +274_s 271 | ||
317 | +283_s 280 | ||
318 | +288_s 285 | ||
319 | +306_s 302 | ||
320 | +321_s 317 | ||
321 | +323_s 319 | ||
322 | +334_s 330 | ||
323 | +359_s 355 | ||
324 | +382_s 378 | ||
325 | +395_s 391 | ||
326 | +417_s 413 | ||
327 | +434_s 430 | ||
328 | +454_s 450 | ||
329 | +1 | ||
330 | +Cluster: 4 | ||
331 | + | ||
332 | +2_s 2 | ||
333 | +4_s 4 | ||
334 | +11_s 11 | ||
335 | +15_s 15 | ||
336 | +27_s 27 | ||
337 | +30_s 30 | ||
338 | +35_s 35 | ||
339 | +36_s 36 | ||
340 | +40_s 40 | ||
341 | +41_s 41 | ||
342 | +45_s 45 | ||
343 | +46_s 46 | ||
344 | +50_s 50 | ||
345 | +51_s 51 | ||
346 | +53_s 53 | ||
347 | +56_s 56 | ||
348 | +57_s 57 | ||
349 | +60_s 60 | ||
350 | +61_s 61 | ||
351 | +62_s 62 | ||
352 | +63_s 63 | ||
353 | +68_s 68 | ||
354 | +76_s 76 | ||
355 | +78_s 78 | ||
356 | +85_s 85 | ||
357 | +96_s 96 | ||
358 | +97_s 97 | ||
359 | +99_s 99 | ||
360 | +103_s 103 | ||
361 | +105_s 105 | ||
362 | +108_s 108 | ||
363 | +111_s 111 | ||
364 | +114_s 114 | ||
365 | +116_s 116 | ||
366 | +119_s 119 | ||
367 | +121_s 121 | ||
368 | +122_s 122 | ||
369 | +133_s 132 | ||
370 | +134_s 133 | ||
371 | +135_s 134 | ||
372 | +139_s 138 | ||
373 | +140_s 139 | ||
374 | +143_s 142 | ||
375 | +151_s 150 | ||
376 | +152_s 151 | ||
377 | +154_s 153 | ||
378 | +155_s 154 | ||
379 | +157_s 156 | ||
380 | +160_s 159 | ||
381 | +167_s 166 | ||
382 | +168_s 167 | ||
383 | +173_s 171 | ||
384 | +176_s 174 | ||
385 | +178_s 176 | ||
386 | +181_s 179 | ||
387 | +183_s 181 | ||
388 | +185_s 183 | ||
389 | +187_s 185 | ||
390 | +191_s 189 | ||
391 | +193_s 191 | ||
392 | +194_s 192 | ||
393 | +201_s 199 | ||
394 | +206_s 204 | ||
395 | +209_s 207 | ||
396 | +211_s 209 | ||
397 | +212_s 210 | ||
398 | +213_s 211 | ||
399 | +218_s 216 | ||
400 | +232_s 230 | ||
401 | +235_s 233 | ||
402 | +249_s 247 | ||
403 | +257_s 254 | ||
404 | +258_s 255 | ||
405 | +261_s 258 | ||
406 | +266_s 263 | ||
407 | +267_s 264 | ||
408 | +270_s 267 | ||
409 | +271_s 268 | ||
410 | +276_s 273 | ||
411 | +277_s 274 | ||
412 | +282_s 279 | ||
413 | +285_s 282 | ||
414 | +294_s 290 | ||
415 | +296_s 292 | ||
416 | +304_s 300 | ||
417 | +309_s 305 | ||
418 | +311_s 307 | ||
419 | +312_s 308 | ||
420 | +319_s 315 | ||
421 | +324_s 320 | ||
422 | +327_s 323 | ||
423 | +329_s 325 | ||
424 | +331_s 327 | ||
425 | +337_s 333 | ||
426 | +338_s 334 | ||
427 | +339_s 335 | ||
428 | +341_s 337 | ||
429 | +342_s 338 | ||
430 | +344_s 340 | ||
431 | +346_s 342 | ||
432 | +351_s 347 | ||
433 | +358_s 354 | ||
434 | +369_s 365 | ||
435 | +370_s 366 | ||
436 | +373_s 369 | ||
437 | +381_s 377 | ||
438 | +385_s 381 | ||
439 | +386_s 382 | ||
440 | +389_s 385 | ||
441 | +393_s 389 | ||
442 | +394_s 390 | ||
443 | +396_s 392 | ||
444 | +397_s 393 | ||
445 | +400_s 396 | ||
446 | +404_s 400 | ||
447 | +405_s 401 | ||
448 | +406_s 402 | ||
449 | +410_s 406 | ||
450 | +412_s 408 | ||
451 | +422_s 418 | ||
452 | +423_s 419 | ||
453 | +425_s 421 | ||
454 | +426_s 422 | ||
455 | +427_s 423 | ||
456 | +430_s 426 | ||
457 | +432_s 428 | ||
458 | +435_s 431 | ||
459 | +436_s 432 | ||
460 | +439_s 435 | ||
461 | +441_s 437 | ||
462 | +449_s 445 | ||
463 | +455_s 451 |
scripts/Clustering_Analysis.R
0 → 100644
1 | +library(methods) | ||
2 | +library(cluster) | ||
3 | + | ||
# Append the members of every cluster in `obj` to a text file.
#
# Args:
#   obj:      list with one element per cluster; each element is handed to
#             write.table(), so its names end up as the first column.
#   filename: path of the file to append to.
#
# For each cluster a "Cluster: <index>" header is written, followed by the
# members of that cluster. Output is always appended, never truncated.
print_cluster <- function(obj, filename) {

  # seq_along() yields an empty sequence for an empty list, whereas
  # 1:length(obj) would iterate over 1, 0 and fail on obj[[1]].
  for (cl in seq_along(obj)) {

    # Cluster header (row.names = TRUE is kept so the file layout is unchanged)
    write.table(paste("\nCluster: ", cl, "\n"), file = filename, append = TRUE,
                quote = FALSE, row.names = TRUE, col.names = FALSE)

    # Cluster members, one "<name> <value>" row per element
    write.table(obj[[cl]], file = filename, append = TRUE,
                quote = FALSE, row.names = TRUE, col.names = FALSE, sep = " ")
  }
}
###################################################################################################################################################
# Receive arguments: the first trailing command-line argument is the input
# vector file.
arg <- commandArgs(trailingOnly = TRUE)

if (length(arg) == 0) {
  stop("Must supply input file.", call. = FALSE)
}

################################################# Run analysis ##################################################
# The first column is read as the row label; the remaining 299 columns are
# read as numeric values.
vecs <- read.table(arg[1],
                   header = FALSE, row.names = 1, sep = ' ',
                   colClasses = c("character", rep("numeric", 299)))

# Hierarchical clustering (Ward's method) on the distance matrix of the rows
senclus <- hclust(dist(vecs), method = 'ward.D')
print("agglomerative coefficient: ")
print(coef.hclust(senclus))

# Save the image of the original dendrogram
png("Dendogram_ward.png", height = 608, width = 975)
plot(senclus, hang = -1)
dev.off()

# Partition the tree into 2, 3 and 4 clusters. For each partition, save the
# dendrogram with the clusters outlined and write the cluster membership file.
for (k in 2:4) {
  png(sprintf("Dendogram_%dclusters.png", k), height = 608, width = 975)
  plot(senclus, hang = -1)
  # rect.hclust() draws the rectangles and returns the members of each cluster
  members <- rect.hclust(senclus, k = k, border = 3:4)
  dev.off()

  # Write the membership file
  print_cluster(members, sprintf("SentenceMembership_%dclusters.txt", k))
}
65 | + | ||
66 | + | ||
67 | + | ||
68 | + |
scripts/change_tabs.py
0 → 100644
"""Rewrite a vector file so the id/values separator is a space, not a tab.

Each input line is expected to hold an identifier, a tab, and then the
values (themselves separated by single spaces).  The output line is the
identifier with ``_s`` appended, a single space, and the values.
"""
from optparse import OptionParser


def main(argv=None):
    """Convert the file given with ``-i`` and write the result to ``-o``.

    Args:
        argv: command-line arguments without the program name; ``None``
            makes optparse fall back to ``sys.argv[1:]``.

    Raises:
        SystemExit: via ``parser.error`` when positional arguments are
            given or when ``-i`` / ``-o`` is missing.
        ValueError: when a non-blank input line has no tab-separated
            values field.
    """
    # Receive input and output paths
    parser = OptionParser()
    parser.add_option("-i", dest="inF",help="Input vector file. Sentence is separated by tabs from values which are sparated by simple space", metavar="PATH")
    parser.add_option("-o", dest="otF",help="output file name", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    # parser.error() prints the message and exits, so no explicit exit call
    # is needed after it.
    if args:
        parser.error("Unexpected positional arguments; use -i and -o")
    if not options.inF or not options.otF:
        parser.error("Both -i (input file) and -o (output file) are required")

    # Open the input first so a missing input file does not leave behind an
    # empty (or truncated) output file; both handles are closed on any error.
    with open(options.inF) as vectors, open(options.otF, 'w') as newfile:
        for lineno, line in enumerate(vectors, 1):
            stripped = line.rstrip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) carry no vector
                continue

            # Isolate the article number from its values
            elements = stripped.split('\t')
            if len(elements) < 2:
                raise ValueError(
                    "line %d of %s has no tab-separated values field"
                    % (lineno, options.inF)
                )

            # A letter is appended to ease later identification
            index = elements[0] + '_s'

            # Assemble and write the new line
            newfile.write(' '.join([index, elements[1]]) + '\n')


if __name__ == "__main__":
    main()
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment