Showing
10 changed files
with
565 additions
and
0 deletions
corpora/article-titles-mod.txt
0 → 100644
This diff could not be displayed because it is too large.
results/Dendogram_2clusters.png
0 → 100644
55 KB
results/Dendogram_3clusters.png
0 → 100644
55.2 KB
results/Dendogram_4clusters.png
0 → 100644
55.4 KB
results/Dendogram_ward.png
0 → 100644
27.8 KB
results/SentenceMembership_2clusters.txt
0 → 100644
This diff is collapsed. Click to expand it.
results/SentenceMembership_3clusters.txt
0 → 100644
This diff is collapsed. Click to expand it.
results/SentenceMembership_4clusters.txt
0 → 100644
| 1 | +1 | ||
| 2 | +Cluster: 1 | ||
| 3 | + | ||
| 4 | +3_s 3 | ||
| 5 | +17_s 17 | ||
| 6 | +33_s 33 | ||
| 7 | +42_s 42 | ||
| 8 | +43_s 43 | ||
| 9 | +59_s 59 | ||
| 10 | +66_s 66 | ||
| 11 | +82_s 82 | ||
| 12 | +84_s 84 | ||
| 13 | +100_s 100 | ||
| 14 | +123_s 123 | ||
| 15 | +124_s 124 | ||
| 16 | +132_s 131 | ||
| 17 | +136_s 135 | ||
| 18 | +137_s 136 | ||
| 19 | +148_s 147 | ||
| 20 | +188_s 186 | ||
| 21 | +196_s 194 | ||
| 22 | +216_s 214 | ||
| 23 | +219_s 217 | ||
| 24 | +226_s 224 | ||
| 25 | +227_s 225 | ||
| 26 | +229_s 227 | ||
| 27 | +230_s 228 | ||
| 28 | +234_s 232 | ||
| 29 | +238_s 236 | ||
| 30 | +244_s 242 | ||
| 31 | +245_s 243 | ||
| 32 | +269_s 266 | ||
| 33 | +275_s 272 | ||
| 34 | +281_s 278 | ||
| 35 | +287_s 284 | ||
| 36 | +290_s 287 | ||
| 37 | +301_s 297 | ||
| 38 | +302_s 298 | ||
| 39 | +303_s 299 | ||
| 40 | +315_s 311 | ||
| 41 | +332_s 328 | ||
| 42 | +350_s 346 | ||
| 43 | +361_s 357 | ||
| 44 | +366_s 362 | ||
| 45 | +379_s 375 | ||
| 46 | +415_s 411 | ||
| 47 | +421_s 417 | ||
| 48 | +444_s 440 | ||
| 49 | +445_s 441 | ||
| 50 | +446_s 442 | ||
| 51 | +453_s 449 | ||
| 52 | +1 | ||
| 53 | +Cluster: 2 | ||
| 54 | + | ||
| 55 | +1_s 1 | ||
| 56 | +5_s 5 | ||
| 57 | +6_s 6 | ||
| 58 | +7_s 7 | ||
| 59 | +8_s 8 | ||
| 60 | +10_s 10 | ||
| 61 | +12_s 12 | ||
| 62 | +13_s 13 | ||
| 63 | +14_s 14 | ||
| 64 | +16_s 16 | ||
| 65 | +18_s 18 | ||
| 66 | +20_s 20 | ||
| 67 | +22_s 22 | ||
| 68 | +23_s 23 | ||
| 69 | +24_s 24 | ||
| 70 | +25_s 25 | ||
| 71 | +28_s 28 | ||
| 72 | +29_s 29 | ||
| 73 | +31_s 31 | ||
| 74 | +34_s 34 | ||
| 75 | +37_s 37 | ||
| 76 | +38_s 38 | ||
| 77 | +44_s 44 | ||
| 78 | +47_s 47 | ||
| 79 | +48_s 48 | ||
| 80 | +49_s 49 | ||
| 81 | +54_s 54 | ||
| 82 | +55_s 55 | ||
| 83 | +58_s 58 | ||
| 84 | +64_s 64 | ||
| 85 | +65_s 65 | ||
| 86 | +67_s 67 | ||
| 87 | +69_s 69 | ||
| 88 | +70_s 70 | ||
| 89 | +71_s 71 | ||
| 90 | +72_s 72 | ||
| 91 | +74_s 74 | ||
| 92 | +75_s 75 | ||
| 93 | +79_s 79 | ||
| 94 | +80_s 80 | ||
| 95 | +81_s 81 | ||
| 96 | +83_s 83 | ||
| 97 | +86_s 86 | ||
| 98 | +87_s 87 | ||
| 99 | +88_s 88 | ||
| 100 | +89_s 89 | ||
| 101 | +90_s 90 | ||
| 102 | +91_s 91 | ||
| 103 | +92_s 92 | ||
| 104 | +93_s 93 | ||
| 105 | +94_s 94 | ||
| 106 | +98_s 98 | ||
| 107 | +101_s 101 | ||
| 108 | +102_s 102 | ||
| 109 | +104_s 104 | ||
| 110 | +106_s 106 | ||
| 111 | +109_s 109 | ||
| 112 | +110_s 110 | ||
| 113 | +112_s 112 | ||
| 114 | +115_s 115 | ||
| 115 | +117_s 117 | ||
| 116 | +118_s 118 | ||
| 117 | +120_s 120 | ||
| 118 | +127_s 126 | ||
| 119 | +128_s 127 | ||
| 120 | +130_s 129 | ||
| 121 | +138_s 137 | ||
| 122 | +141_s 140 | ||
| 123 | +142_s 141 | ||
| 124 | +144_s 143 | ||
| 125 | +146_s 145 | ||
| 126 | +147_s 146 | ||
| 127 | +149_s 148 | ||
| 128 | +150_s 149 | ||
| 129 | +156_s 155 | ||
| 130 | +159_s 158 | ||
| 131 | +161_s 160 | ||
| 132 | +162_s 161 | ||
| 133 | +163_s 162 | ||
| 134 | +164_s 163 | ||
| 135 | +165_s 164 | ||
| 136 | +166_s 165 | ||
| 137 | +169_s 168 | ||
| 138 | +171_s 169 | ||
| 139 | +174_s 172 | ||
| 140 | +177_s 175 | ||
| 141 | +179_s 177 | ||
| 142 | +182_s 180 | ||
| 143 | +184_s 182 | ||
| 144 | +186_s 184 | ||
| 145 | +189_s 187 | ||
| 146 | +190_s 188 | ||
| 147 | +192_s 190 | ||
| 148 | +195_s 193 | ||
| 149 | +197_s 195 | ||
| 150 | +198_s 196 | ||
| 151 | +199_s 197 | ||
| 152 | +203_s 201 | ||
| 153 | +204_s 202 | ||
| 154 | +205_s 203 | ||
| 155 | +208_s 206 | ||
| 156 | +210_s 208 | ||
| 157 | +214_s 212 | ||
| 158 | +215_s 213 | ||
| 159 | +217_s 215 | ||
| 160 | +220_s 218 | ||
| 161 | +221_s 219 | ||
| 162 | +222_s 220 | ||
| 163 | +223_s 221 | ||
| 164 | +225_s 223 | ||
| 165 | +233_s 231 | ||
| 166 | +236_s 234 | ||
| 167 | +237_s 235 | ||
| 168 | +239_s 237 | ||
| 169 | +240_s 238 | ||
| 170 | +241_s 239 | ||
| 171 | +242_s 240 | ||
| 172 | +243_s 241 | ||
| 173 | +246_s 244 | ||
| 174 | +247_s 245 | ||
| 175 | +248_s 246 | ||
| 176 | +250_s 248 | ||
| 177 | +251_s 249 | ||
| 178 | +253_s 250 | ||
| 179 | +254_s 251 | ||
| 180 | +255_s 252 | ||
| 181 | +256_s 253 | ||
| 182 | +260_s 257 | ||
| 183 | +262_s 259 | ||
| 184 | +264_s 261 | ||
| 185 | +265_s 262 | ||
| 186 | +268_s 265 | ||
| 187 | +272_s 269 | ||
| 188 | +273_s 270 | ||
| 189 | +278_s 275 | ||
| 190 | +279_s 276 | ||
| 191 | +280_s 277 | ||
| 192 | +284_s 281 | ||
| 193 | +286_s 283 | ||
| 194 | +289_s 286 | ||
| 195 | +291_s 288 | ||
| 196 | +293_s 289 | ||
| 197 | +295_s 291 | ||
| 198 | +297_s 293 | ||
| 199 | +298_s 294 | ||
| 200 | +299_s 295 | ||
| 201 | +300_s 296 | ||
| 202 | +305_s 301 | ||
| 203 | +307_s 303 | ||
| 204 | +308_s 304 | ||
| 205 | +310_s 306 | ||
| 206 | +313_s 309 | ||
| 207 | +314_s 310 | ||
| 208 | +316_s 312 | ||
| 209 | +317_s 313 | ||
| 210 | +318_s 314 | ||
| 211 | +320_s 316 | ||
| 212 | +322_s 318 | ||
| 213 | +325_s 321 | ||
| 214 | +326_s 322 | ||
| 215 | +328_s 324 | ||
| 216 | +330_s 326 | ||
| 217 | +333_s 329 | ||
| 218 | +335_s 331 | ||
| 219 | +336_s 332 | ||
| 220 | +340_s 336 | ||
| 221 | +343_s 339 | ||
| 222 | +345_s 341 | ||
| 223 | +347_s 343 | ||
| 224 | +348_s 344 | ||
| 225 | +349_s 345 | ||
| 226 | +352_s 348 | ||
| 227 | +353_s 349 | ||
| 228 | +354_s 350 | ||
| 229 | +355_s 351 | ||
| 230 | +356_s 352 | ||
| 231 | +357_s 353 | ||
| 232 | +360_s 356 | ||
| 233 | +362_s 358 | ||
| 234 | +363_s 359 | ||
| 235 | +364_s 360 | ||
| 236 | +365_s 361 | ||
| 237 | +367_s 363 | ||
| 238 | +368_s 364 | ||
| 239 | +371_s 367 | ||
| 240 | +372_s 368 | ||
| 241 | +374_s 370 | ||
| 242 | +375_s 371 | ||
| 243 | +376_s 372 | ||
| 244 | +377_s 373 | ||
| 245 | +378_s 374 | ||
| 246 | +380_s 376 | ||
| 247 | +383_s 379 | ||
| 248 | +384_s 380 | ||
| 249 | +387_s 383 | ||
| 250 | +388_s 384 | ||
| 251 | +390_s 386 | ||
| 252 | +391_s 387 | ||
| 253 | +392_s 388 | ||
| 254 | +398_s 394 | ||
| 255 | +399_s 395 | ||
| 256 | +401_s 397 | ||
| 257 | +402_s 398 | ||
| 258 | +403_s 399 | ||
| 259 | +407_s 403 | ||
| 260 | +408_s 404 | ||
| 261 | +409_s 405 | ||
| 262 | +411_s 407 | ||
| 263 | +413_s 409 | ||
| 264 | +414_s 410 | ||
| 265 | +416_s 412 | ||
| 266 | +418_s 414 | ||
| 267 | +419_s 415 | ||
| 268 | +420_s 416 | ||
| 269 | +424_s 420 | ||
| 270 | +428_s 424 | ||
| 271 | +429_s 425 | ||
| 272 | +431_s 427 | ||
| 273 | +433_s 429 | ||
| 274 | +437_s 433 | ||
| 275 | +438_s 434 | ||
| 276 | +440_s 436 | ||
| 277 | +442_s 438 | ||
| 278 | +443_s 439 | ||
| 279 | +447_s 443 | ||
| 280 | +448_s 444 | ||
| 281 | +450_s 446 | ||
| 282 | +451_s 447 | ||
| 283 | +452_s 448 | ||
| 284 | +1 | ||
| 285 | +Cluster: 3 | ||
| 286 | + | ||
| 287 | +9_s 9 | ||
| 288 | +19_s 19 | ||
| 289 | +21_s 21 | ||
| 290 | +26_s 26 | ||
| 291 | +32_s 32 | ||
| 292 | +39_s 39 | ||
| 293 | +52_s 52 | ||
| 294 | +73_s 73 | ||
| 295 | +77_s 77 | ||
| 296 | +95_s 95 | ||
| 297 | +107_s 107 | ||
| 298 | +113_s 113 | ||
| 299 | +125_s 125 | ||
| 300 | +129_s 128 | ||
| 301 | +131_s 130 | ||
| 302 | +145_s 144 | ||
| 303 | +153_s 152 | ||
| 304 | +158_s 157 | ||
| 305 | +172_s 170 | ||
| 306 | +175_s 173 | ||
| 307 | +180_s 178 | ||
| 308 | +200_s 198 | ||
| 309 | +202_s 200 | ||
| 310 | +207_s 205 | ||
| 311 | +224_s 222 | ||
| 312 | +228_s 226 | ||
| 313 | +231_s 229 | ||
| 314 | +259_s 256 | ||
| 315 | +263_s 260 | ||
| 316 | +274_s 271 | ||
| 317 | +283_s 280 | ||
| 318 | +288_s 285 | ||
| 319 | +306_s 302 | ||
| 320 | +321_s 317 | ||
| 321 | +323_s 319 | ||
| 322 | +334_s 330 | ||
| 323 | +359_s 355 | ||
| 324 | +382_s 378 | ||
| 325 | +395_s 391 | ||
| 326 | +417_s 413 | ||
| 327 | +434_s 430 | ||
| 328 | +454_s 450 | ||
| 329 | +1 | ||
| 330 | +Cluster: 4 | ||
| 331 | + | ||
| 332 | +2_s 2 | ||
| 333 | +4_s 4 | ||
| 334 | +11_s 11 | ||
| 335 | +15_s 15 | ||
| 336 | +27_s 27 | ||
| 337 | +30_s 30 | ||
| 338 | +35_s 35 | ||
| 339 | +36_s 36 | ||
| 340 | +40_s 40 | ||
| 341 | +41_s 41 | ||
| 342 | +45_s 45 | ||
| 343 | +46_s 46 | ||
| 344 | +50_s 50 | ||
| 345 | +51_s 51 | ||
| 346 | +53_s 53 | ||
| 347 | +56_s 56 | ||
| 348 | +57_s 57 | ||
| 349 | +60_s 60 | ||
| 350 | +61_s 61 | ||
| 351 | +62_s 62 | ||
| 352 | +63_s 63 | ||
| 353 | +68_s 68 | ||
| 354 | +76_s 76 | ||
| 355 | +78_s 78 | ||
| 356 | +85_s 85 | ||
| 357 | +96_s 96 | ||
| 358 | +97_s 97 | ||
| 359 | +99_s 99 | ||
| 360 | +103_s 103 | ||
| 361 | +105_s 105 | ||
| 362 | +108_s 108 | ||
| 363 | +111_s 111 | ||
| 364 | +114_s 114 | ||
| 365 | +116_s 116 | ||
| 366 | +119_s 119 | ||
| 367 | +121_s 121 | ||
| 368 | +122_s 122 | ||
| 369 | +133_s 132 | ||
| 370 | +134_s 133 | ||
| 371 | +135_s 134 | ||
| 372 | +139_s 138 | ||
| 373 | +140_s 139 | ||
| 374 | +143_s 142 | ||
| 375 | +151_s 150 | ||
| 376 | +152_s 151 | ||
| 377 | +154_s 153 | ||
| 378 | +155_s 154 | ||
| 379 | +157_s 156 | ||
| 380 | +160_s 159 | ||
| 381 | +167_s 166 | ||
| 382 | +168_s 167 | ||
| 383 | +173_s 171 | ||
| 384 | +176_s 174 | ||
| 385 | +178_s 176 | ||
| 386 | +181_s 179 | ||
| 387 | +183_s 181 | ||
| 388 | +185_s 183 | ||
| 389 | +187_s 185 | ||
| 390 | +191_s 189 | ||
| 391 | +193_s 191 | ||
| 392 | +194_s 192 | ||
| 393 | +201_s 199 | ||
| 394 | +206_s 204 | ||
| 395 | +209_s 207 | ||
| 396 | +211_s 209 | ||
| 397 | +212_s 210 | ||
| 398 | +213_s 211 | ||
| 399 | +218_s 216 | ||
| 400 | +232_s 230 | ||
| 401 | +235_s 233 | ||
| 402 | +249_s 247 | ||
| 403 | +257_s 254 | ||
| 404 | +258_s 255 | ||
| 405 | +261_s 258 | ||
| 406 | +266_s 263 | ||
| 407 | +267_s 264 | ||
| 408 | +270_s 267 | ||
| 409 | +271_s 268 | ||
| 410 | +276_s 273 | ||
| 411 | +277_s 274 | ||
| 412 | +282_s 279 | ||
| 413 | +285_s 282 | ||
| 414 | +294_s 290 | ||
| 415 | +296_s 292 | ||
| 416 | +304_s 300 | ||
| 417 | +309_s 305 | ||
| 418 | +311_s 307 | ||
| 419 | +312_s 308 | ||
| 420 | +319_s 315 | ||
| 421 | +324_s 320 | ||
| 422 | +327_s 323 | ||
| 423 | +329_s 325 | ||
| 424 | +331_s 327 | ||
| 425 | +337_s 333 | ||
| 426 | +338_s 334 | ||
| 427 | +339_s 335 | ||
| 428 | +341_s 337 | ||
| 429 | +342_s 338 | ||
| 430 | +344_s 340 | ||
| 431 | +346_s 342 | ||
| 432 | +351_s 347 | ||
| 433 | +358_s 354 | ||
| 434 | +369_s 365 | ||
| 435 | +370_s 366 | ||
| 436 | +373_s 369 | ||
| 437 | +381_s 377 | ||
| 438 | +385_s 381 | ||
| 439 | +386_s 382 | ||
| 440 | +389_s 385 | ||
| 441 | +393_s 389 | ||
| 442 | +394_s 390 | ||
| 443 | +396_s 392 | ||
| 444 | +397_s 393 | ||
| 445 | +400_s 396 | ||
| 446 | +404_s 400 | ||
| 447 | +405_s 401 | ||
| 448 | +406_s 402 | ||
| 449 | +410_s 406 | ||
| 450 | +412_s 408 | ||
| 451 | +422_s 418 | ||
| 452 | +423_s 419 | ||
| 453 | +425_s 421 | ||
| 454 | +426_s 422 | ||
| 455 | +427_s 423 | ||
| 456 | +430_s 426 | ||
| 457 | +432_s 428 | ||
| 458 | +435_s 431 | ||
| 459 | +436_s 432 | ||
| 460 | +439_s 435 | ||
| 461 | +441_s 437 | ||
| 462 | +449_s 445 | ||
| 463 | +455_s 451 | 
scripts/Clustering_Analysis.R
0 → 100644
| 1 | +library(methods) | ||
| 2 | +library(cluster) | ||
| 3 | + | ||
# Write the members of every cluster in `obj` to a plain-text file.
#
# Args:
#   obj:      list with one element per cluster (this script passes the value
#             returned by rect.hclust()). Each element is written with
#             write.table(), using its names as row labels.
#   filename: path of the output file. Output is APPENDED, so an existing
#             file is extended rather than overwritten.
#
# Returns: NULL, invisibly (the value of the for loop); called for its side
#          effect on `filename`.
#
# NOTE(review): the header is written with row.names = TRUE, so every
# "Cluster: k" header is preceded by a line holding the row label "1". This is
# kept on purpose so the output stays identical to previously generated
# membership files.
print_cluster <- function(obj, filename) {
  # seq_along() yields an empty sequence for an empty list, whereas
  # 1:length(obj) would iterate over c(1, 0) and fail on obj[[1]].
  for (cl in seq_along(obj)) {
    # Cluster header.
    write.table(paste("\nCluster: ", cl, "\n"), file = filename, append = TRUE,
                quote = FALSE, row.names = TRUE, col.names = FALSE)
    # Cluster members, one "<name> <value>" pair per line.
    write.table(obj[[cl]], file = filename, append = TRUE, quote = FALSE,
                row.names = TRUE, col.names = FALSE, sep = " ")
  }
}
| 14 | +################################################################################################################################################### | ||
# Receive arguments: the first trailing command-line argument is the path of
# the space-separated vector file (row label first, numeric values after it).
arg <- commandArgs(trailingOnly = TRUE)

if (length(arg) == 0) {
  stop("Must supply input file.\n", call. = FALSE)
}

################################################# Run analysis ##################################################
# Derive the number of columns from the file instead of hard-coding 299
# numeric columns, so vectors of any dimensionality are read as numeric.
n_fields <- max(count.fields(arg[1], sep = ' '), na.rm = TRUE)
vecs <- read.table(arg[1],
                   header = FALSE, row.names = 1, sep = ' ',
                   colClasses = c("character", rep("numeric", n_fields - 1)))

# Agglomerative hierarchical clustering (Ward linkage) on the distances
# between the row vectors.
senclus <- hclust(dist(vecs), method = 'ward.D')
print("agglomerative coefficient: ")
print(coef.hclust(senclus))

# Save the image of the original dendrogram.
png("Dendogram_ward.png", height = 608, width = 975)
plot(senclus, hang = -1)
invisible(dev.off())

# Cut the tree into two, three and four clusters. For each partition, save the
# dendrogram with the clusters boxed and write the cluster membership file.
for (k in 2:4) {
  png(sprintf("Dendogram_%dclusters.png", k), height = 608, width = 975)
  plot(senclus, hang = -1)
  members <- rect.hclust(senclus, k = k, border = 3:4)
  invisible(dev.off())

  membership_file <- sprintf("SentenceMembership_%dclusters.txt", k)
  # print_cluster() appends to its target; remove the output of a previous
  # run so the results are not duplicated.
  if (file.exists(membership_file)) {
    file.remove(membership_file)
  }
  print_cluster(members, membership_file)
}
| 65 | + | ||
| 66 | + | ||
| 67 | + | ||
| 68 | + | 
scripts/change_tabs.py
0 → 100644
"""Convert a tab-separated sentence-vector file into a space-separated one.

Each input line is expected to hold an identifier, a tab, and the vector
values separated by single spaces.  The output line is the identifier with
a ``_s`` suffix, a single space, and the values.
"""
import sys
from optparse import OptionParser


def convert_line(line):
    """Return the converted form of one input line.

    Returns the new line (terminated by a newline), or ``None`` when *line*
    is blank.  Only the first two tab-separated fields are used.

    Raises:
        ValueError: if a non-blank line contains no tab-separated values.
    """
    stripped = line.rstrip()
    if not stripped:
        return None

    # Isolate the article number from its values.
    elements = stripped.split('\t')
    if len(elements) < 2:
        raise ValueError(
            "Expected '<id><TAB><values>' but got: {0!r}".format(line))

    # A letter suffix is added to make later identification easier.
    index = elements[0] + '_s'
    return ' '.join([index, elements[1]]) + '\n'


def convert_file(infile, outfile):
    """Convert every line of *infile* and write the result to *outfile*.

    Blank lines are skipped.  Returns the number of lines written.
    """
    written = 0
    # Both handles are managed by ``with`` so they are closed on error too.
    with open(infile) as vectors, open(outfile, 'w') as newfile:
        for line in vectors:
            newline = convert_line(line)
            if newline is None:
                continue
            newfile.write(newline)
            written += 1
    return written


def main(argv=None):
    """Parse the command line (``-i`` input, ``-o`` output) and convert.

    *argv* defaults to ``sys.argv[1:]``.  Returns 0 on success; invalid
    command lines are reported through ``parser.error`` (exit status 2).
    """
    parser = OptionParser()
    parser.add_option("-i", dest="inF",help="Input vector file. Sentence is separated by tabs from values which are sparated by simple space", metavar="PATH")
    parser.add_option("-o", dest="otF",help="output file name", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    if args:
        parser.error(
            "Unexpected positional arguments: {0}".format(' '.join(args)))
    if not options.inF or not options.otF:
        parser.error("Both -i (input file) and -o (output file) are required")

    convert_file(options.inF, options.otF)
    return 0


if __name__ == "__main__":
    sys.exit(main())
| ... | \ No newline at end of file | ... | \ No newline at end of file | 
- 
Please register or login to post a comment