Showing
18 changed files
with
404 additions
and
0 deletions
data/w2v_corpus-spanish-seg_H300_w10.dic
0 → 100644
This diff could not be displayed because it is too large.
data/w2v_corpus-spanish-seg_H300_w10.mtx
0 → 100644
This diff could not be displayed because it is too large.
multi_clustering.sh
0 → 100644
#!/usr/bin/env bash
# Sweep the number of k-means clusters (k = 4..30) over the Spanish word2vec
# vectors, writing one cluster file per k under results/.
#
# The {4..30} brace expansion is a bash feature, hence the explicit bash
# shebang: under a plain POSIX sh the literal string "{4..30}" would be
# passed to -k instead of the individual numbers.

vectors="data/w2v_corpus-spanish-seg_H300_w10.mtx"

# sent_clustering3.py opens the -o path for writing and does not create
# missing directories, so make sure the output directory exists first.
mkdir -p results

for n in {4..30}; do
    # A failing run does not stop the sweep; the next k is still attempted.
    python sent_clustering3.py -x "$vectors" -m cosine -c km -N 10 -k "$n" \
        -o "results/spanish-seg_w2v-cos_H300_NC0${n}.cls"
done
nohup.out
0 → 100644
| 1 | +Traceback (most recent call last): | ||
| 2 | + File "sent_clustering3.py", line 70, in <module> | ||
| 3 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 4 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 5 | +Traceback (most recent call last): | ||
| 6 | + File "sent_clustering3.py", line 70, in <module> | ||
| 7 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 8 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 9 | +Traceback (most recent call last): | ||
| 10 | + File "sent_clustering3.py", line 70, in <module> | ||
| 11 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 12 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 13 | +Traceback (most recent call last): | ||
| 14 | + File "sent_clustering3.py", line 70, in <module> | ||
| 15 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 16 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 17 | +Traceback (most recent call last): | ||
| 18 | + File "sent_clustering3.py", line 70, in <module> | ||
| 19 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 20 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 21 | +Traceback (most recent call last): | ||
| 22 | + File "sent_clustering3.py", line 70, in <module> | ||
| 23 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 24 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 25 | +Traceback (most recent call last): | ||
| 26 | + File "sent_clustering3.py", line 70, in <module> | ||
| 27 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 28 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 29 | +Traceback (most recent call last): | ||
| 30 | + File "sent_clustering3.py", line 70, in <module> | ||
| 31 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 32 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 33 | +Traceback (most recent call last): | ||
| 34 | + File "sent_clustering3.py", line 70, in <module> | ||
| 35 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 36 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 37 | +Traceback (most recent call last): | ||
| 38 | + File "sent_clustering3.py", line 70, in <module> | ||
| 39 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 40 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 41 | +Traceback (most recent call last): | ||
| 42 | + File "sent_clustering3.py", line 70, in <module> | ||
| 43 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 44 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 45 | +Traceback (most recent call last): | ||
| 46 | + File "sent_clustering3.py", line 70, in <module> | ||
| 47 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 48 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 49 | +Traceback (most recent call last): | ||
| 50 | + File "sent_clustering3.py", line 70, in <module> | ||
| 51 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 52 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 53 | +Traceback (most recent call last): | ||
| 54 | + File "sent_clustering3.py", line 70, in <module> | ||
| 55 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 56 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 57 | +Traceback (most recent call last): | ||
| 58 | + File "sent_clustering3.py", line 70, in <module> | ||
| 59 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 60 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 61 | +Traceback (most recent call last): | ||
| 62 | + File "sent_clustering3.py", line 70, in <module> | ||
| 63 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 64 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 65 | +Traceback (most recent call last): | ||
| 66 | + File "sent_clustering3.py", line 70, in <module> | ||
| 67 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 68 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 69 | +Traceback (most recent call last): | ||
| 70 | + File "sent_clustering3.py", line 70, in <module> | ||
| 71 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 72 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 73 | +Traceback (most recent call last): | ||
| 74 | + File "sent_clustering3.py", line 70, in <module> | ||
| 75 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 76 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 77 | +Traceback (most recent call last): | ||
| 78 | + File "sent_clustering3.py", line 70, in <module> | ||
| 79 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 80 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 81 | +Traceback (most recent call last): | ||
| 82 | + File "sent_clustering3.py", line 70, in <module> | ||
| 83 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 84 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 85 | +Traceback (most recent call last): | ||
| 86 | + File "sent_clustering3.py", line 70, in <module> | ||
| 87 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 88 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 89 | +Traceback (most recent call last): | ||
| 90 | + File "sent_clustering3.py", line 70, in <module> | ||
| 91 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 92 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 93 | +Traceback (most recent call last): | ||
| 94 | + File "sent_clustering3.py", line 70, in <module> | ||
| 95 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 96 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 97 | +Traceback (most recent call last): | ||
| 98 | + File "sent_clustering3.py", line 70, in <module> | ||
| 99 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 100 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 101 | +Traceback (most recent call last): | ||
| 102 | + File "sent_clustering3.py", line 70, in <module> | ||
| 103 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 104 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 105 | +Traceback (most recent call last): | ||
| 106 | + File "sent_clustering3.py", line 70, in <module> | ||
| 107 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 108 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 109 | +Traceback (most recent call last): | ||
| 110 | + File "sent_clustering3.py", line 70, in <module> | ||
| 111 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 112 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 113 | +Traceback (most recent call last): | ||
| 114 | + File "sent_clustering3.py", line 70, in <module> | ||
| 115 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 116 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 117 | +Traceback (most recent call last): | ||
| 118 | + File "sent_clustering3.py", line 70, in <module> | ||
| 119 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 120 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 121 | +Traceback (most recent call last): | ||
| 122 | + File "sent_clustering3.py", line 70, in <module> | ||
| 123 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 124 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 125 | +Traceback (most recent call last): | ||
| 126 | + File "sent_clustering3.py", line 70, in <module> | ||
| 127 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 128 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 129 | +Traceback (most recent call last): | ||
| 130 | + File "sent_clustering3.py", line 70, in <module> | ||
| 131 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 132 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 133 | +Traceback (most recent call last): | ||
| 134 | + File "sent_clustering3.py", line 70, in <module> | ||
| 135 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 136 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 137 | +Traceback (most recent call last): | ||
| 138 | + File "sent_clustering3.py", line 70, in <module> | ||
| 139 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 140 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 141 | +Traceback (most recent call last): | ||
| 142 | + File "sent_clustering3.py", line 70, in <module> | ||
| 143 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 144 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 145 | +Traceback (most recent call last): | ||
| 146 | + File "sent_clustering3.py", line 70, in <module> | ||
| 147 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 148 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 149 | +Traceback (most recent call last): | ||
| 150 | + File "sent_clustering3.py", line 70, in <module> | ||
| 151 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 152 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 153 | +Traceback (most recent call last): | ||
| 154 | + File "sent_clustering3.py", line 70, in <module> | ||
| 155 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 156 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 157 | +Traceback (most recent call last): | ||
| 158 | + File "sent_clustering3.py", line 70, in <module> | ||
| 159 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 160 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 161 | +Traceback (most recent call last): | ||
| 162 | + File "sent_clustering3.py", line 70, in <module> | ||
| 163 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 164 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 165 | +Traceback (most recent call last): | ||
| 166 | + File "sent_clustering3.py", line 70, in <module> | ||
| 167 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 168 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 169 | +Traceback (most recent call last): | ||
| 170 | + File "sent_clustering3.py", line 70, in <module> | ||
| 171 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 172 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 173 | +Traceback (most recent call last): | ||
| 174 | + File "sent_clustering3.py", line 70, in <module> | ||
| 175 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 176 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 177 | +Traceback (most recent call last): | ||
| 178 | + File "sent_clustering3.py", line 70, in <module> | ||
| 179 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 180 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 181 | +Traceback (most recent call last): | ||
| 182 | + File "sent_clustering3.py", line 70, in <module> | ||
| 183 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 184 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 185 | +Traceback (most recent call last): | ||
| 186 | + File "sent_clustering3.py", line 70, in <module> | ||
| 187 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 188 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 189 | +Traceback (most recent call last): | ||
| 190 | + File "sent_clustering3.py", line 70, in <module> | ||
| 191 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 192 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 193 | +Traceback (most recent call last): | ||
| 194 | + File "sent_clustering3.py", line 70, in <module> | ||
| 195 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 196 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 197 | +Traceback (most recent call last): | ||
| 198 | + File "sent_clustering3.py", line 70, in <module> | ||
| 199 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 200 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 201 | +Traceback (most recent call last): | ||
| 202 | + File "sent_clustering3.py", line 70, in <module> | ||
| 203 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 204 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 205 | +Traceback (most recent call last): | ||
| 206 | + File "sent_clustering3.py", line 70, in <module> | ||
| 207 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 208 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 209 | +Traceback (most recent call last): | ||
| 210 | + File "sent_clustering3.py", line 70, in <module> | ||
| 211 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 212 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 213 | +Traceback (most recent call last): | ||
| 214 | + File "sent_clustering3.py", line 70, in <module> | ||
| 215 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
| 216 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
| 217 | +Traceback (most recent call last): | ||
| 218 | + File "sent_clustering3.py", line 99, in <module> | ||
| 219 | + coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000) | ||
| 220 | + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 100, in silhouette_score | ||
| 221 | + return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) | ||
| 222 | + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 166, in silhouette_samples | ||
| 223 | + check_number_of_labels(len(le.classes_), X.shape[0]) | ||
| 224 | + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 20, in check_number_of_labels | ||
| 225 | + "to n_samples - 1 (inclusive)" % n_labels) | ||
| 226 | +ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive) | ||
| 227 | +Silhouette Coefficient: 0.045 | ||
| 228 | +Number of clusters: 5 | ||
| 229 | + | ||
| 230 | +Silhouette Coefficient: -0.017 | ||
| 231 | +Number of clusters: 6 | ||
| 232 | + | ||
| 233 | +Silhouette Coefficient: -0.012 | ||
| 234 | +Number of clusters: 7 | ||
| 235 | + | ||
| 236 | +Silhouette Coefficient: -0.016 | ||
| 237 | +Number of clusters: 8 | ||
| 238 | + | ||
| 239 | +Silhouette Coefficient: -0.019 | ||
| 240 | +Number of clusters: 9 | ||
| 241 | + | ||
| 242 | +Silhouette Coefficient: -0.062 | ||
| 243 | +Number of clusters: 10 | ||
| 244 | + | ||
| 245 | +Silhouette Coefficient: -0.054 | ||
| 246 | +Number of clusters: 11 | ||
| 247 | + | ||
| 248 | +Silhouette Coefficient: -0.068 | ||
| 249 | +Number of clusters: 12 | ||
| 250 | + | ||
| 251 | +Silhouette Coefficient: -0.076 | ||
| 252 | +Number of clusters: 13 | ||
| 253 | + | ||
| 254 | +Silhouette Coefficient: -0.046 | ||
| 255 | +Number of clusters: 14 | ||
| 256 | + | ||
| 257 | +Silhouette Coefficient: -0.092 | ||
| 258 | +Number of clusters: 15 | ||
| 259 | + | ||
| 260 | +Silhouette Coefficient: -0.089 | ||
| 261 | +Number of clusters: 16 | ||
| 262 | + | ||
| 263 | +Silhouette Coefficient: -0.091 | ||
| 264 | +Number of clusters: 17 | ||
| 265 | + |
results/spanish-seg_w2v-cos_H300_NC010.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC011.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC012.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC013.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC014.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC015.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC016.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC017.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC05.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC06.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC07.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC08.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC09.cls
0 → 100644
This diff could not be displayed because it is too large.
sent_clustering3.py
0 → 100644
| 1 | +# -*- coding: utf-8 -*- | ||
| 2 | +from sklearn.cluster import KMeans | ||
| 3 | +from sklearn import metrics | ||
| 4 | +from scipy.cluster.hierarchy import ward, dendrogram | ||
| 5 | +from sklearn.metrics.pairwise import cosine_similarity | ||
| 6 | +from re import search, M, I | ||
| 7 | +import logging | ||
| 8 | +import matplotlib as mpl | ||
| 9 | +mpl.use('Agg') | ||
| 10 | +import matplotlib.pyplot as plt | ||
| 11 | +import sys | ||
| 12 | +import numpy as np | ||
| 13 | +from argparse import ArgumentParser as ap | ||
| 14 | +import os | ||
| 15 | + | ||
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Only referenced by the (disabled) truncation in cleaner(); kept for reference.
min_show_length = 100

# Corpus metadata. An earlier revision parsed these fields out of the vectors
# file name with a regular expression; they are currently hard-coded.
corpus = "spanish-seg"
representation = "w2v"
dimensions = "300"
min_count = "1"


def build_parser():
    """Return the command-line parser for this script.

    ``-k`` and ``-N`` are converted to ``int`` by argparse so a malformed
    value is reported as a usage error instead of a traceback later on.
    """
    parser = ap(description='This script clusters word embeddings by using k-means.')
    parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
    parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
    parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
    parser.add_argument("-c", help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)
    parser.add_argument("-k", help="Number of clusters", metavar="k_clusters", type=int)
    parser.add_argument("-N", help="Number of trials for maximize Silhueltte", metavar="n_trials",
                        type=int, default=1)
    parser.add_argument("-t", default=False, action="store_true",
                        help="Toggle if labels are PoS tags instead of snippets.")
    parser.add_argument("-n", default=False, action="store_true",
                        help="Toggle if labels are NounPhrases instead of snippets.")
    return parser


def cleaner(line):
    """Return *line* without leading/trailing whitespace (newline included)."""
    return line.strip()


def companion_path(vectors_path, suffix):
    """Return the path of the label file that sits next to *vectors_path*.

    Only the final extension of *vectors_path* is replaced by *suffix*.
    Splitting on the first "." (as done previously) turned "./data/v.mtx"
    into ".dic", which is the FileNotFoundError recorded in nohup.out.
    """
    return os.path.splitext(vectors_path)[0] + suffix


def load_snippets(vectors_path, use_tags=False, use_phrases=False):
    """Load one label per line for the vectors stored in *vectors_path*.

    Returns ``(snippets, tag)`` where *tag* is the file-name suffix used for
    plots ("", "_phr" or "_tags"). Noun phrases take precedence over PoS tags
    when both switches are given, as in the original if/elif chain.
    """
    # NOTE(review): the ".txt.phr" / ".tags" companions were previously looked
    # up through an undefined ``route`` variable (NameError); the locations
    # below assume they live next to the vectors file -- confirm.
    if use_phrases:
        suffix, tag = ".txt.phr", "_phr"
    elif use_tags:
        suffix, tag = ".tags", "_tags"
    else:
        suffix, tag = ".dic", ""
    with open(companion_path(vectors_path, suffix)) as f:
        return [cleaner(line) for line in f], tag


def unit_rows(X):
    """Return a float copy of *X* with every row scaled to unit L2 norm.

    All-zero rows are returned unchanged instead of producing NaNs.
    """
    X = np.asarray(X, dtype=float)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return X / norms


def cluster_kmeans(X, num_clusters, n_trials, init, metric):
    """Run k-means *n_trials* times and keep the best-silhouette labelling.

    Returns ``(silhouette, labels)``; both are ``None`` when no trial
    produced a labelling for which the silhouette could be computed.
    """
    best_score, best_labels = None, None
    for _ in range(n_trials):
        # n_jobs is not passed: with n_init=1 there is a single run per fit,
        # and recent scikit-learn releases no longer accept the argument.
        km = KMeans(n_clusters=num_clusters, init=init, max_iter=100,
                    verbose=1, n_init=1)
        km.fit(X)
        try:
            coeff = metrics.silhouette_score(X, km.labels_, metric=metric,
                                             sample_size=1000)
        except ValueError as err:
            # Raised when the evaluated labels contain a single cluster
            # (see the traceback in nohup.out); skip this trial only.
            logging.warning("Silhouette undefined for this trial, skipping: %s", err)
            continue
        if best_score is None or coeff > best_score:
            best_score, best_labels = coeff, km.labels_.tolist()
    return best_score, best_labels


def write_clusters(definitions, output_path=None):
    """Write "<cluster>\\t<snippet>" lines, last element of *definitions* first.

    Output goes to *output_path* when given, otherwise to standard output.
    The file is always closed, and stdout gets exactly one newline per record.
    """
    out = open(output_path, 'w') if output_path else sys.stdout
    try:
        for c, s in reversed(definitions):
            out.write("%d\t%s\n" % (c, s))
    finally:
        if out is not sys.stdout:
            out.close()


def main(argv=None):
    """Entry point: parse the command line and run the requested clusterer."""
    parser = build_parser()
    args = parser.parse_args(argv)

    snippets, t = load_snippets(args.x, use_tags=args.t, use_phrases=args.n)
    X = np.loadtxt(args.x, ndmin=2)
    if len(snippets) != X.shape[0]:
        # zip() below silently truncates to the shorter sequence.
        logging.warning("%d labels but %d vectors; extra items are ignored.",
                        len(snippets), X.shape[0])

    if args.c.startswith("km"):
        if args.k is None:
            parser.error("-k (number of clusters) is required for k-means clusterers")
        num_clusters = args.k

        # Any value other than "cosine" keeps the Euclidean behaviour.
        metric = "cosine" if args.m == "cosine" else "euclidean"
        # NOTE(review): the former ``KMeans.euclidean_distances = cos``
        # assignment appears to have had no effect on the fit, so "-m cosine"
        # still ran Euclidean k-means -- confirm against the installed
        # scikit-learn. On unit-length rows the squared Euclidean distance is
        # 2 * (1 - cosine similarity), so normalising makes k-means follow
        # the cosine geometry.
        data = unit_rows(X) if metric == "cosine" else X

        if "++" in args.c:
            init, n_trials = 'k-means++', 1
        else:
            init, n_trials = 'random', args.N  # Number of trials for maximizing Silhouette

        silhouette, clusters = cluster_kmeans(data, num_clusters, n_trials, init, metric)
        if clusters is None:
            sys.exit("No k-means trial produced a valid clustering; nothing written.")

        definitions = sorted(zip(clusters, snippets), key=lambda pair: pair[0])
        write_clusters(definitions, args.o)

        print("Silhouette Coefficient: %0.3f" % silhouette)
        print("Number of clusters: %d" % num_clusters)
        print()

    elif args.c.startswith("agg"):
        dist = 1 - cosine_similarity(X)
        # NOTE(review): scipy's ward() treats a 2-D array as observation
        # vectors, not as a precomputed distance matrix -- confirm that
        # passing the square matrix is intended.
        linkage_matrix = ward(dist)
        plt.subplots(figsize=(15, 20))  # set size
        dendrogram(linkage_matrix, orientation="right", labels=list(snippets))

        plt.tick_params(
            axis='x',           # changes apply to the x-axis
            which='both',       # both major and minor ticks are affected
            bottom=False,       # ticks along the bottom edge are off
            top=False,          # ticks along the top edge are off
            labelbottom=False)  # booleans: the 'off' strings are not valid values

        plt.tight_layout()
        # Use the bare file stem: embedding the full path would put a "/" into
        # the PNG name and point into a non-existent directory.
        stem = os.path.basename(os.path.splitext(args.x)[0])
        plt.savefig("ward_clusters_%s%s_%s_H%s.png" % (stem, t, corpus, dimensions), dpi=200)

    else:
        parser.error("unknown clusterer %r (expected km, km++ or aggwr)" % args.c)


if __name__ == "__main__":
    main()
-
Please register or login to post a comment