Showing
18 changed files
with
404 additions
and
0 deletions
data/w2v_corpus-spanish-seg_H300_w10.dic
0 → 100644
This diff could not be displayed because it is too large.
data/w2v_corpus-spanish-seg_H300_w10.mtx
0 → 100644
This diff could not be displayed because it is too large.
multi_clustering.sh
0 → 100644
#!/usr/bin/env bash
# Run k-means sentence clustering once for every cluster count from 4 to 30.
# Each run writes results/spanish-seg_w2v-cos_H300_NC0<k>.cls; the output name
# is deliberately kept as-is so it matches result files produced earlier.
#
# The brace range {4..30} is a bash feature, hence the explicit interpreter.

# The clustering script opens the output file only after clustering has
# finished, so make sure the target directory exists before starting.
mkdir -p results

for n in {4..30}; do
    # A failing run is reported but does not abort the remaining values of k.
    python sent_clustering3.py \
        -x data/w2v_corpus-spanish-seg_H300_w10.mtx \
        -m cosine -c km -N 10 -k "$n" \
        -o results/spanish-seg_w2v-cos_H300_NC0"$n".cls \
        || echo "multi_clustering: clustering failed for k=$n" >&2
done
nohup.out
0 → 100644
1 | +Traceback (most recent call last): | ||
2 | + File "sent_clustering3.py", line 70, in <module> | ||
3 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
4 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
5 | +Traceback (most recent call last): | ||
6 | + File "sent_clustering3.py", line 70, in <module> | ||
7 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
8 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
9 | +Traceback (most recent call last): | ||
10 | + File "sent_clustering3.py", line 70, in <module> | ||
11 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
12 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
13 | +Traceback (most recent call last): | ||
14 | + File "sent_clustering3.py", line 70, in <module> | ||
15 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
16 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
17 | +Traceback (most recent call last): | ||
18 | + File "sent_clustering3.py", line 70, in <module> | ||
19 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
20 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
21 | +Traceback (most recent call last): | ||
22 | + File "sent_clustering3.py", line 70, in <module> | ||
23 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
24 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
25 | +Traceback (most recent call last): | ||
26 | + File "sent_clustering3.py", line 70, in <module> | ||
27 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
28 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
29 | +Traceback (most recent call last): | ||
30 | + File "sent_clustering3.py", line 70, in <module> | ||
31 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
32 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
33 | +Traceback (most recent call last): | ||
34 | + File "sent_clustering3.py", line 70, in <module> | ||
35 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
36 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
37 | +Traceback (most recent call last): | ||
38 | + File "sent_clustering3.py", line 70, in <module> | ||
39 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
40 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
41 | +Traceback (most recent call last): | ||
42 | + File "sent_clustering3.py", line 70, in <module> | ||
43 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
44 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
45 | +Traceback (most recent call last): | ||
46 | + File "sent_clustering3.py", line 70, in <module> | ||
47 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
48 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
49 | +Traceback (most recent call last): | ||
50 | + File "sent_clustering3.py", line 70, in <module> | ||
51 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
52 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
53 | +Traceback (most recent call last): | ||
54 | + File "sent_clustering3.py", line 70, in <module> | ||
55 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
56 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
57 | +Traceback (most recent call last): | ||
58 | + File "sent_clustering3.py", line 70, in <module> | ||
59 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
60 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
61 | +Traceback (most recent call last): | ||
62 | + File "sent_clustering3.py", line 70, in <module> | ||
63 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
64 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
65 | +Traceback (most recent call last): | ||
66 | + File "sent_clustering3.py", line 70, in <module> | ||
67 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
68 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
69 | +Traceback (most recent call last): | ||
70 | + File "sent_clustering3.py", line 70, in <module> | ||
71 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
72 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
73 | +Traceback (most recent call last): | ||
74 | + File "sent_clustering3.py", line 70, in <module> | ||
75 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
76 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
77 | +Traceback (most recent call last): | ||
78 | + File "sent_clustering3.py", line 70, in <module> | ||
79 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
80 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
81 | +Traceback (most recent call last): | ||
82 | + File "sent_clustering3.py", line 70, in <module> | ||
83 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
84 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
85 | +Traceback (most recent call last): | ||
86 | + File "sent_clustering3.py", line 70, in <module> | ||
87 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
88 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
89 | +Traceback (most recent call last): | ||
90 | + File "sent_clustering3.py", line 70, in <module> | ||
91 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
92 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
93 | +Traceback (most recent call last): | ||
94 | + File "sent_clustering3.py", line 70, in <module> | ||
95 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
96 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
97 | +Traceback (most recent call last): | ||
98 | + File "sent_clustering3.py", line 70, in <module> | ||
99 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
100 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
101 | +Traceback (most recent call last): | ||
102 | + File "sent_clustering3.py", line 70, in <module> | ||
103 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
104 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
105 | +Traceback (most recent call last): | ||
106 | + File "sent_clustering3.py", line 70, in <module> | ||
107 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
108 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
109 | +Traceback (most recent call last): | ||
110 | + File "sent_clustering3.py", line 70, in <module> | ||
111 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
112 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
113 | +Traceback (most recent call last): | ||
114 | + File "sent_clustering3.py", line 70, in <module> | ||
115 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
116 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
117 | +Traceback (most recent call last): | ||
118 | + File "sent_clustering3.py", line 70, in <module> | ||
119 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
120 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
121 | +Traceback (most recent call last): | ||
122 | + File "sent_clustering3.py", line 70, in <module> | ||
123 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
124 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
125 | +Traceback (most recent call last): | ||
126 | + File "sent_clustering3.py", line 70, in <module> | ||
127 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
128 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
129 | +Traceback (most recent call last): | ||
130 | + File "sent_clustering3.py", line 70, in <module> | ||
131 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
132 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
133 | +Traceback (most recent call last): | ||
134 | + File "sent_clustering3.py", line 70, in <module> | ||
135 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
136 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
137 | +Traceback (most recent call last): | ||
138 | + File "sent_clustering3.py", line 70, in <module> | ||
139 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
140 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
141 | +Traceback (most recent call last): | ||
142 | + File "sent_clustering3.py", line 70, in <module> | ||
143 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
144 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
145 | +Traceback (most recent call last): | ||
146 | + File "sent_clustering3.py", line 70, in <module> | ||
147 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
148 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
149 | +Traceback (most recent call last): | ||
150 | + File "sent_clustering3.py", line 70, in <module> | ||
151 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
152 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
153 | +Traceback (most recent call last): | ||
154 | + File "sent_clustering3.py", line 70, in <module> | ||
155 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
156 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
157 | +Traceback (most recent call last): | ||
158 | + File "sent_clustering3.py", line 70, in <module> | ||
159 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
160 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
161 | +Traceback (most recent call last): | ||
162 | + File "sent_clustering3.py", line 70, in <module> | ||
163 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
164 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
165 | +Traceback (most recent call last): | ||
166 | + File "sent_clustering3.py", line 70, in <module> | ||
167 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
168 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
169 | +Traceback (most recent call last): | ||
170 | + File "sent_clustering3.py", line 70, in <module> | ||
171 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
172 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
173 | +Traceback (most recent call last): | ||
174 | + File "sent_clustering3.py", line 70, in <module> | ||
175 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
176 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
177 | +Traceback (most recent call last): | ||
178 | + File "sent_clustering3.py", line 70, in <module> | ||
179 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
180 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
181 | +Traceback (most recent call last): | ||
182 | + File "sent_clustering3.py", line 70, in <module> | ||
183 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
184 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
185 | +Traceback (most recent call last): | ||
186 | + File "sent_clustering3.py", line 70, in <module> | ||
187 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
188 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
189 | +Traceback (most recent call last): | ||
190 | + File "sent_clustering3.py", line 70, in <module> | ||
191 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
192 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
193 | +Traceback (most recent call last): | ||
194 | + File "sent_clustering3.py", line 70, in <module> | ||
195 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
196 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
197 | +Traceback (most recent call last): | ||
198 | + File "sent_clustering3.py", line 70, in <module> | ||
199 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
200 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
201 | +Traceback (most recent call last): | ||
202 | + File "sent_clustering3.py", line 70, in <module> | ||
203 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
204 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
205 | +Traceback (most recent call last): | ||
206 | + File "sent_clustering3.py", line 70, in <module> | ||
207 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
208 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
209 | +Traceback (most recent call last): | ||
210 | + File "sent_clustering3.py", line 70, in <module> | ||
211 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
212 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
213 | +Traceback (most recent call last): | ||
214 | + File "sent_clustering3.py", line 70, in <module> | ||
215 | + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f: | ||
216 | +FileNotFoundError: [Errno 2] No such file or directory: '.dic' | ||
217 | +Traceback (most recent call last): | ||
218 | + File "sent_clustering3.py", line 99, in <module> | ||
219 | + coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000) | ||
220 | + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 100, in silhouette_score | ||
221 | + return np.mean(silhouette_samples(X, labels, metric=metric, **kwds)) | ||
222 | + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 166, in silhouette_samples | ||
223 | + check_number_of_labels(len(le.classes_), X.shape[0]) | ||
224 | + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 20, in check_number_of_labels | ||
225 | + "to n_samples - 1 (inclusive)" % n_labels) | ||
226 | +ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive) | ||
227 | +Silhouette Coefficient: 0.045 | ||
228 | +Number of clusters: 5 | ||
229 | + | ||
230 | +Silhouette Coefficient: -0.017 | ||
231 | +Number of clusters: 6 | ||
232 | + | ||
233 | +Silhouette Coefficient: -0.012 | ||
234 | +Number of clusters: 7 | ||
235 | + | ||
236 | +Silhouette Coefficient: -0.016 | ||
237 | +Number of clusters: 8 | ||
238 | + | ||
239 | +Silhouette Coefficient: -0.019 | ||
240 | +Number of clusters: 9 | ||
241 | + | ||
242 | +Silhouette Coefficient: -0.062 | ||
243 | +Number of clusters: 10 | ||
244 | + | ||
245 | +Silhouette Coefficient: -0.054 | ||
246 | +Number of clusters: 11 | ||
247 | + | ||
248 | +Silhouette Coefficient: -0.068 | ||
249 | +Number of clusters: 12 | ||
250 | + | ||
251 | +Silhouette Coefficient: -0.076 | ||
252 | +Number of clusters: 13 | ||
253 | + | ||
254 | +Silhouette Coefficient: -0.046 | ||
255 | +Number of clusters: 14 | ||
256 | + | ||
257 | +Silhouette Coefficient: -0.092 | ||
258 | +Number of clusters: 15 | ||
259 | + | ||
260 | +Silhouette Coefficient: -0.089 | ||
261 | +Number of clusters: 16 | ||
262 | + | ||
263 | +Silhouette Coefficient: -0.091 | ||
264 | +Number of clusters: 17 | ||
265 | + |
results/spanish-seg_w2v-cos_H300_NC010.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC011.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC012.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC013.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC014.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC015.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC016.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC017.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC05.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC06.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC07.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC08.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC09.cls
0 → 100644
This diff could not be displayed because it is too large.
sent_clustering3.py
0 → 100644
1 | +# -*- coding: utf-8 -*- | ||
2 | +from sklearn.cluster import KMeans | ||
3 | +from sklearn import metrics | ||
4 | +from scipy.cluster.hierarchy import ward, dendrogram | ||
5 | +from sklearn.metrics.pairwise import cosine_similarity | ||
6 | +from re import search, M, I | ||
7 | +import logging | ||
8 | +import matplotlib as mpl | ||
9 | +mpl.use('Agg') | ||
10 | +import matplotlib.pyplot as plt | ||
11 | +import sys | ||
12 | +import numpy as np | ||
13 | +from argparse import ArgumentParser as ap | ||
14 | +import os | ||
15 | + | ||
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Command-line interface.
#   -x  vectors file (required)
#   -o  output file for the cluster assignments (stdout when omitted)
#   -m  clustering metric name
#   -c  clusterer: km (random init), km++ (k-means++ init) or agg* (Ward)
#   -k  number of clusters (needed by the k-means clusterers)
#   -N  number of random-init trials; the best silhouette is kept
#   -t / -n  read PoS tags / noun phrases as labels instead of snippets
# -k and -N are parsed as integers so that malformed values are rejected by
# argparse with a usage message instead of failing later in the script.
parser = ap(description='This script clusters word embeddings by using k-means.')
parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
parser.add_argument("-c", help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)
parser.add_argument("-k", help="Number of clusters", metavar="k_clusters", type=int)
parser.add_argument("-N", help="Number of trials to maximize the Silhouette coefficient",
                    metavar="n_trials", type=int, default=1)
parser.add_argument("-t", default=False, action="store_true", help="Toggle if labels are PoS tags instead of snippets.")
parser.add_argument("-n", default=False, action="store_true", help="Toggle if labels are NounPhrases instead of snippets.")
args = parser.parse_args()

# k-means cannot run without a cluster count; fail with a usage message now
# rather than with a TypeError after the input files have been loaded.
if args.c.startswith("km") and args.k is None:
    parser.error("-k (number of clusters) is required when the clusterer is km or km++")
29 | + | ||
# Maximum number of characters of a snippet to display.  The slice that would
# apply it inside cleaner() is commented out, so it has no effect there.
min_show_length = 100

if args.m == "cosine":
    # Attach scikit-learn's cosine distance function to the KMeans class under
    # the name "euclidean_distances".
    # NOTE(review): KMeans does not appear to look this attribute up when
    # fitting, so this assignment probably leaves the clustering Euclidean --
    # confirm against the installed scikit-learn version.
    from sklearn.metrics.pairwise import cosine_distances as cos
    setattr(KMeans, "euclidean_distances", cos)
35 | + | ||
def cleaner(line):
    """Return *line* without its leading and trailing whitespace.

    Truncation to ``min_show_length`` characters is currently disabled, so
    the stripped line is returned in full.
    """
    stripped = line.strip()
    return stripped
38 | + | ||
39 | +#try: | ||
40 | +# source = search(r"(?:vectors|pairs)_([A-Za-z]+[\-A-Za-z0-9]+)_?(T[0-9]{2,3}_C[1-9]_[0-9]{2}|d\d+t|\w+)?_([d2v|w2v|fstx|coocc\w*|doc\w*]*)_(H[0-9]{1,4})_?([sub|co[nvs{0,2}|rr|nc]+]?)?_(m[0-9]{1,3}[_w?[0-9]{0,3}]?)", args.x, M|I) | ||
41 | + | ||
# Run metadata.  These values used to be taken from the groups of the
# file-name regular expression that is now commented out, and are hard-coded
# for the Spanish word2vec run.
corpus = "spanish-seg"    # was source.group(1)
representation = "w2v"    # was source.group(3)
dimensions = "300"        # was source.group(4)[1:]
min_count = "1"           # was source.group(6)[1:]
# Label (dictionary) file: the vectors path with its extension replaced by
# ".dic".  os.path.splitext only removes the final extension, so dots that
# occur earlier in the path (e.g. "./data/vectors.mtx") no longer truncate
# the name down to a bare ".dic".
term_name = os.path.splitext(args.x)[0] + ".dic"
47 | + | ||
48 | +#except IndexError: | ||
49 | +# print ("\nError in the filename. One or more indicators are missing. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimen..") | ||
50 | +# for i in range(6): | ||
51 | +# try: | ||
52 | +# print (source.group(i)) | ||
53 | +# except IndexError: | ||
54 | +# print (":>> Unparsed: %s" % (i)) | ||
55 | +# pass | ||
56 | +# exit() | ||
57 | +#except AttributeError: | ||
58 | +# print ("\nFatal Error in the filename. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimendions>_<operation>*_<mminimum_...") | ||
59 | +# for i in range(6): | ||
60 | +# try: | ||
61 | +# print (source.group(i)) | ||
62 | +# except AttributeError: | ||
63 | +# print (":>> Unparsed: %s" % (i)) | ||
64 | +# pass | ||
65 | +# exit() | ||
66 | + | ||
67 | +#route = os.path.dirname(args.x) | ||
68 | +## Loading files | ||
# Choose the label file and the suffix `t` used later in the figure name:
#   default -> the dictionary file `term_name`,        t = ""
#   -n      -> "<route>/<stem>.txt.phr" (noun phrases), t = "_phr"
#   -t      -> "<route>/<stem>.tags"    (PoS tags),     t = "_tags"
# -n wins when both toggles are given, as before.
# `route` must be defined here because its only assignment in the file is
# commented out, which made the -n and -t branches fail with a NameError.
# NOTE(review): the .txt.phr / .tags files are assumed to sit next to the
# vectors file and to share its base name -- confirm the naming convention.
route = os.path.dirname(args.x) or "."
label_stem = os.path.splitext(os.path.basename(args.x))[0]

if args.n:
    label_path = "%s/%s.txt.phr" % (route, label_stem)
    t = "_phr"
elif args.t:
    label_path = "%s/%s.tags" % (route, label_stem)
    t = "_tags"
else:
    label_path = term_name
    t = ""

# One label per line, stripped of surrounding whitespace.
with open(label_path) as f:
    snippets = [cleaner(line) for line in f]
def _unit_rows(matrix):
    """Return *matrix* with every row scaled to unit L2 norm.

    All-zero rows are left unchanged instead of being divided by zero.
    """
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return matrix / norms


def _silhouette_or_none(data, labels, metric):
    """Return the sampled silhouette coefficient, or None when it is undefined.

    scikit-learn raises ValueError when the (sampled) labelling has fewer than
    two distinct clusters; that case is logged and reported as None so that a
    single degenerate trial does not abort the whole run.
    """
    try:
        return metrics.silhouette_score(data, labels, metric=metric,
                                        sample_size=1000)
    except ValueError as err:
        logging.warning("Silhouette coefficient undefined for this trial: %s", err)
        return None


# TODO: Check that the snippets are aligned with the rows of the vectors file.
# ndmin=2 keeps a single-row file two-dimensional.
X = np.loadtxt(args.x, ndmin=2)
if len(snippets) != len(X):
    logging.warning("Label file has %d entries but the vectors file has %d rows.",
                    len(snippets), len(X))

if args.c.startswith("km"):
    if args.k is None:
        sys.exit("The number of clusters (-k) is required for k-means clustering.")
    num_clusters = int(args.k)

    # KMeans always minimises Euclidean distance.  For the cosine metric the
    # rows are scaled to unit length first: between unit vectors the squared
    # Euclidean distance equals 2 * (1 - cosine similarity), so the partition
    # then follows cosine similarity.  The silhouette uses the same metric.
    use_cosine = args.m == "cosine"
    data = _unit_rows(X) if use_cosine else X
    silhouette_metric = "cosine" if use_cosine else "euclidean"

    if "++" in args.c:
        # k-means++ seeding: a single run.
        init_method = 'k-means++'
        n_trials = 1
    else:
        # Random seeding: repeat and keep the trial with the best silhouette.
        init_method = 'random'
        n_trials = int(args.N) if args.N is not None else 1

    clusters = None
    silhouette = -1
    for tr in range(n_trials):
        # n_jobs is no longer passed: recent scikit-learn releases reject it
        # and with n_init=1 there was nothing to run in parallel.
        km = KMeans(n_clusters=num_clusters, init=init_method, max_iter=100,
                    verbose=1, n_init=1)
        km.fit(data)
        coeff = _silhouette_or_none(data, km.labels_, silhouette_metric)
        if coeff is None:
            continue
        if clusters is None or silhouette < coeff:
            clusters = km.labels_.tolist()
            silhouette = coeff

    if clusters is None:
        sys.exit("No trial produced a clustering with a defined silhouette "
                 "coefficient (k=%d)." % num_clusters)

    # Same order as before: descending cluster id, and within a cluster the
    # reverse of the input order.
    definitions = sorted(zip(clusters, snippets), key=lambda pair: pair[0])
    lines = ["%d\t%s\n" % (c, s) for c, s in reversed(definitions)]

    if args.o:
        # The context manager guarantees the file is flushed and closed.
        with open(args.o, 'w') as out:
            out.writelines(lines)
    else:
        # Each line already ends with a newline; print() would add a second.
        sys.stdout.writelines(lines)

    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Number of clusters: %d" % num_clusters)
    print()

elif args.c.startswith("agg"):
    dist = 1 - cosine_similarity(X)
    # NOTE(review): scipy's ward() documents its argument as a condensed
    # distance matrix or an observation matrix; a square matrix is passed
    # here, so its rows may be treated as observations -- confirm intent.
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    dendrogram(linkage_matrix, orientation="right", labels=list(snippets))

    # Hide the x-axis ticks and labels.  Booleans are used because the
    # 'off' strings are not a documented value for these parameters.
    plt.tick_params(axis='x',           # changes apply to the x-axis
                    which='both',       # both major and minor ticks
                    bottom=False,       # no ticks along the bottom edge
                    top=False,          # no ticks along the top edge
                    labelbottom=False)  # no labels along the bottom edge

    plt.tight_layout()
    # term_name may include a directory; only its base name goes into the
    # figure name so that the file is written to the working directory.
    plt.savefig("ward_clusters_%s%s_%s_H%s.png"
                % (os.path.basename(term_name), t, corpus, dimensions), dpi=200)

else:
    sys.exit("Unknown clusterer %r; expected km, km++ or a name starting with agg."
             % args.c)
-
Please register or login to post a comment