Ignacio Arroyo Fernández

Added first clusters in Spanish

This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +for n in {4..30}; do python sent_clustering3.py -x data/w2v_corpus-spanish-seg_H300_w10.mtx -m cosine -c km -N 10 -k "$n" -o results/spanish-seg_w2v-cos_H300_NC0"$n".cls; done
1 +Traceback (most recent call last):
2 + File "sent_clustering3.py", line 70, in <module>
3 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
4 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
5 +Traceback (most recent call last):
6 + File "sent_clustering3.py", line 70, in <module>
7 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
8 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
9 +Traceback (most recent call last):
10 + File "sent_clustering3.py", line 70, in <module>
11 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
12 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
13 +Traceback (most recent call last):
14 + File "sent_clustering3.py", line 70, in <module>
15 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
16 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
17 +Traceback (most recent call last):
18 + File "sent_clustering3.py", line 70, in <module>
19 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
20 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
21 +Traceback (most recent call last):
22 + File "sent_clustering3.py", line 70, in <module>
23 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
24 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
25 +Traceback (most recent call last):
26 + File "sent_clustering3.py", line 70, in <module>
27 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
28 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
29 +Traceback (most recent call last):
30 + File "sent_clustering3.py", line 70, in <module>
31 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
32 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
33 +Traceback (most recent call last):
34 + File "sent_clustering3.py", line 70, in <module>
35 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
36 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
37 +Traceback (most recent call last):
38 + File "sent_clustering3.py", line 70, in <module>
39 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
40 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
41 +Traceback (most recent call last):
42 + File "sent_clustering3.py", line 70, in <module>
43 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
44 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
45 +Traceback (most recent call last):
46 + File "sent_clustering3.py", line 70, in <module>
47 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
48 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
49 +Traceback (most recent call last):
50 + File "sent_clustering3.py", line 70, in <module>
51 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
52 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
53 +Traceback (most recent call last):
54 + File "sent_clustering3.py", line 70, in <module>
55 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
56 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
57 +Traceback (most recent call last):
58 + File "sent_clustering3.py", line 70, in <module>
59 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
60 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
61 +Traceback (most recent call last):
62 + File "sent_clustering3.py", line 70, in <module>
63 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
64 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
65 +Traceback (most recent call last):
66 + File "sent_clustering3.py", line 70, in <module>
67 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
68 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
69 +Traceback (most recent call last):
70 + File "sent_clustering3.py", line 70, in <module>
71 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
72 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
73 +Traceback (most recent call last):
74 + File "sent_clustering3.py", line 70, in <module>
75 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
76 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
77 +Traceback (most recent call last):
78 + File "sent_clustering3.py", line 70, in <module>
79 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
80 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
81 +Traceback (most recent call last):
82 + File "sent_clustering3.py", line 70, in <module>
83 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
84 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
85 +Traceback (most recent call last):
86 + File "sent_clustering3.py", line 70, in <module>
87 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
88 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
89 +Traceback (most recent call last):
90 + File "sent_clustering3.py", line 70, in <module>
91 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
92 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
93 +Traceback (most recent call last):
94 + File "sent_clustering3.py", line 70, in <module>
95 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
96 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
97 +Traceback (most recent call last):
98 + File "sent_clustering3.py", line 70, in <module>
99 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
100 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
101 +Traceback (most recent call last):
102 + File "sent_clustering3.py", line 70, in <module>
103 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
104 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
105 +Traceback (most recent call last):
106 + File "sent_clustering3.py", line 70, in <module>
107 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
108 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
109 +Traceback (most recent call last):
110 + File "sent_clustering3.py", line 70, in <module>
111 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
112 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
113 +Traceback (most recent call last):
114 + File "sent_clustering3.py", line 70, in <module>
115 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
116 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
117 +Traceback (most recent call last):
118 + File "sent_clustering3.py", line 70, in <module>
119 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
120 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
121 +Traceback (most recent call last):
122 + File "sent_clustering3.py", line 70, in <module>
123 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
124 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
125 +Traceback (most recent call last):
126 + File "sent_clustering3.py", line 70, in <module>
127 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
128 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
129 +Traceback (most recent call last):
130 + File "sent_clustering3.py", line 70, in <module>
131 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
132 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
133 +Traceback (most recent call last):
134 + File "sent_clustering3.py", line 70, in <module>
135 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
136 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
137 +Traceback (most recent call last):
138 + File "sent_clustering3.py", line 70, in <module>
139 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
140 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
141 +Traceback (most recent call last):
142 + File "sent_clustering3.py", line 70, in <module>
143 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
144 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
145 +Traceback (most recent call last):
146 + File "sent_clustering3.py", line 70, in <module>
147 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
148 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
149 +Traceback (most recent call last):
150 + File "sent_clustering3.py", line 70, in <module>
151 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
152 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
153 +Traceback (most recent call last):
154 + File "sent_clustering3.py", line 70, in <module>
155 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
156 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
157 +Traceback (most recent call last):
158 + File "sent_clustering3.py", line 70, in <module>
159 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
160 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
161 +Traceback (most recent call last):
162 + File "sent_clustering3.py", line 70, in <module>
163 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
164 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
165 +Traceback (most recent call last):
166 + File "sent_clustering3.py", line 70, in <module>
167 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
168 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
169 +Traceback (most recent call last):
170 + File "sent_clustering3.py", line 70, in <module>
171 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
172 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
173 +Traceback (most recent call last):
174 + File "sent_clustering3.py", line 70, in <module>
175 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
176 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
177 +Traceback (most recent call last):
178 + File "sent_clustering3.py", line 70, in <module>
179 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
180 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
181 +Traceback (most recent call last):
182 + File "sent_clustering3.py", line 70, in <module>
183 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
184 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
185 +Traceback (most recent call last):
186 + File "sent_clustering3.py", line 70, in <module>
187 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
188 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
189 +Traceback (most recent call last):
190 + File "sent_clustering3.py", line 70, in <module>
191 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
192 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
193 +Traceback (most recent call last):
194 + File "sent_clustering3.py", line 70, in <module>
195 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
196 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
197 +Traceback (most recent call last):
198 + File "sent_clustering3.py", line 70, in <module>
199 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
200 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
201 +Traceback (most recent call last):
202 + File "sent_clustering3.py", line 70, in <module>
203 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
204 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
205 +Traceback (most recent call last):
206 + File "sent_clustering3.py", line 70, in <module>
207 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
208 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
209 +Traceback (most recent call last):
210 + File "sent_clustering3.py", line 70, in <module>
211 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
212 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
213 +Traceback (most recent call last):
214 + File "sent_clustering3.py", line 70, in <module>
215 + with open(term_name) as f: #"%s/%s.txt" % (route, term_name)) as f:
216 +FileNotFoundError: [Errno 2] No such file or directory: '.dic'
217 +Traceback (most recent call last):
218 + File "sent_clustering3.py", line 99, in <module>
219 + coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
220 + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 100, in silhouette_score
221 + return np.mean(silhouette_samples(X, labels, metric=metric, **kwds))
222 + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 166, in silhouette_samples
223 + check_number_of_labels(len(le.classes_), X.shape[0])
224 + File "/almac/ignacio/anaconda3/envs/py36/lib/python3.6/site-packages/sklearn/metrics/cluster/unsupervised.py", line 20, in check_number_of_labels
225 + "to n_samples - 1 (inclusive)" % n_labels)
226 +ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)
227 +Silhouette Coefficient: 0.045
228 +Number of clusters: 5
229 +
230 +Silhouette Coefficient: -0.017
231 +Number of clusters: 6
232 +
233 +Silhouette Coefficient: -0.012
234 +Number of clusters: 7
235 +
236 +Silhouette Coefficient: -0.016
237 +Number of clusters: 8
238 +
239 +Silhouette Coefficient: -0.019
240 +Number of clusters: 9
241 +
242 +Silhouette Coefficient: -0.062
243 +Number of clusters: 10
244 +
245 +Silhouette Coefficient: -0.054
246 +Number of clusters: 11
247 +
248 +Silhouette Coefficient: -0.068
249 +Number of clusters: 12
250 +
251 +Silhouette Coefficient: -0.076
252 +Number of clusters: 13
253 +
254 +Silhouette Coefficient: -0.046
255 +Number of clusters: 14
256 +
257 +Silhouette Coefficient: -0.092
258 +Number of clusters: 15
259 +
260 +Silhouette Coefficient: -0.089
261 +Number of clusters: 16
262 +
263 +Silhouette Coefficient: -0.091
264 +Number of clusters: 17
265 +
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +# -*- coding: utf-8 -*-
2 +from sklearn.cluster import KMeans
3 +from sklearn import metrics
4 +from scipy.cluster.hierarchy import ward, dendrogram
5 +from sklearn.metrics.pairwise import cosine_similarity
6 +from re import search, M, I
7 +import logging
8 +import matplotlib as mpl
9 +mpl.use('Agg')
10 +import matplotlib.pyplot as plt
11 +import sys
12 +import numpy as np
13 +from argparse import ArgumentParser as ap
14 +import os
15 +
# Timestamped, INFO-level logging for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Command-line interface. Options are declared as (flag, keyword-arguments)
# pairs and registered in order, so the generated help keeps the same layout.
parser = ap(description='This script clusters word embeddings by using k-means.')
_cli_options = (
    ("-x", dict(help="Input file name (vectors)", metavar="input_file", required=True)),
    ("-o", dict(help="Output file name (clusters)", metavar="output_file")),
    ("-m", dict(help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")),
    ("-c", dict(help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)),
    ("-k", dict(help="Number of clusters", metavar="k_clusters")),
    ("-N", dict(help="Number of trials for maximize Silhueltte", metavar="n_trials")),
    ("-t", dict(default=False, action="store_true",
                help="Toggle if labels are PoS tags instead of snippets.")),
    ("-n", dict(default=False, action="store_true",
                help="Toggle if labels are NounPhrases instead of snippets.")),
)
for _flag, _spec in _cli_options:
    parser.add_argument(_flag, **_spec)

# Values of -k and -N stay as strings here; they are converted where used.
args = parser.parse_args()
29 +
# Maximum label length to display; currently only referenced from a
# commented-out slice in cleaner().
min_show_length = 100

if args.m == "cosine":
    from sklearn.metrics.pairwise import cosine_distances as cos
    # NOTE(review): this only sets a class attribute on KMeans. Whether the
    # estimator ever consults that attribute depends on the installed
    # scikit-learn -- confirm that the cosine metric really takes effect.
    setattr(KMeans, "euclidean_distances", cos)
35 +
def cleaner(line):
    """Return *line* without leading and trailing whitespace.

    The text is not truncated; the length cap that used to be applied here
    is disabled.
    """
    stripped = line.strip()
    return stripped
38 +
39 +#try:
40 +# source = search(r"(?:vectors|pairs)_([A-Za-z]+[\-A-Za-z0-9]+)_?(T[0-9]{2,3}_C[1-9]_[0-9]{2}|d\d+t|\w+)?_([d2v|w2v|fstx|coocc\w*|doc\w*]*)_(H[0-9]{1,4})_?([sub|co[nvs{0,2}|rr|nc]+]?)?_(m[0-9]{1,3}[_w?[0-9]{0,3}]?)", args.x, M|I)
41 +
# Descriptors of the input vectors. They are hard-coded for the Spanish
# corpus instead of being parsed from the file name.
corpus = "spanish-seg"
representation = "w2v"
dimensions = "300"
min_count = "1"

# Labels file: same path as the vectors file with the extension replaced by
# ".dic". os.path.splitext only strips the final extension, so inputs such as
# "./data/vectors.mtx" or "../vectors.mtx" no longer collapse to ".dic", which
# is what str.split(".")[0] produced for them.
term_name = os.path.splitext(args.x)[0] + ".dic"
47 +
48 +#except IndexError:
49 +# print ("\nError in the filename. One or more indicators are missing. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimen..")
50 +# for i in range(6):
51 +# try:
52 +# print (source.group(i))
53 +# except IndexError:
54 +# print (":>> Unparsed: %s" % (i))
55 +# pass
56 +# exit()
57 +#except AttributeError:
58 +# print ("\nFatal Error in the filename. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimendions>_<operation>*_<mminimum_...")
59 +# for i in range(6):
60 +# try:
61 +# print (source.group(i))
62 +# except AttributeError:
63 +# print (":>> Unparsed: %s" % (i))
64 +# pass
65 +# exit()
66 +
# Directory holding the vectors file and its base name without extension.
# `route` used to be referenced below while its definition was commented
# out, which made the -n and -t options fail with NameError.
route = os.path.dirname(args.x)
_stem = os.path.splitext(os.path.basename(args.x))[0]

## Loading files
# Pick the labels file and the suffix `t` used later in output file names.
# -n takes precedence over -t, as before.
if args.n:
    # NOTE(review): assumes the noun-phrase file is "<vectors stem>.txt.phr"
    # next to the vectors file -- confirm the naming convention.
    labels_path = os.path.join(route, "%s.txt.phr" % _stem)
    t = "_phr"
elif args.t:
    # NOTE(review): assumes the PoS-tag file is "<vectors stem>.tags" next to
    # the vectors file -- confirm the naming convention.
    labels_path = os.path.join(route, "%s.tags" % _stem)
    t = "_tags"
else:
    labels_path = term_name
    t = ""

# One label per line, stripped of surrounding whitespace.
with open(labels_path) as f:
    snippets = [cleaner(line) for line in f]
# TODO: Parse the snippets with the correct vectors file.
# Rows of X are the samples handed to the clusterers; they are paired
# positionally with `snippets`.
X = np.loadtxt(args.x)
if len(snippets) != X.shape[0]:
    logging.warning("Number of labels (%d) differs from number of vectors (%d).",
                    len(snippets), X.shape[0])

if args.c.startswith("km"):
    if args.k is None:
        sys.exit("Option -k (number of clusters) is required for k-means.")
    num_clusters = int(args.k)

    # NOTE(review): the silhouette is computed with silhouette_score's default
    # metric regardless of -m -- confirm whether it should follow args.m.
    if "++" in args.c:
        # Single run with k-means++ seeding.
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    verbose=1, n_init=1)
        km.fit(X)
        # Stored in `silhouette` so the summary below can report it.
        silhouette = metrics.silhouette_score(X, km.labels_, sample_size=1000)
        clusters = km.labels_.tolist()
    else:
        # Several randomly initialised runs; keep the one with the highest
        # silhouette. A missing -N means a single trial.
        n_trials = int(args.N) if args.N is not None else 1
        clusters = None
        silhouette = -np.inf
        for tr in range(n_trials):
            # n_jobs is omitted: with n_init=1 there is a single run to
            # execute, and newer scikit-learn releases reject the argument.
            km = KMeans(n_clusters=num_clusters, init='random', max_iter=100,
                        verbose=1, n_init=1)
            km.fit(X)
            try:
                coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
            except ValueError as err:
                # Raised e.g. when the sampled points share a single label;
                # skip this trial instead of aborting the whole run.
                logging.warning("Trial %d skipped: %s", tr, err)
                continue
            if silhouette < coeff:
                clusters = km.labels_.tolist()
                silhouette = coeff
        if clusters is None:
            sys.exit("No trial produced a clustering with a valid silhouette.")

    # Ascending stable sort by cluster id. It is written in reverse below, so
    # the highest cluster id comes first (same order as popping from the end).
    definitions = sorted(zip(clusters, snippets), key=lambda x: x[0])

    # Write "<cluster>\t<label>" lines to the -o file, or to stdout. Only a
    # file opened here is closed.
    out = open(args.o, 'w') if args.o else sys.stdout
    try:
        for c, s in reversed(definitions):
            out.write("%d\t%s\n" % (c, s))
    finally:
        if out is not sys.stdout:
            out.close()

    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Number of clusters: %d" % num_clusters)
    print()

elif args.c.startswith("agg"):
    dist = 1 - cosine_similarity(X)
    # NOTE(review): `dist` is a square matrix; check scipy's documentation of
    # ward() on whether a condensed distance matrix is expected here.
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    dendrogram(linkage_matrix, orientation="right", labels=list(snippets), ax=ax)

    # Hide ticks and tick labels along the x-axis. Booleans are used because
    # the 'off' strings are not treated as False by newer matplotlib.
    ax.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)

    plt.tight_layout()
    # term_name may carry a directory part; only its base name goes into the
    # figure name so the image is written to the working directory.
    plt.savefig("ward_clusters_%s%s_%s_H%s.png"
                % (os.path.basename(term_name), t, corpus, dimensions), dpi=200)

else:
    logging.error("Unknown clusterer %r; expected one of km, km++, aggwr.", args.c)