Ignacio Arroyo Fernández

Added first clusters in Spanish

This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
# Run the k-means clustering script once per cluster count k = 4..30 on the
# Spanish word2vec matrix, writing one .cls result file per value of k.
for n in {4..30}; do
    python sent_clustering3.py \
        -x data/w2v_corpus-spanish-seg_H300_w10.mtx \
        -m cosine \
        -c km \
        -N 10 \
        -k "$n" \
        -o results/spanish-seg_w2v-cos_H300_NC0"$n".cls
done
This diff is collapsed. Click to expand it.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 +# -*- coding: utf-8 -*-
2 +from sklearn.cluster import KMeans
3 +from sklearn import metrics
4 +from scipy.cluster.hierarchy import ward, dendrogram
5 +from sklearn.metrics.pairwise import cosine_similarity
6 +from re import search, M, I
7 +import logging
8 +import matplotlib as mpl
9 +mpl.use('Agg')
10 +import matplotlib.pyplot as plt
11 +import sys
12 +import numpy as np
13 +from argparse import ArgumentParser as ap
14 +import os
15 +
# Timestamped, INFO-level logging for the whole script.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Command-line interface.
#   -x  vectors file (required)
#   -o  optional output file for the cluster assignments (stdout when omitted)
#   -m  metric name; only the value "cosine" is treated specially downstream
#   -c  clusterer selector; matched by prefix ("km...", "agg...") downstream
#   -k  number of clusters (used by the k-means clusterers)
#   -N  number of random restarts for plain k-means
#   -t / -n  select PoS-tag or noun-phrase label files instead of snippets
parser = ap(description='This script clusters word embeddings by using k-means.')
parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
parser.add_argument("-c", help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)
# Parsed as integers so that a non-numeric value is rejected with a usage
# message instead of a traceback later on.
parser.add_argument("-k", help="Number of clusters", metavar="k_clusters", type=int)
# Defaults to a single trial so that omitting -N no longer ends in int(None).
parser.add_argument("-N", help="Number of trials to maximize the silhouette coefficient (default: 1)",
                    metavar="n_trials", type=int, default=1)
parser.add_argument("-t", default=False, action="store_true", help="Toggle if labels are PoS tags instead of snippets.")
parser.add_argument("-n", default=False, action="store_true", help="Toggle if labels are NounPhrases instead of snippets.")
args = parser.parse_args()
29 +
# Maximum number of characters of a snippet to show. In the visible code it is
# only referenced by the truncation that is commented out in cleaner().
min_show_length = 100

# Attempt to make k-means use cosine distance when "-m cosine" is given.
# NOTE(review): this only sets an attribute on the KMeans class; it is not
# clear that scikit-learn ever reads it, so the clustering may still be
# Euclidean -- confirm.
if args.m=="cosine":
    from sklearn.metrics.pairwise import cosine_distances as cos
    KMeans.euclidean_distances=cos
35 +
def cleaner(line):
    """Return ``line`` without its leading and trailing whitespace.

    Truncating the result to ``min_show_length`` characters is currently
    disabled, so the whole stripped text is returned.
    """
    stripped = line.strip()  # [:min_show_length]
    return stripped
38 +
39 +#try:
40 +# source = search(r"(?:vectors|pairs)_([A-Za-z]+[\-A-Za-z0-9]+)_?(T[0-9]{2,3}_C[1-9]_[0-9]{2}|d\d+t|\w+)?_([d2v|w2v|fstx|coocc\w*|doc\w*]*)_(H[0-9]{1,4})_?([sub|co[nvs{0,2}|rr|nc]+]?)?_(m[0-9]{1,3}[_w?[0-9]{0,3}]?)", args.x, M|I)
41 +
# Run metadata. These values used to be parsed from the input file name (see
# the commented-out regular expression around this block) and are hard-coded
# for now.
corpus = "spanish-seg"  # source.group(1)
representation = "w2v"  # source.group(3)
dimensions = "300"  # source.group(4)[1:]
min_count = "1"  # source.group(6)[1:]
# Label (dictionary) file: the vectors path with its extension replaced by
# ".dic". Only the final extension is stripped, so paths containing other
# dots (e.g. "./data/vectors.mtx") are handled correctly.
term_name = os.path.splitext(args.x)[0] + ".dic"  # source.group(2)
47 +
48 +#except IndexError:
49 +# print ("\nError in the filename. One or more indicators are missing. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimen..")
50 +# for i in range(6):
51 +# try:
52 +# print (source.group(i))
53 +# except IndexError:
54 +# print (":>> Unparsed: %s" % (i))
55 +# pass
56 +# exit()
57 +#except AttributeError:
58 +# print ("\nFatal Error in the filename. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimendions>_<operation>*_<mminimum_...")
59 +# for i in range(6):
60 +# try:
61 +# print (source.group(i))
62 +# except AttributeError:
63 +# print (":>> Unparsed: %s" % (i))
64 +# pass
65 +# exit()
66 +
## Loading files
# Directory and extension-less base name of the vectors file. The noun-phrase
# and PoS-tag label files are looked up next to the vectors file.
# NOTE(review): assumes those files share the vectors file's base name --
# confirm against the actual data layout.
route = os.path.dirname(args.x)
label_stem = os.path.splitext(os.path.basename(args.x))[0]

# Pick the label file and the suffix `t` used later in the figure name.
# As before, -n takes precedence over -t when both are given.
if not args.t and not args.n:
    label_file = term_name
    t = ""
elif args.n:
    label_file = os.path.join(route, "%s.txt.phr" % label_stem)
    t = "_phr"
else:
    label_file = os.path.join(route, "%s.tags" % label_stem)
    t = "_tags"

# One label per line, stripped of surrounding whitespace.
with open(label_file) as f:
    snippets = [cleaner(line) for line in f]

# TODO: Parse the snippets with the correct vectors file.
X = np.loadtxt(args.x)

# Labels are paired with vectors positionally further down; a length mismatch
# would silently drop or mislabel rows, so make it visible.
if X.ndim == 2 and X.shape[0] != len(snippets):
    logging.warning("Number of labels (%d) differs from number of vectors (%d)",
                    len(snippets), X.shape[0])
# Dispatch on the clusterer selector given with -c.
if args.c.startswith("km"):
    # ---- k-means ("km": random init with restarts, "km++": k-means++) ----
    if args.k is None:
        parser.error("-k (number of clusters) is required for k-means clusterers")
    num_clusters = int(args.k)

    # KMeans is Euclidean only. For "-m cosine" the rows are scaled to unit
    # length, where squared Euclidean distance equals 2 * (1 - cosine
    # similarity), and the silhouette is scored with the cosine metric.
    if args.m == "cosine":
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        norms[norms == 0] = 1.0  # leave all-zero rows untouched
        data = X / norms
        silhouette_metric = "cosine"
    else:
        data = X
        silhouette_metric = "euclidean"

    if "++" in args.c:
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    verbose=1, n_init=1)
        km.fit(data)
        silhouette = metrics.silhouette_score(data, km.labels_,
                                              metric=silhouette_metric,
                                              sample_size=1000)
        clusters = km.labels_.tolist()
    else:
        # Number of trials for maximizing the silhouette coefficient; at least
        # one trial is always run so that a partition exists afterwards.
        n_trials = max(1, int(args.N)) if args.N is not None else 1
        clusters = None
        silhouette = -1
        for _ in range(n_trials):
            km = KMeans(n_clusters=num_clusters, init='random', max_iter=100,
                        verbose=1, n_init=1)
            km.fit(data)
            coeff = metrics.silhouette_score(data, km.labels_,
                                             metric=silhouette_metric,
                                             sample_size=1000)
            # Keep the partition with the best silhouette seen so far.
            if clusters is None or silhouette < coeff:
                clusters = km.labels_.tolist()
                silhouette = coeff

    # Pair every label with its cluster id and order the pairs by cluster id.
    definitions = sorted(zip(clusters, snippets), key=lambda x: x[0])

    # Write "<cluster>\t<label>" lines from the highest cluster id down (the
    # order the previous pop()-based loop produced). Writing to stdout
    # directly avoids the doubled newlines print() used to add, and the
    # output file is always closed.
    out = open(args.o, 'w') if args.o else sys.stdout
    try:
        for c, s in reversed(definitions):
            out.write("%d\t%s\n" % (c, s))
    finally:
        if out is not sys.stdout:
            out.close()

    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Number of clusters: %d" % num_clusters)
    print()

elif args.c.startswith("agg"):
    # ---- agglomerative (Ward) clustering rendered as a dendrogram ----
    dist = 1 - cosine_similarity(X)
    # NOTE(review): ward() receives a square matrix here; SciPy documents a
    # 2-D argument as a set of observation vectors rather than distances --
    # confirm this is the intended input.
    linkage_matrix = ward(dist)
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    dendrogram(linkage_matrix, orientation="right", labels=list(snippets), ax=ax)

    # Hide the x-axis ticks and their labels. Booleans are used because the
    # strings 'off' are truthy and are not accepted as "disabled" by current
    # Matplotlib releases.
    plt.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # labels along the bottom edge are off

    plt.tight_layout()
    # Save the figure; the name combines label file, label kind, corpus and
    # dimensionality.
    plt.savefig("ward_clusters_%s%s_%s_H%s.png" % (term_name, t, corpus, dimensions), dpi=200)

else:
    # Previously an unknown selector made the script exit without doing anything.
    parser.error("Unknown clusterer %r (expected one of: km, km++, aggwr)" % args.c)