Showing 18 changed files with 139 additions and 0 deletions.
data/w2v_corpus-spanish-seg_H300_w10.dic
0 → 100644
This diff could not be displayed because it is too large.
data/w2v_corpus-spanish-seg_H300_w10.mtx
0 → 100644
This diff could not be displayed because it is too large.
multi_clustering.sh
0 → 100644
# Sweep the number of clusters: run sent_clustering3.py once per k in 4..30,
# writing each labeling to its own file under results/.
for n in {4..30}; do
    python sent_clustering3.py \
        -x data/w2v_corpus-spanish-seg_H300_w10.mtx \
        -m cosine \
        -c km \
        -N 10 \
        -k "$n" \
        -o results/spanish-seg_w2v-cos_H300_NC0"$n".cls
done
nohup.out
0 → 100644
This diff is collapsed. Click to expand it.
results/spanish-seg_w2v-cos_H300_NC010.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC011.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC012.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC013.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC014.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC015.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC016.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC017.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC05.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC06.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC07.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC08.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC09.cls
0 → 100644
This diff could not be displayed because it is too large.
sent_clustering3.py
0 → 100644
1 | +# -*- coding: utf-8 -*- | ||
2 | +from sklearn.cluster import KMeans | ||
3 | +from sklearn import metrics | ||
4 | +from scipy.cluster.hierarchy import ward, dendrogram | ||
5 | +from sklearn.metrics.pairwise import cosine_similarity | ||
6 | +from re import search, M, I | ||
7 | +import logging | ||
8 | +import matplotlib as mpl | ||
9 | +mpl.use('Agg') | ||
10 | +import matplotlib.pyplot as plt | ||
11 | +import sys | ||
12 | +import numpy as np | ||
13 | +from argparse import ArgumentParser as ap | ||
14 | +import os | ||
15 | + | ||
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Command-line interface.  `-k` and `-N` are kept as strings and converted
# with int() at their point of use, preserving the original interface.
parser = ap(description='This script clusters word embeddings by using k-means.')
parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
parser.add_argument("-c", help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)
parser.add_argument("-k", help="Number of clusters", metavar="k_clusters")
# fix: help text typos ("for maximize Silhueltte")
parser.add_argument("-N", help="Number of trials to maximize the Silhouette score", metavar="n_trials")
parser.add_argument("-t", default=False, action="store_true", help="Toggle if labels are PoS tags instead of snippets.")
parser.add_argument("-n", default=False, action="store_true", help="Toggle if labels are NounPhrases instead of snippets.")
args = parser.parse_args()

min_show_length = 100  # max snippet length; only used by the disabled slice in cleaner()

if args.m == "cosine":
    from sklearn.metrics.pairwise import cosine_distances as cos
    # NOTE(review): patching a class attribute is unlikely to change the
    # metric KMeans actually uses (sklearn calls a module-level
    # euclidean_distances function internally) -- confirm this has any effect.
    KMeans.euclidean_distances = cos
def cleaner(line):
    """Strip surrounding whitespace from one raw corpus line.

    Truncation to min_show_length (roughly the average English sentence
    length) is currently disabled.
    """
    stripped = line.strip()
    return stripped  # re-enable truncation with: stripped[:min_show_length]
38 | + | ||
# Experiment metadata.  These used to be parsed out of the input filename
# with a regex (<vectors|pairs>_<corpus>_<model>_<dimensions>_...); that
# parser (and its error reporting) was retired, so the values are
# hard-coded for the current experiment.
corpus = "spanish-seg"
representation = "w2v"
dimensions = "300"
min_count = "1"
# The vocabulary file sits next to the vector matrix: same stem, ".dic"
# extension.  splitext only drops the final extension, unlike the previous
# args.x.split(".")[0], which truncated at the FIRST dot and therefore broke
# for paths containing dots (e.g. "./data/x.mtx" -> "").
term_name = os.path.splitext(args.x)[0] + ".dic"
## Loading files
# fix: `route` was commented out but is still referenced by the -n/-t
# branches below, which raised NameError.  It is the directory of the
# input matrix, where the label files are expected to live.
route = os.path.dirname(args.x)
if not args.t and not args.n:
    # Default: one raw snippet label per row of the vector matrix.
    with open(term_name) as f:
        snippets = list(map(cleaner, f.readlines()))
    t = ""
elif args.n:
    # NOTE(review): term_name now contains the full path plus a ".dic"
    # suffix, so these composed paths look inconsistent -- confirm the
    # expected layout of the .txt.phr / .tags label files.
    with open("%s/%s.txt.phr" % (route, term_name)) as f:
        snippets = list(map(cleaner, f.readlines()))
    t = "_phr"
else:
    with open("%s/%s.tags" % (route, term_name)) as f:
        snippets = list(map(cleaner, f.readlines()))
    t = "_tags"
#TODO: Parse the snippets with the correct vectors file.
X = np.loadtxt(args.x)  # dense matrix: one embedding per row
if args.c.startswith("km"):
    num_clusters = int(args.k)

    if "++" in args.c:
        # Single run with k-means++ seeding.
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    verbose=1, n_init=1)
        km.fit(X)
        coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
        # fix: `silhouette` was never assigned on this branch, so the
        # report below raised NameError.
        silhouette = coeff
        clusters = km.labels_.tolist()
    else:
        # Random restarts; keep the labeling with the best Silhouette score.
        silhouette = -1
        clusters = None
        for tr in range(int(args.N)):  # number of trials to maximize Silhouette
            # n_jobs dropped: it was removed from modern sklearn and never
            # affected the result, only the parallelism.
            km = KMeans(n_clusters=num_clusters, init='random', max_iter=100,
                        verbose=1, n_init=1)
            km.fit(X)
            # NOTE(review): sample_size=1000 subsamples without a fixed
            # random_state, so this score is not reproducible across runs.
            coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
            if silhouette < coeff:
                clusters = km.labels_.tolist()
                silhouette = coeff
    definitions = sorted(zip(clusters, snippets), key=lambda x: x[0])

    # Emit "<cluster>\t<snippet>" lines, highest cluster id first (matching
    # the original pop-from-the-end order).  The output file is now closed
    # deterministically; it previously leaked.
    if args.o:
        with open(args.o, 'w') as out:
            for c, s in reversed(definitions):
                out.write("%d\t%s\n" % (c, s))
    else:
        for c, s in reversed(definitions):
            print("%d\t%s\n" % (c, s))

    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Number of clusters: %d" % num_clusters)
    print()

elif args.c.startswith("agg"):
    # Ward agglomerative clustering over cosine distances, reported only as
    # a dendrogram image (no cluster file is written on this path).
    dist = 1 - cosine_similarity(X)
    linkage_matrix = ward(dist)  # linkage from the pre-computed distances
    fig, ax = plt.subplots(figsize=(15, 20))  # set size
    ax = dendrogram(linkage_matrix, orientation="right", labels=list(snippets))

    plt.tick_params(
        axis='x',        # changes apply to the x-axis
        which='both',    # both major and minor ticks are affected
        bottom=False,    # fix: modern matplotlib expects booleans, not 'off'
        top=False,
        labelbottom=False)

    plt.tight_layout()  # show plot with tight layout
    plt.savefig("ward_clusters_%s%s_%s_H%s.png" % (term_name, t, corpus, dimensions), dpi=200)
-
Please register or log in to post a comment.