# sent_clustering3.py
# -*- coding: utf-8 -*-
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.metrics.pairwise import cosine_similarity
from re import search, M, I  # used only by the disabled filename parser below
import logging
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import sys
import numpy as np
from argparse import ArgumentParser as ap
import os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
parser = ap(description='This script clusters word embeddings using k-means or Ward agglomerative clustering.')
parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
parser.add_argument("-c", help="Clusterer = {km, km++, agg}", metavar="clusterer", required=True)
parser.add_argument("-k", help="Number of clusters", metavar="k_clusters")
parser.add_argument("-N", help="Number of trials for maximizing the silhouette score", metavar="n_trials", default="1")
parser.add_argument("-t", default=False, action="store_true", help="Toggle if labels are PoS tags instead of snippets.")
parser.add_argument("-n", default=False, action="store_true", help="Toggle if labels are noun phrases instead of snippets.")
args = parser.parse_args()
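
# Example invocation (the file names here are hypothetical, for illustration only):
#   python sent_clustering3.py -x vectors_spanish-seg_w2v_H300_m1.vec -c km++ -k 10 -o clusters.txt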
min_show_length = 100  # roughly the average sentence length in English; used to trim long labels

def cleaner(line):
    return line.strip()  # [:min_show_length]
# Filename parsing is disabled: the regex below used to pull the corpus,
# representation, dimensionality and minimum count out of the input file
# name, but those values are hard-coded for now.
#source = search(r"(?:vectors|pairs)_([A-Za-z]+[\-A-Za-z0-9]+)_?(T[0-9]{2,3}_C[1-9]_[0-9]{2}|d\d+t|\w+)?_([d2v|w2v|fstx|coocc\w*|doc\w*]*)_(H[0-9]{1,4})_?([sub|co[nvs{0,2}|rr|nc]+]?)?_(m[0-9]{1,3}[_w?[0-9]{0,3}]?)", args.x, M|I)
corpus = "spanish-seg"   # source.group(1)
representation = "w2v"   # source.group(3)
dimensions = "300"       # source.group(4)[1:]
min_count = "1"          # source.group(6)[1:]
route = os.path.dirname(args.x)
term_name = os.path.splitext(os.path.basename(args.x))[0]  # source.group(2); stem of the input file
# The disabled IndexError/AttributeError handlers that accompanied the regex
# printed each unparsed group and exited with:
#   "Error in the filename. One or more indicators are missing. Notation:
#    <vectors|pairs>_<source_corpus>_<model_representation>_<dimensions>_<operation>*_<minimum_count>"
## Loading files
if not args.t and not args.n:
    with open(os.path.join(route, term_name + ".dic")) as f:
        snippets = list(map(cleaner, f.readlines()))
    t = ""
elif args.n:
    with open(os.path.join(route, term_name + ".txt.phr")) as f:
        snippets = list(map(cleaner, f.readlines()))
    t = "_phr"
else:
    with open(os.path.join(route, term_name + ".tags")) as f:
        snippets = list(map(cleaner, f.readlines()))
    t = "_tags"
# TODO: pair the snippets with the correct vectors file.
X = np.loadtxt(args.x)
if args.m == "cosine":
    # Spherical k-means approximation: on unit-normalized rows, squared
    # Euclidean distance equals 2 - 2*cos, so plain k-means then clusters
    # by cosine distance. (The old KMeans.euclidean_distances monkey-patch
    # had no effect: scikit-learn never reads that attribute.)
    from sklearn.preprocessing import normalize
    X = normalize(X)
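
# Consistency check (an addition, not in the original script): labels and
# vectors are assumed to align row-for-row, so a length mismatch almost
# always means the wrong .dic/.txt.phr/.tags file was read.
if len(snippets) != X.shape[0]:
    sys.exit("Label/vector mismatch: %d labels vs. %d vectors" % (len(snippets), X.shape[0]))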
if args.c.startswith("km"):
    num_clusters = int(args.k)
    if "++" in args.c:
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    verbose=1, n_init=1)
        km.fit(X)
        silhouette = metrics.silhouette_score(X, km.labels_,
                                              sample_size=min(1000, X.shape[0]))
        clusters = km.labels_.tolist()
    else:
        silhouette = -1
        for tr in range(int(args.N)):  # number of restarts for maximizing the silhouette
            km = KMeans(n_clusters=num_clusters, init='random', max_iter=100,
                        verbose=1, n_init=1)
            km.fit(X)
            coeff = metrics.silhouette_score(X, km.labels_,
                                             sample_size=min(1000, X.shape[0]))
            #print("Partial silhouette: %f" % coeff)
            if silhouette < coeff:  # keep the best-scoring partition
                clusters = km.labels_.tolist()
                silhouette = coeff
    definitions = sorted(zip(clusters, snippets), key=lambda x: x[0])
    # Write one "<cluster>\t<snippet>" line per item, ascending by cluster id,
    # to the output file if -o was given, otherwise to stdout.
    f = open(args.o, 'w') if args.o else sys.stdout
    for c, s in definitions:
        f.write("%d\t%s\n" % (c, s))
    if args.o:
        f.close()
    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Number of clusters: %d" % num_clusters)
    print()
elif args.c.startswith("agg"):
    from scipy.spatial.distance import squareform
    dist = 1 - cosine_similarity(X)
    # Ward linkage over the precomputed cosine distances; scipy's ward()
    # expects the condensed upper-triangular form, so convert the square
    # matrix first (checks=False tolerates floating-point noise on the diagonal).
    linkage_matrix = ward(squareform(dist, checks=False))
    fig, ax = plt.subplots(figsize=(15, 20))  # set figure size
    dendrogram(linkage_matrix, orientation="right", labels=snippets, ax=ax)
    plt.tick_params(
        axis='x',           # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom=False,       # ticks along the bottom edge are off
        top=False,          # ticks along the top edge are off
        labelbottom=False)  # no tick labels along the bottom
    plt.tight_layout()
    plt.savefig("ward_clusters_%s%s_%s_H%s.png" % (term_name, t, corpus, dimensions), dpi=200)
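
    # Optional follow-up (a sketch, not part of the original script): flat
    # cluster assignments can be cut from the same ward tree with scipy's
    # fcluster, e.g.:
    #   from scipy.cluster.hierarchy import fcluster
    #   flat = fcluster(linkage_matrix, t=10, criterion='maxclust')  # 10 groups, arbitrary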