Showing 18 changed files with 139 additions and 0 deletions.
data/w2v_corpus-spanish-seg_H300_w10.dic
0 → 100644
This diff could not be displayed because it is too large.
data/w2v_corpus-spanish-seg_H300_w10.mtx
0 → 100644
This diff could not be displayed because it is too large.
multi_clustering.sh
0 → 100644
# Sweep the number of clusters k from 4 to 30, running the k-means sentence
# clusterer once per k and writing one .cls result file per run.
for n in {4..30}; do
    python sent_clustering3.py \
        -x data/w2v_corpus-spanish-seg_H300_w10.mtx \
        -m cosine -c km -N 10 -k "$n" \
        -o results/spanish-seg_w2v-cos_H300_NC0"$n".cls
done
nohup.out
0 → 100644
This diff is collapsed. Click to expand it.
results/spanish-seg_w2v-cos_H300_NC010.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC011.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC012.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC013.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC014.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC015.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC016.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC017.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC05.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC06.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC07.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC08.cls
0 → 100644
This diff could not be displayed because it is too large.
results/spanish-seg_w2v-cos_H300_NC09.cls
0 → 100644
This diff could not be displayed because it is too large.
sent_clustering3.py
0 → 100644
| 1 | +# -*- coding: utf-8 -*- | ||
| 2 | +from sklearn.cluster import KMeans | ||
| 3 | +from sklearn import metrics | ||
| 4 | +from scipy.cluster.hierarchy import ward, dendrogram | ||
| 5 | +from sklearn.metrics.pairwise import cosine_similarity | ||
| 6 | +from re import search, M, I | ||
| 7 | +import logging | ||
| 8 | +import matplotlib as mpl | ||
| 9 | +mpl.use('Agg') | ||
| 10 | +import matplotlib.pyplot as plt | ||
| 11 | +import sys | ||
| 12 | +import numpy as np | ||
| 13 | +from argparse import ArgumentParser as ap | ||
| 14 | +import os | ||
| 15 | + | ||
# Timestamped INFO-level logging (gensim/sklearn verbose output is separate).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

# Command-line interface.  -x (vector matrix) and -c (clusterer) are mandatory;
# -k and -N are required in practice whenever -c is a k-means variant — they
# are converted with int() further down and would raise TypeError if omitted.
parser = ap(description='This script clusters word embeddings by using k-means.')
parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
parser.add_argument("-c", help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)
parser.add_argument("-k", help="Number of clusters", metavar="k_clusters")
parser.add_argument("-N", help="Number of trials for maximize Silhueltte", metavar="n_trials")
parser.add_argument("-t", default=False, action="store_true", help="Toggle if labels are PoS tags instead of snippets.")
parser.add_argument("-n", default=False, action="store_true", help="Toggle if labels are NounPhrases instead of snippets.")
args = parser.parse_args()
# Maximum label length for the (currently disabled) truncation in cleaner().
min_show_length = 100

if args.m=="cosine":
    from sklearn.metrics.pairwise import cosine_distances as cos
    # NOTE(review): this monkey-patch only creates an unused class attribute.
    # scikit-learn's KMeans does not dispatch distance computation through
    # `KMeans.euclidean_distances`, so clustering almost certainly still uses
    # Euclidean distance even with -m cosine — confirm against the installed
    # sklearn version (L2-normalizing X beforehand would be the usual fix).
    KMeans.euclidean_distances=cos
def cleaner(line):
    """Strip surrounding whitespace (including the newline) from one label line.

    A historical truncation to ``min_show_length`` characters (roughly the
    average sentence length in English) was disabled; lines are kept whole.

    Parameters
    ----------
    line : str
        One raw line read from the snippet/label file.

    Returns
    -------
    str
        The stripped line.
    """
    return line.strip()
| 38 | + | ||
# Experiment metadata.  A regex that tried to parse corpus/representation/
# dimension indicators out of the input filename proved unreliable and was
# retired; the values are hard-coded for the current experiment (they match
# the filenames produced by multi_clustering.sh).
corpus = "spanish-seg"
representation = "w2v"
dimensions = "300"
min_count = "1"
# The snippet dictionary sits next to the vector file: same basename, .dic
# extension.  splitext is robust to extra dots in directory names, unlike
# the previous `args.x.split(".")[0]`.
term_name = os.path.splitext(args.x)[0] + ".dic"
| 66 | + | ||
## Loading files
# Directory of the vector file; companion label files are expected alongside.
# (This line was previously commented out, which made the -n and -t branches
# below crash with NameError on `route`.)
route = os.path.dirname(args.x)
if not args.t and not args.n:
    # Default: one snippet (sentence) per line, row-aligned with the vectors.
    with open(term_name) as f:
        snippets = [cleaner(line) for line in f]
    t = ""
elif args.n:
    # Noun-phrase labels.
    # NOTE(review): term_name already contains the directory, so prefixing
    # `route` again looks wrong ("data/data/...") — verify these paths.
    with open("%s/%s.txt.phr" % (route, term_name)) as f:
        snippets = [cleaner(line) for line in f]
    t = "_phr"
else:
    # PoS-tag labels (same path concern as the -n branch above).
    with open("%s/%s.tags" % (route, term_name)) as f:
        snippets = [cleaner(line) for line in f]
    t = "_tags"
#TODO: Parse the snippets with correct vectors file.
X = np.loadtxt(args.x)
if args.c.startswith("km"):
    # ---- k-means clustering: 'km' (random restarts) or 'km++' ----
    num_clusters = int(args.k)

    if "++" in args.c:
        # Single k-means++ run; its silhouette is reported as-is.
        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
                    verbose=1, n_init=1)
        km.fit(X)
        # Assigning `silhouette` here fixes a NameError: the original only
        # set it on the random-init path, so the final report crashed for km++.
        silhouette = metrics.silhouette_score(X, km.labels_, sample_size=1000)
        clusters = km.labels_.tolist()
    else:
        # Random-init k-means repeated -N times; keep the labeling with the
        # best (largest) silhouette coefficient.
        silhouette = -1
        for trial in range(int(args.N)):
            # n_jobs was dropped: it is removed in scikit-learn >= 0.25 and
            # only affected parallelism, never the result.
            km = KMeans(n_clusters=num_clusters, init='random', max_iter=100,
                        verbose=1, n_init=1)
            km.fit(X)
            coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
            if silhouette < coeff:
                clusters = km.labels_.tolist()
                silhouette = coeff

    # Pair each snippet with its cluster id, sorted by cluster id.
    definitions = sorted(zip(clusters, snippets), key=lambda pair: pair[0])

    # Write "cluster<TAB>snippet" lines, largest cluster id first (same order
    # as the original pop()-from-the-end loop).  Using sys.stdout.write keeps
    # stdout output byte-identical to file output (print added a blank line
    # after every record), and the output file is now reliably closed.
    out = open(args.o, 'w') if args.o else sys.stdout
    try:
        for c, s in reversed(definitions):
            out.write("%d\t%s\n" % (c, s))
    finally:
        if args.o:
            out.close()

    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Number of clusters: %d" % num_clusters)
    print()
| 122 | + | ||
elif args.c.startswith("agg"):
    # ---- agglomerative (Ward) clustering, rendered as a dendrogram ----
    # Pairwise cosine distance matrix (square, n x n).
    dist = 1 - cosine_similarity(X)
    # NOTE(review): scipy's ward() interprets a 2-D argument as an
    # observation matrix, not as a precomputed square distance matrix —
    # confirm this is intended; squareform(dist) would pass real distances.
    linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
    fig, ax = plt.subplots(figsize=(15, 20)) # set size
    # Dendrogram leaves are labeled with the snippets loaded above.
    ax = dendrogram(linkage_matrix, orientation="right", labels=list(snippets));

    # NOTE(review): string 'off' for these tick options is rejected by modern
    # matplotlib (booleans expected) — verify against the installed version.
    plt.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',       # both major and minor ticks are affected
        bottom='off',       # ticks along the bottom edge are off
        top='off',          # ticks along the top edge are off
        labelbottom='off')

    plt.tight_layout() #show plot with tight layout
#uncomment below to save figure
    # NOTE(review): term_name includes the input directory, so the saved
    # filename contains path separators — confirm the target directory exists.
    plt.savefig("ward_clusters_%s%s_%s_H%s.png" % (term_name, t, corpus, dimensions), dpi=200) #save figure as ward_clusters
-
Please register or login to post a comment