Ignacio Arroyo Fernández

added first clusters in Spanish

for n in {4..30}; do python sent_clustering3.py -x data/w2v_corpus-spanish-seg_H300_w10.mtx -m cosine -c km -N 10 -k "$n" -o results/spanish-seg_w2v-cos_H300_NC0"$n".cls; done
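The loop above sweeps the number of clusters k from 4 to 30, running the script with -N 10 random restarts per value and writing one cluster file per k. As a minimal sketch, not part of the commit, the same model selection can be done in a single Python pass (the matrix path and the k range come from the command above; the normalization step mirrors the script's cosine handling, and everything else is illustrative):

# Sketch: pick k by maximizing the silhouette score over k = 4..30,
# the same range as the shell loop above. Assumes a dense,
# whitespace-separated matrix with one embedding per row, as read by
# the script's np.loadtxt call.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.loadtxt("data/w2v_corpus-spanish-seg_H300_w10.mtx")
X = X / np.linalg.norm(X, axis=1, keepdims=True)  # unit norm ~ cosine geometry

best_k, best_score = None, -1.0
for k in range(4, 31):
    labels = KMeans(n_clusters=k, n_init=10).fit_predict(X)
    score = silhouette_score(X, labels, sample_size=1000)
    if score > best_score:
        best_k, best_score = k, score
print("best k = %d (silhouette = %.3f)" % (best_k, best_score))

The committed script, sent_clustering3.py: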
# -*- coding: utf-8 -*-
from sklearn.cluster import KMeans
from sklearn import metrics
from scipy.cluster.hierarchy import ward, dendrogram
from scipy.spatial.distance import squareform
from sklearn.metrics.pairwise import cosine_similarity
import logging
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import sys
import numpy as np
from argparse import ArgumentParser as ap
import os
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
parser = ap(description='This script clusters sentence embeddings using k-means or Ward agglomerative clustering.')
parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
parser.add_argument("-c", help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)
parser.add_argument("-k", help="Number of clusters", metavar="k_clusters")
parser.add_argument("-N", help="Number of trials for maximize Silhueltte", metavar="n_trials")
parser.add_argument("-t", default=False, action="store_true", help="Toggle if labels are PoS tags instead of snippets.")
parser.add_argument("-n", default=False, action="store_true", help="Toggle if labels are NounPhrases instead of snippets.")
args = parser.parse_args()
min_show_length = 100
if args.m=="cosine":
from sklearn.metrics.pairwise import cosine_distances as cos
KMeans.euclidean_distances=cos
def cleaner(line): # The default is the average sentence length in English
return line.strip()#[:min_show_length]
# NOTE: parsing the corpus parameters out of the input filename (notation:
# <vectors|pairs>_<source_corpus>_<model_representation>_<dimensions>_...) is
# disabled; the values for the Spanish word2vec run are hard-coded below.
corpus = "spanish-seg"
representation = "w2v"
dimensions = "300"
min_count = "1"
base = os.path.splitext(args.x)[0]
term_name = base + ".dic"  # labels file: one snippet per row of the matrix
## Loading files (one label per row of the input matrix)
if not args.t and not args.n:
    with open(term_name) as f:
        snippets = list(map(cleaner, f.readlines()))
    t = ""
elif args.n:
    with open(base + ".txt.phr") as f:
        snippets = list(map(cleaner, f.readlines()))
    t = "_phr"
else:
    with open(base + ".tags") as f:
        snippets = list(map(cleaner, f.readlines()))
    t = "_tags"
# TODO: check that the snippet labels are aligned with the vectors file.
X = np.loadtxt(args.x)
if use_cosine:
    # Unit-normalize each row so that Euclidean k-means clusters by cosine distance.
    X = X / np.linalg.norm(X, axis=1, keepdims=True)
if args.c.startswith("km"):
num_clusters = int(args.k)
if "++" in args.c:
km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
verbose=1, n_init=1)
km.fit(X)
coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
clusters=km.labels_.tolist()
else:
max_silhouette = -1
silhouette = -1
for tr in range(int(args.N)): # Number of trials for maximize Silhueltte
km = KMeans(n_clusters=num_clusters, init='random', max_iter=100,
verbose=1, n_init=1, n_jobs=4)
km.fit(X)
coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
#print ("Partial Silhuette: %f" % coeff)
if silhouette < coeff:
clusters=km.labels_.tolist()
silhouette = coeff
    # Write one "<cluster_id>\t<snippet>" line per item, grouped by cluster.
    definitions = sorted(zip(clusters, snippets), key=lambda x: x[0], reverse=True)
    out = open(args.o, 'w') if args.o else sys.stdout
    for c, s in definitions:
        out.write("%d\t%s\n" % (c, s))
    if args.o:
        out.close()
    print("Silhouette Coefficient: %0.3f" % silhouette)
    print("Number of clusters: %d" % num_clusters)
    print()
elif args.c.startswith("agg"):
dist = 1 - cosine_similarity(X)
linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
fig, ax = plt.subplots(figsize=(15, 20)) # set size
ax = dendrogram(linkage_matrix, orientation="right", labels=list(snippets));
plt.tick_params(\
axis= 'x', # changes apply to the x-axis
which='both', # both major and minor ticks are affected
bottom='off', # ticks along the bottom edge are off
top='off', # ticks along the top edge are off
labelbottom='off')
plt.tight_layout() #show plot with tight layout
#uncomment below to save figure
plt.savefig("ward_clusters_%s%s_%s_H%s.png" % (term_name, t, corpus, dimensions), dpi=200) #save figure as ward_clusters
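A note on the cosine handling above: for unit vectors u and v, ||u - v||^2 = 2 * (1 - cos(u, v)), so Euclidean k-means on L2-normalized rows clusters by cosine distance. A minimal numeric check of that identity (the dimension 300 mirrors the embeddings; the rest is illustrative):

# Sanity check: ||u - v||^2 == 2 * (1 - cos(u, v)) for unit vectors u, v.
import numpy as np

rng = np.random.default_rng(0)
u, v = rng.normal(size=300), rng.normal(size=300)
u, v = u / np.linalg.norm(u), v / np.linalg.norm(v)

assert np.isclose(np.sum((u - v) ** 2), 2.0 * (1.0 - np.dot(u, v)))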