added fisrt clusters in spanish

Ignacio Arroyo Fernández
Commit c38428731e355771c6e8f60d321adfe902fda43b c3842873 1 parent 0b068a4a
Showing 18 changed files with 139 additions and 0 deletions
data/w2v_corpus-spanish-seg_H300_w10.dic
data/w2v_corpus-spanish-seg_H300_w10.mtx
multi_clustering.sh
nohup.out
results/spanish-seg_w2v-cos_H300_NC010.cls
results/spanish-seg_w2v-cos_H300_NC011.cls
results/spanish-seg_w2v-cos_H300_NC012.cls
results/spanish-seg_w2v-cos_H300_NC013.cls
results/spanish-seg_w2v-cos_H300_NC014.cls
results/spanish-seg_w2v-cos_H300_NC015.cls
results/spanish-seg_w2v-cos_H300_NC016.cls
results/spanish-seg_w2v-cos_H300_NC017.cls
results/spanish-seg_w2v-cos_H300_NC05.cls
results/spanish-seg_w2v-cos_H300_NC06.cls
results/spanish-seg_w2v-cos_H300_NC07.cls
results/spanish-seg_w2v-cos_H300_NC08.cls
results/spanish-seg_w2v-cos_H300_NC09.cls
sent_clustering3.py
--- a/data/w2v_corpus-spanish-seg_H300_w10.dic 0 → 100644
View file @c384287
+++ b/data/w2v_corpus-spanish-seg_H300_w10.dic 0 → 100644
View file @c384287
--- a/data/w2v_corpus-spanish-seg_H300_w10.mtx 0 → 100644
View file @c384287
+++ b/data/w2v_corpus-spanish-seg_H300_w10.mtx 0 → 100644
View file @c384287
--- a/multi_clustering.sh 0 → 100644
View file @c384287
+++ b/multi_clustering.sh 0 → 100644
View file @c384287
+for n in {4..30}; do python sent_clustering3.py -x data/w2v_corpus-spanish-seg_H300_w10.mtx -m cosine -c km -N 10 -k "$n" -o results/spanish-seg_w2v-cos_H300_NC0"$n".cls; done
--- a/nohup.out 0 → 100644
View file @c384287
+++ b/nohup.out 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC010.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC010.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC011.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC011.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC012.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC012.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC013.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC013.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC014.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC014.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC015.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC015.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC016.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC016.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC017.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC017.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC05.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC05.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC06.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC06.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC07.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC07.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC08.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC08.cls 0 → 100644
View file @c384287
--- a/results/spanish-seg_w2v-cos_H300_NC09.cls 0 → 100644
View file @c384287
+++ b/results/spanish-seg_w2v-cos_H300_NC09.cls 0 → 100644
View file @c384287
--- a/sent_clustering3.py 0 → 100644
View file @c384287
+++ b/sent_clustering3.py 0 → 100644
View file @c384287
+# -*- coding: utf-8 -*-
+from sklearn.cluster import KMeans
+from sklearn import metrics
+from scipy.cluster.hierarchy import ward, dendrogram
+from sklearn.metrics.pairwise import cosine_similarity
+from re import search, M, I
+import logging
+import matplotlib as mpl
+mpl.use('Agg')
+import matplotlib.pyplot as plt
+import sys
+import numpy as np
+from argparse import ArgumentParser as ap
+import os
+
+logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
+                    level=logging.INFO)
+
+parser = ap(description='This script clusters word embeddings by using k-means.')
+parser.add_argument("-x", help="Input file name (vectors)", metavar="input_file", required=True)
+parser.add_argument("-o", help="Output file name (clusters)", metavar="output_file")
+parser.add_argument("-m", help="Clustering metric [cosine|euclidean]", metavar="metric", default="euclidean")
+parser.add_argument("-c", help="Clusterer = {km, km++, aggwr}", metavar="clusterer", required=True)
+parser.add_argument("-k", help="Number of clusters", metavar="k_clusters")
+parser.add_argument("-N", help="Number of trials for maximize Silhueltte", metavar="n_trials")
+parser.add_argument("-t", default=False, action="store_true", help="Toggle if labels are PoS tags instead of snippets.")
+parser.add_argument("-n", default=False, action="store_true", help="Toggle if labels are NounPhrases instead of snippets.")
+args = parser.parse_args()
+
+min_show_length = 100
+
+if args.m=="cosine":
+    from sklearn.metrics.pairwise import cosine_distances as cos
+    KMeans.euclidean_distances=cos
+
+def cleaner(line): # The default is the average sentence length in English
+    return line.strip()#[:min_show_length]
+
+#try:
+# source = search(r"(?:vectors|pairs)_([A-Za-z]+[\-A-Za-z0-9]+)_?(T[0-9]{2,3}_C[1-9]_[0-9]{2}|d\d+t|\w+)?_([d2v|w2v|fstx|coocc\w*|doc\w*]*)_(H[0-9]{1,4})_?([sub|co[nvs{0,2}|rr|nc]+]?)?_(m[0-9]{1,3}[_w?[0-9]{0,3}]?)", args.x, M|I)
+
+corpus = "spanish-seg"  # source.group(1)
+representation = "w2v"  # source.group(3)
+dimensions = "300"  # source.group(4)[1:]
+min_count = "1"  #source.group(6)[1:]
+term_name = args.x.split(".")[0] + ".dic"  #source.group(2)
+
+#except IndexError:
+#    print ("\nError in the filename. One or more indicators are missing. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimen..")
+#    for i in range(6):
+#        try:
+#            print (source.group(i))
+#        except IndexError:
+#            print (":>> Unparsed: %s" % (i))
+#            pass
+#    exit()
+#except AttributeError:
+#    print ("\nFatal Error in the filename. Notation: <vectors|pairs>_<source_corpus>_<model_representation>_<dimendions>_<operation>*_<mminimum_...")
+#    for i in range(6):
+#        try:
+#            print (source.group(i))
+#        except AttributeError:
+#            print (":>> Unparsed: %s" % (i))
+#            pass
+#    exit()
+
+#route = os.path.dirname(args.x)
+## Loading files
+if not args.t and not args.n:
+    with open(term_name)  as f:  #"%s/%s.txt" % (route, term_name)) as f:
+        snippets = list(map(cleaner, f.readlines()))
+        t = ""
+elif args.n:
+    with open("%s/%s.txt.phr" % (route, term_name)) as f:
+        snippets = list(map(cleaner, f.readlines()))
+        t = "_phr"
+else:
+    with open("%s/%s.tags" % (route, term_name)) as f:
+        snippets = list(map(cleaner, f.readlines()))
+        t = "_tags"
+#TODO: Parse the snippets wit correct vectors file.
+X = np.loadtxt(args.x)
+if args.c.startswith("km"):
+    num_clusters = int(args.k)
+
+    if "++" in args.c:
+        km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=100,
+                                                            verbose=1, n_init=1)
+        km.fit(X)
+        coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
+        clusters=km.labels_.tolist()
+    else:
+        max_silhouette = -1
+        silhouette = -1
+        for tr in range(int(args.N)): # Number of trials for maximize Silhueltte
+             km = KMeans(n_clusters=num_clusters, init='random', max_iter=100,
+                                                verbose=1, n_init=1, n_jobs=4)
+             km.fit(X)
+             coeff = metrics.silhouette_score(X, km.labels_, sample_size=1000)
+             #print ("Partial Silhuette: %f" % coeff)
+             if silhouette < coeff:
+                 clusters=km.labels_.tolist()
+                 silhouette = coeff
+    definitions=sorted(list(zip(clusters, snippets)), key=lambda x: x[0])
+
+    if args.o:
+        f = open(args.o, 'w')
+        writer = f.write
+    else:
+        writer = print
+
+    while(1):
+        try:
+            c, s = definitions.pop()
+            writer ("%d\t%s\n" % (c, s))
+        except IndexError:
+            break
+
+    print("Silhouette Coefficient: %0.3f" % silhouette)
+    print("Number of clusters: %d" % num_clusters)
+    print()
+
+elif args.c.startswith("agg"):
+    dist = 1 - cosine_similarity(X)
+    linkage_matrix = ward(dist) #define the linkage_matrix using ward clustering pre-computed distances
+    fig, ax = plt.subplots(figsize=(15, 20)) # set size
+    ax = dendrogram(linkage_matrix, orientation="right", labels=list(snippets));
+
+    plt.tick_params(\
+    axis= 'x',          # changes apply to the x-axis
+    which='both',      # both major and minor ticks are affected
+    bottom='off',      # ticks along the bottom edge are off
+    top='off',         # ticks along the top edge are off
+    labelbottom='off')
+
+    plt.tight_layout() #show plot with tight layout
+#uncomment below to save figure
+    plt.savefig("ward_clusters_%s%s_%s_H%s.png" % (term_name, t, corpus, dimensions), dpi=200) #save figure as ward_clusters