Carlos-Francisco Méndez-Cruz

WISSE example

1 +# -*- coding: UTF-8 -*-
2 +import os
3 +from optparse import OptionParser
4 +import sys
5 +from time import time
6 +import matplotlib.pyplot as plt
7 +import re
8 +import numpy as np
9 +import matplotlib
10 +from sklearn.metrics.pairwise import cosine_similarity
11 +
# This script only writes plot files (it never opens a window), so select the
# non-interactive Agg backend. The previously requested Qt4Agg backend depends
# on PyQt4 and is no longer shipped by recent Matplotlib releases.
matplotlib.use('Agg')


__author__ = 'CMendezC'
16 +
17 +# Objective: Plot a heatmap of the pairwise cosine similarity
18 +# between the vectors read from a vector file
19 +
20 +# Parameters:
21 +# 1) --vectorPath Path to read vectors.
22 +# 2) --vectorFile File to read vectors.
23 +# 3) --outputPath Path to place plot files.
24 +# 4) --outputFormat Plot file format: pdf, png
25 +# 5) --absoluteValue Employ absolute values in vectors
26 +
27 +# Output:
28 +# 1) Plots
29 +
30 +# Execution:
31 +# C:\Anaconda3\python plot_Vectors_LSA.py
32 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa\plots
33 +# --vectorPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa
34 +# --vectorFile GU_lsa_local_vectors_2T.txt
35 +# --absoluteValue
36 +# --outputFormat pdf
37 +
38 +# C:\Anaconda3\python plot_Vectors_LSA.py --outputPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa\plots --vectorPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa --vectorFile GU_lsa_local_vectors_2T.txt --absoluteValue --outputFormat pdf
39 +# C:\Anaconda3\python plot_Vectors_LSA.py --outputPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa\plots --vectorPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa --vectorFile GU_lsa_local_vectors_10T.txt --absoluteValue --outputFormat pdf
40 +# C:\Anaconda3\python plot_Vectors_LSA.py --outputPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa\plots --vectorPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa --vectorFile GU_lsa_local_vectors_36T.txt --absoluteValue --outputFormat pdf
41 +# C:\Anaconda3\python plot_Vectors_LSA.py --outputPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa\plots --vectorPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa --vectorFile GU_lsa_local_vectors_88T.txt --absoluteValue --outputFormat pdf
42 +# C:\Anaconda3\python plot_Vectors_LSA.py --outputPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa\plots --vectorPath C:\Users\cmendezc\Documents\GENOMICAS\GENSOR_UNITS\wordEmbeddings\lsa --vectorFile GU_lsa_local_vectors_120T.txt --absoluteValue --outputFormat pdf
43 +
44 +###########################################################
45 +# MAIN PROGRAM #
46 +###########################################################
47 +
def build_option_parser():
    """Build the command-line parser for the heatmap script.

    Returns:
        An ``OptionParser`` declaring --vectorPath, --vectorFile,
        --outputPath, --outputFormat (pdf or png) and the --absoluteValue
        flag.
    """
    parser = OptionParser()
    parser.add_option("--vectorPath", dest="vectorPath",
                      help="Path to read vector file", metavar="PATH")
    parser.add_option("--vectorFile", dest="vectorFile",
                      help="File to read vectors", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place plot files", metavar="PATH")
    parser.add_option("--outputFormat", dest="outputFormat",
                      choices=('pdf', 'png'),
                      help="Plot output format", metavar="FORMAT")
    parser.add_option("--absoluteValue", default=False,
                      action="store_true", dest="absoluteValue",
                      help="Use vector absolute values?")
    return parser


def read_vectors(filePath, absoluteValue=False):
    """Read labelled vectors from a whitespace-separated text file.

    Each non-blank line holds a label followed by the vector components.
    Labels are truncated to their first 12 characters.

    Args:
        filePath: Path of the UTF-8 encoded vector file.
        absoluteValue: When true, store the absolute value of each component.

    Returns:
        A ``(labels, vectors)`` tuple of two parallel lists; every vector is
        a list of floats.

    Raises:
        ValueError: If a component is not a number, or if a vector does not
            have the same number of components as the first one read.
    """
    labels = []
    vectors = []
    expectedLen = None
    with open(filePath, mode="r", encoding='utf8') as iFile:
        # Iterate the file lazily instead of loading every line at once.
        for lineNumber, line in enumerate(iFile, start=1):
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            values = [float(elem) for elem in fields[1:]]
            if absoluteValue:
                values = [abs(value) for value in values]
            if expectedLen is None:
                expectedLen = len(values)
            elif len(values) != expectedLen:
                # A ragged matrix cannot be turned into a 2D array later on.
                raise ValueError(
                    "Line {}: vector '{}' has {} values, expected {}".format(
                        lineNumber, fields[0], len(values), expectedLen))
            labels.append(fields[0][:12])
            vectors.append(values)
    return labels, vectors


def build_plot_file_name(vectorFile, outputFormat, absoluteValue=False):
    """Derive the plot file name from the vector file name.

    The extension of ``vectorFile`` (whatever it is) is replaced by
    ``.<outputFormat>``, or by ``.abs.<outputFormat>`` when absolute values
    were used, so the result never equals the input file name.
    """
    baseName, _ = os.path.splitext(vectorFile)
    suffix = '.abs.' if absoluteValue else '.'
    return baseName + suffix + outputFormat


def plot_heatmap(similarityMatrix, listLabels, outputFile):
    """Draw ``similarityMatrix`` as a labelled heatmap and save it.

    Args:
        similarityMatrix: Square 2D array of pairwise similarities.
        listLabels: Tick labels, one per row/column of the matrix.
        outputFile: Destination path; its extension selects the file format.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.pcolor(similarityMatrix, cmap=plt.cm.Reds)
    fig.set_size_inches(16, 16)
    ax.set_frame_on(False)
    # Place each tick in the middle of its heatmap cell.
    ax.set_yticks(np.arange(similarityMatrix.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(similarityMatrix.shape[1]) + 0.5, minor=False)
    # Table-like layout: first row on top, column labels above the matrix.
    ax.invert_yaxis()
    ax.xaxis.tick_top()
    ax.set_xticklabels(listLabels, minor=False, size='xx-small')
    ax.set_yticklabels(listLabels, minor=False, size='xx-small')
    plt.xticks(rotation=90)
    ax.grid(False)
    # Hide the tick marks but keep the labels. The per-tick tick1On/tick2On
    # attributes are not available in current Matplotlib releases.
    ax.tick_params(axis='both', which='major',
                   bottom=False, top=False, left=False, right=False)
    fig.tight_layout()
    fig.savefig(outputFile)


def main(argv=None):
    """Run the script: read vectors, compute cosine similarity, plot heatmap.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = build_option_parser()
    (options, args) = parser.parse_args(argv)
    # parser.error() prints the usage message and exits by itself.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))
    missing = [name for name in ("vectorPath", "vectorFile",
                                 "outputPath", "outputFormat")
               if getattr(options, name) is None]
    if missing:
        # Fail early instead of crashing with a TypeError later on.
        parser.error("Missing required option(s): "
                     + ", ".join("--" + name for name in missing))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read vector file: " + str(options.vectorPath))
    print("File to read vectors: " + str(options.vectorFile))
    print("Path to write plots: " + str(options.outputPath))
    print("Plot output format: " + str(options.outputFormat))
    print("Use vector absolute values? " + str(options.absoluteValue))

    print("Reading vectors...")
    vectorFilePath = os.path.join(options.vectorPath, options.vectorFile)
    listLabels, listVectors = read_vectors(vectorFilePath,
                                           options.absoluteValue)
    print(" Reading vectors done!")
    print(" Len vectors: " + str(len(listVectors)))
    print(" Len labels: " + str(len(listLabels)))
    if not listVectors:
        sys.exit("No vectors found in file: " + vectorFilePath)

    similarityMatrix = cosine_similarity(np.array(listVectors))
    print("similarityMatrix shape: {}".format(similarityMatrix.shape))

    t0 = time()
    print("Plotting heatmap...")
    fileName = build_plot_file_name(options.vectorFile, options.outputFormat,
                                    options.absoluteValue)
    plot_heatmap(similarityMatrix, listLabels,
                 os.path.join(options.outputPath, fileName))
    print(" Plotting heatmap done in %fs" % (time() - t0))


###########################################################
# MAIN PROGRAM #
###########################################################

if __name__ == "__main__":
    main()