Carlos-Francisco Méndez-Cruz

LSA soft clustering

1 +import os
2 +from optparse import OptionParser
3 +import sys
4 +from time import time
5 +import re
6 +import numpy as np
7 +
8 +# Objective: Assign each vector to a group according to its component with the highest absolute value
9 +
10 +# Parameters:
11 +# 1) --vectorPath Path to read vectors.
12 +# 2) --vectorFile File to read vectors.
13 +# 3) --outputPath Path to place output files.
14 +# 4) --groups Number of groups
15 +
16 +# Output:
17 +# 1) File with groups and plots
18 +
19 +# Execution:
20 +# python plot_Vectors_LSA_structured_heatmap.py --outputPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorFile vectors_file.txt --groups 2
21 +
22 +###########################################################
23 +# MAIN PROGRAM #
24 +###########################################################
25 +
def getGroup(v):
    """Return the index of the largest component of ``v``.

    The raw (signed) values are compared, so this is presumably meant for
    vectors of non-negative magnitudes -- confirm against callers.

    Args:
        v: Sequence of numeric components.

    Returns:
        int: Position of the maximum component. When ``v`` is empty or its
        maximum is exactly 0, ``len(v)`` is returned instead; that value is
        one past the last valid index and acts as an extra bucket for
        vectors without a dominant component.
    """
    # np.max() raises ValueError on zero-size input, so test emptiness first.
    if len(v) == 0 or np.max(v) == 0:
        return len(v)
    # Cast so both branches return a plain Python int.
    return int(np.argmax(v))
32 +
def getGroupSign(v):
    """Return the index of the component with the largest absolute value.

    Args:
        v: Indexable sequence of numeric components (signed).

    Returns:
        tuple: ``(index, sign)``. ``index`` is an int giving the position of
        the component with the largest absolute value. ``sign`` is that
        component's original signed value, cut to the first five characters
        of its ``str()`` form and wrapped in parentheses, e.g. ``'(-0.98)'``.
        When ``v`` is empty or all components are 0, the result is
        ``(len(v), '(+/-)')``: an extra bucket one past the last valid index.
    """
    magnitudes = np.abs(np.asarray(v))
    # np.max() raises ValueError on zero-size input, so test emptiness first.
    if magnitudes.size == 0 or magnitudes.max() == 0:
        return len(v), '(+/-)'
    # Cast so both branches return a plain Python int.
    index_max = int(magnitudes.argmax())
    # NOTE(review): slicing str(value) to 5 characters drops the exponent of
    # values printed in scientific notation (1.2345e-07 -> '1.234'). Kept
    # as-is so existing label text does not change.
    sign = '(' + str(v[index_max])[:5] + ')'
    return index_max, sign
43 +
if __name__ == "__main__":
    # Script entry point: read labelled vectors from a tab-separated file
    # (label <TAB> space-separated components), assign each vector to the
    # group of its largest-magnitude component, and write two reports into
    # the output path:
    #   <name>.grps.txt       one "group <TAB> label(sign)" line per vector
    #   <name>.grps-rows.txt  one "group <TAB> label, label, ..." line per group

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--vectorPath", dest="vectorPath",
                      help="Path to read vector file", metavar="PATH")
    parser.add_option("--vectorFile", dest="vectorFile",
                      help="File to read vectors", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place clustering classified files", metavar="PATH")
    parser.add_option("--groups", type="int",
                      dest="groups", default=0,
                      help="Groups", metavar="N")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # Positional arguments are not accepted. parser.error() prints the
        # message and terminates the process itself (exit status 2).
        parser.error("None parameters indicated.")
    # Fail with a usage message instead of a TypeError from os.path.join(None, ...).
    for optName in ("vectorPath", "vectorFile", "outputPath"):
        if getattr(options, optName) is None:
            parser.error("Missing required parameter: --" + optName)
    if options.groups < 1:
        parser.error("--groups must be a positive integer.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read vector file: " + str(options.vectorPath))
    print("File to read vectors: " + str(options.vectorFile))
    print("Output path: " + str(options.outputPath))
    print("Groups:" + str(options.groups))

    listVectors = []
    listLabels = []
    listGroup = []
    vectorLen = int(options.groups)
    t0 = time()
    with open(os.path.join(options.vectorPath, options.vectorFile), mode="r", encoding='utf8') as iFile:
        for line in iFile:
            if line.startswith("#"):
                continue
            line = line.strip('\r\n')
            if not line:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            listLine = line.split('\t')
            if len(listLine) < 2:
                print("Line without tab-separated vector skipped: {}".format(line))
                continue
            label = listLine[0]
            listValues = listLine[1].split()
            if len(listValues) != vectorLen:
                print("Vector vectorLen does not match: {}".format(label))
                continue
            try:
                vectorOrig = [float(elem) for elem in listValues]
            except ValueError:
                print("Vector with non-numeric value skipped: {}".format(label))
                continue
            listVectors.append([abs(value) for value in vectorOrig])
            # getGroupSign returns the (index, sign) pair indexed below;
            # getGroup returns a bare index and cannot be subscripted.
            group = getGroupSign(vectorOrig)
            listGroup.append(group[0])
            listLabels.append(label + group[1])
    print(" Reading vectors done!")
    print(" Len vectors: " + str(len(listVectors)))
    print(" Len labels: " + str(len(listLabels)))

    # Derive output names by swapping a trailing '.txt' for the report suffix.
    # Only the final extension is touched, and a suffix is always appended, so
    # the reports can never overwrite the input file.
    baseName = options.vectorFile
    if baseName.endswith('.txt'):
        baseName = baseName[:-len('.txt')]

    # Sort once; both reports list vectors by group, then by label.
    sortedPairs = sorted(zip(listGroup, listLabels))

    with open(os.path.join(options.outputPath, baseName + '.grps.txt'), mode='w', encoding='utf8') as oFile:
        for g, l in sortedPairs:
            oFile.write('{}\t{}\n'.format(g, l))

    # Collect labels per group; dict insertion order keeps the sorted order.
    # Only groups that actually have members get a row.
    labelsByGroup = {}
    for g, l in sortedPairs:
        labelsByGroup.setdefault(g, []).append(l)

    with open(os.path.join(options.outputPath, baseName + '.grps-rows.txt'), mode='w', encoding='utf8') as oFile:
        for g, groupLabels in labelsByGroup.items():
            oFile.write('{}\t{}\n'.format(g, ', '.join(groupLabels)))

    print(" Processing done in %fs" % (time() - t0))