# get-groups.py (4.3 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os
from optparse import OptionParser
import sys
from time import time
import re
import numpy as np
# Objective: Obtain groups according to the component with the highest absolute value
# Parameters:
# 1) --vectorPath Path to read vectors.
# 2) --vectorFile File to read vectors.
# 3) --outputPath Path to place output files.
# 4) --groups Number of groups
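# Output:
# 1) Two text files with the groups, named after --vectorFile: *.grps.txt (one "group<TAB>label" line per vector) and *.grps-rows.txt (one line per group with its labels)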
# Execution:
# python get-groups.py --outputPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorFile vectors_file.txt --groups 2
###########################################################
# MAIN PROGRAM #
###########################################################
def getGroup(v):
    """Return the group index assigned to the vector ``v``.

    The group is the position of the largest component of ``v`` (the first
    such position when several components tie). When the largest component
    equals 0, ``len(v)`` is returned instead, i.e. one past the last valid
    position.
    """
    # A maximum of exactly 0 is mapped to the extra group ``len(v)``.
    if np.max(v) != 0:
        return np.argmax(v)
    return len(v)
def getGroupSign(v):
    """Return ``(group, sign)`` for the vector ``v``.

    ``group`` is the position of the component with the largest absolute
    value, and ``sign`` is that original (signed) component rendered as text,
    cut to its first five characters and wrapped in parentheses. When the
    largest absolute value is 0, the result is ``(len(v), '(+/-)')``.
    """
    magnitudes = list(map(abs, v))
    if np.max(magnitudes) != 0:
        position = np.argmax(magnitudes)
        # Show the signed value of the winning component, not its magnitude.
        return position, '(' + str(v[position])[:5] + ')'
    # No component stands out: extra group index and an undetermined sign.
    return len(magnitudes), '(+/-)'
if __name__ == "__main__":
    # Script entry point: read labelled vectors, assign each one to a group
    # with getGroup(), and write the grouping to two text files.

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--vectorPath", dest="vectorPath",
                      help="Path to read vector file", metavar="PATH")
    parser.add_option("--vectorFile", dest="vectorFile",
                      help="File to read vectors", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place clustering classified files", metavar="PATH")
    parser.add_option("--groups", type="int",
                      dest="groups", default=0,
                      help="Groups", metavar="N")
    (options, args) = parser.parse_args()

    # parser.error() prints the message and terminates the process itself, so
    # no explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))
    # Without these options os.path.join() would fail later with a TypeError.
    for required in ("vectorPath", "vectorFile", "outputPath"):
        if getattr(options, required) is None:
            parser.error("Missing required option --" + required)
    # The default of 0 would reject every non-empty vector and let an empty
    # one reach np.max(), which raises on empty input.
    if options.groups < 1:
        parser.error("--groups must be a positive integer.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read vector file: " + str(options.vectorPath))
    print("File to read vectors: " + str(options.vectorFile))
    print("Output path: " + str(options.outputPath))
    print("Groups:" + str(options.groups))

    listVectors = []
    listLabels = []
    listGroup = []
    vectorLen = options.groups

    t0 = time()

    # Expected line format: label<TAB>value value ... ("#" lines are comments).
    with open(os.path.join(options.vectorPath, options.vectorFile), mode="r", encoding='utf8') as iFile:
        for line in iFile:
            if line.startswith("#"):
                continue
            line = line.strip('\r\n')
            listLine = line.split('\t')
            if len(listLine) < 2:
                # Blank lines are skipped silently; other malformed lines are
                # reported instead of raising IndexError.
                if line.strip():
                    print("Line without tab-separated vector skipped: {}".format(line))
                continue
            label = listLine[0]
            listValues = listLine[1].split()
            if len(listValues) != vectorLen:
                print("Vector vectorLen does not match: {}".format(label))
                continue
            try:
                # Groups are decided on absolute values of the components.
                vector = [abs(float(elem)) for elem in listValues]
            except ValueError:
                print("Vector has non-numeric components: {}".format(label))
                continue
            listVectors.append(vector)
            listGroup.append(getGroup(vector))
            listLabels.append(label)

    print("Reading vectors done!")
    print(" Total vectors: " + str(len(listVectors)))
    print(" Total labels: " + str(len(listLabels)))

    # Derive output names from the input name. Only a trailing ".txt" is
    # dropped, so an input with another extension can no longer map onto
    # itself (which would overwrite the input when both paths coincide).
    baseName = options.vectorFile
    if baseName.endswith('.txt'):
        baseName = baseName[:-len('.txt')]

    # Sorted once by (group, label) and reused for both output files.
    groupedLabels = sorted(zip(listGroup, listLabels))

    # One "group<TAB>label" line per vector.
    with open(os.path.join(options.outputPath, baseName + '.grps.txt'), mode='w', encoding='utf8') as oFile:
        for group, label in groupedLabels:
            oFile.write('{}\t{}\n'.format(group, label))

    # One "group<TAB>label, label, ..." line per group that has members.
    labelsByGroup = {}
    for group, label in groupedLabels:
        labelsByGroup.setdefault(group, []).append(label)
    with open(os.path.join(options.outputPath, baseName + '.grps-rows.txt'), mode='w', encoding='utf8') as oFile:
        for group in sorted(labelsByGroup):
            oFile.write('{}\t{}\n'.format(group, ', '.join(labelsByGroup[group])))

    print("Processing done in %fs" % (time() - t0))