# Carlos-Francisco Méndez-Cruz
#
# LSA soft clustering

import os
from optparse import OptionParser
import sys
from time import time
import re
import numpy as np
# Objective: Obtain groups according to the component with the highest absolute value
# Parameters:
# 1) --vectorPath Path to read vectors.
# 2) --vectorFile File to read vectors.
# 3) --outputPath Path to place output files.
# 4) --groups Number of groups
# Output:
# 1) File with groups and plots
# Execution:
# python plot_Vectors_LSA_structured_heatmap.py --outputPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorFile vectors_file.txt --groups 2
###########################################################
# MAIN PROGRAM #
###########################################################
def getGroup(v):
    """Return the group index for vector *v* based on its largest component.

    The group is the position of the maximum (signed) component of *v*.
    When that maximum equals zero, ``len(v)`` is returned instead, i.e. an
    index one past the last component.
    """
    return len(v) if np.max(v) == 0 else np.argmax(v)
def getGroupSign(v):
    """Return ``(group_index, sign_tag)`` for vector *v*.

    The group is the position of the component with the largest absolute
    value. The tag is that component's original (signed) value, rendered with
    ``str`` and cut to its first five characters, wrapped in parentheses.

    When every component is zero the group is ``len(v)`` (one past the last
    component) and the tag is ``'(+/-)'``.
    """
    magnitudes = [abs(component) for component in v]
    if np.max(magnitudes) == 0:
        # No dominant component: use the extra "no group" index.
        return len(magnitudes), '(+/-)'
    index_max = np.argmax(magnitudes)
    # Keep only the first five characters of the value's text form.
    tag = '(' + str(v[index_max])[:5] + ')'
    return index_max, tag
if __name__ == "__main__":
    # Command-line driver: reads one labelled vector per line, assigns each
    # vector to the group of its component with the largest absolute value,
    # and writes two reports to the output path:
    #   <name>.grps.txt       one "group<TAB>label" line per vector
    #   <name>.grps-rows.txt  one "group<TAB>label, label, ..." line per group

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--vectorPath", dest="vectorPath",
                      help="Path to read vector file", metavar="PATH")
    parser.add_option("--vectorFile", dest="vectorFile",
                      help="File to read vectors", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place clustering classified files", metavar="PATH")
    parser.add_option("--groups", type="int",
                      dest="groups", default=0,
                      help="Groups", metavar="N")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the program,
        # so no explicit sys.exit() is needed afterwards.
        parser.error("None parameters indicated.")

    # Fail early with a readable message instead of a TypeError from
    # os.path.join(None, ...) further down.
    for optName in ("vectorPath", "vectorFile", "outputPath"):
        if getattr(options, optName) is None:
            parser.error("Missing required option --" + optName)
    # With zero groups an empty vector would reach np.max([]) and crash.
    if options.groups < 1:
        parser.error("--groups must be a positive integer")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read vector file: " + str(options.vectorPath))
    print("File to read vectors: " + str(options.vectorFile))
    print("Output path: " + str(options.outputPath))
    print("Groups:" + str(options.groups))

    listVectors = []
    listLabels = []
    listGroup = []
    vectorLen = int(options.groups)

    t0 = time()
    with open(os.path.join(options.vectorPath, options.vectorFile), mode="r", encoding='utf8') as iFile:
        # Iterate the file lazily instead of loading every line at once.
        for line in iFile:
            if line.startswith("#"):
                continue
            line = line.strip('\r\n')
            listLine = line.split('\t')
            if len(listLine) < 2:
                # Blank line or line without a tab-separated vector column.
                if line.strip():
                    print("Line without vector: {}".format(line))
                continue
            label = listLine[0]
            listValues = listLine[1].split()
            if len(listValues) != vectorLen:
                print("Vector vectorLen does not match: {}".format(label))
                continue
            try:
                vectorOrig = [float(elem) for elem in listValues]
            except ValueError:
                print("Vector with non-numeric values: {}".format(label))
                continue
            listVectors.append([abs(value) for value in vectorOrig])
            # getGroupSign returns (group index, sign tag); getGroup returns
            # a bare index and cannot be unpacked like this.
            groupIndex, sign = getGroupSign(vectorOrig)
            listGroup.append(int(groupIndex))
            listLabels.append(label + sign)
    print(" Reading vectors done!")
    print(" Len vectors: " + str(len(listVectors)))
    print(" Len labels: " + str(len(listLabels)))

    # Build output names from the input name without its trailing ".txt".
    # Appending the suffix (rather than str.replace) guarantees the output
    # name differs from the input name even when the input lacks ".txt".
    if options.vectorFile.endswith('.txt'):
        baseName = options.vectorFile[:-len('.txt')]
    else:
        baseName = options.vectorFile

    sortedPairs = sorted(zip(listGroup, listLabels))

    with open(os.path.join(options.outputPath, baseName + '.grps.txt'), mode='w', encoding='utf8') as oFile:
        for g, l in sortedPairs:
            oFile.write('{}\t{}\n'.format(g, l))

    # Collect labels per group; pairs are sorted, so insertion order of the
    # dict is ascending group order and only groups that occur are written.
    labelsByGroup = {}
    for g, l in sortedPairs:
        labelsByGroup.setdefault(g, []).append(l)

    with open(os.path.join(options.outputPath, baseName + '.grps-rows.txt'), mode='w', encoding='utf8') as oFile:
        for g, groupLabels in labelsByGroup.items():
            oFile.write('{}\t{}\n'.format(g, ', '.join(groupLabels)))

    print(" Processing done in %fs" % (time() - t0))