# get-groups.py 4.3 KB
import os
from optparse import OptionParser
import sys
from time import time
import re
import numpy as np

# Objective: Obtain groups according to the component with the highest absolute value

# Parameters:
#   1) --vectorPath Path to read vectors.
#   2) --vectorFile File to read vectors.
#   3) --outputPath Path to place output files.
#   4) --groups Number of groups

# Output:
#   1) File with groups and plots

# Execution:
# python get-groups.py --outputPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorPath /home/compu2/bionlp/lcg-faaa/agrupamiento-datos-categoricos --vectorFile vectors_file.txt --groups 2

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def getGroup(v):
    """Return the group index for vector *v*.

    The group is the position of the largest component (``np.argmax``).
    When the largest component equals 0 the vector is assigned to the
    extra group ``len(v)``, one past the last valid component index.
    """
    return len(v) if np.max(v) == 0 else np.argmax(v)

def getGroupSign(v):
    """Return ``(group, sign)`` for vector *v*.

    ``group`` is the position of the component with the largest absolute
    value, or ``len(v)`` when every component is 0.  ``sign`` is a short
    tag: ``'(+/-)'`` for the all-zero case, otherwise the first five
    characters of the winning component's string form, in parentheses.
    """
    magnitudes = [abs(component) for component in v]

    # All-zero vector: no dominant component, so use the extra group.
    if np.max(magnitudes) == 0:
        return len(magnitudes), '(+/-)'

    position = np.argmax(magnitudes)
    return position, '({})'.format(str(v[position])[:5])

if __name__ == "__main__":
    # Script entry point: read labelled vectors, assign each one to the group
    # given by getGroup() on its absolute values, and write two group files.

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--vectorPath", dest="vectorPath",
                      help="Path to read vector file", metavar="PATH")
    parser.add_option("--vectorFile", dest="vectorFile",
                      help="File to read vectors", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place clustering classified files", metavar="PATH")
    parser.add_option("--groups", type="int",
                      dest="groups", default=0,
                      help="Groups", metavar="N")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and exits by itself, so no
    # explicit sys.exit() is needed after it.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))
    # Fail early with a clear message instead of a TypeError in os.path.join.
    for required in ("vectorPath", "vectorFile", "outputPath"):
        if getattr(options, required) is None:
            parser.error("Missing required option --" + required)
    # The vector length must be positive: with 0 every real vector would be
    # rejected and an empty one cannot be grouped.
    if options.groups < 1:
        parser.error("--groups must be a positive integer")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read vector file: " + str(options.vectorPath))
    print("File to read vectors: " + str(options.vectorFile))
    print("Output path: " + str(options.outputPath))
    print("Groups:" + str(options.groups))

    listVectors = []
    listLabels = []
    listGroup = []
    vectorLen = options.groups
    t0 = time()
    # Expected line format: <label> TAB <whitespace-separated components>.
    # Lines starting with '#' are comments.
    with open(os.path.join(options.vectorPath, options.vectorFile), mode="r", encoding='utf8') as iFile:
        for line in iFile:
            if line.startswith("#"):
                continue
            line = line.strip('\r\n')
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue
            listLine = line.split('\t')
            label = listLine[0]
            if len(listLine) < 2:
                print("Line without vector column skipped: {}".format(label))
                continue
            listValues = listLine[1].split()
            if len(listValues) != vectorLen:
                print("Vector vectorLen does not match: {}".format(label))
                continue
            # Grouping is by magnitude, so the sign of each component is dropped.
            vector = [abs(float(elem)) for elem in listValues]
            listVectors.append(vector)
            listGroup.append(getGroup(vector))
            listLabels.append(label)
    print("Reading vectors done!")
    print("  Total vectors: " + str(len(listVectors)))
    print("  Total labels: " + str(len(listLabels)))

    # Derive output names from the input name. Only a trailing '.txt' is
    # removed, so a file with another extension never maps onto itself.
    baseName = options.vectorFile
    if baseName.endswith('.txt'):
        baseName = baseName[:-len('.txt')]

    # Sort once by (group, label); both output files use this order.
    pairs = sorted(zip(listGroup, listLabels))

    # One "<group> TAB <label>" line per vector.
    with open(os.path.join(options.outputPath, baseName + '.grps.txt'), mode='w', encoding='utf8') as oFile:
        oFile.writelines('{}\t{}\n'.format(g, l) for g, l in pairs)

    # One "<group> TAB <label>, <label>, ..." line per non-empty group.
    labelsByGroup = {}
    for g, l in pairs:
        # int() unifies numpy integers from argmax with the plain int len(v).
        labelsByGroup.setdefault(int(g), []).append(l)
    with open(os.path.join(options.outputPath, baseName + '.grps-rows.txt'), mode='w', encoding='utf8') as oFile:
        for g in sorted(labelsByGroup):
            oFile.write('{}\t{}\n'.format(g, ', '.join(labelsByGroup[g])))

    print("Processing done in %fs" % (time() - t0))