analyze_pair.py 4.33 KB

#from pdb import set_trace as st
import os
import matplotlib.pyplot as plt
from numpy import log10
from pdb import set_trace as st

def analyze_pair(group_anlized, gropus_dir, rev, eigenvectors, fo):
    dic_part = {}
    with open(eigenvectors) as f:
        eigenvector = [c for c in f.readlines() if c.startswith("(" +
                                            str(group_anlized) + ",")][0]
    x = eigenvector.split(" ")[1]
    eigen = {"string": x,
            "bo": x.split("*")[1].replace('"', ''),
            "value": float(x.split("*")[0].replace("u'", '').strip())
            }

    filelist = os.listdir(gropus_dir)

    init_gfile = "gus_originales_" + str(rev).zfill(3) + ".cls"

    with open(gropus_dir + init_gfile, 'r') as f, open(fo, 'a+') as fO:
        groups = f.readlines()
        del groups[-1]
        tfs = [c for c in groups if c.startswith(str(group_anlized))][0]

        fO.write("%f\t%f\t%s\t%s\t%s\n" % (abs(eigen['value']), eigen['value'],
                  tfs.split('\t')[0], tfs.split('\t')[1].strip(),
                  eigenvector.strip()))
    group = {}
    ss = tfs.split(",")

    TFs = {[s for s in tf.strip().partition("(")[0].partition("\t")
            if s.replace('-', '').isalpha()][0]: [float(
                tf.partition("(")[-1].strip().strip(")"))] for tf in ss}

    partition = []
    for i in reversed(range(2, rev)):
        name = "gus_originales_" + str(i).zfill(3) + ".cls"
        with open(gropus_dir + name, 'r') as f:
            gropus = f.readlines()
            del gropus[-1]

            for c in gropus:
                ss = c.split(",")
                TFss = {[s for s in tf.strip().partition("(")[0].partition("\t")
                     if s.replace('-', '').isalpha()][0]: [float(
                             tf.partition("(")[-1].strip().strip(")"))]
                                                                for tf in ss}

                if set((t for t in list(TFs.keys()))) <= set(TFss):
                    not_in = set(TFs.keys()) ^ set(list(TFss.keys()))
                    for k in list(TFs.keys()):
                        TFs[k] += TFss[k]

                    if not_in:
                        for n in not_in:
                            TFs[n] = TFss[n]

        partition.append((i, list(TFs.keys()), len(list(TFs.keys()))))

    partition = zip(*partition)
    dic_part["part"] = partition[0]
    dic_part["tfs"] = partition[1]
    dic_part["n_tfs"] = partition[2]

    return dic_part


def get_cmap(n, name='hsv'):
    return plt.cm.get_cmap(name, n)


plotting = False
out_file = "persistence.csv"
log = False
rev = 120
ff = "report_persistence_.csv"
eigenvectors = "one-by-one/eigen_vectors/eigenBOs_120-eigens.txt"
gropus_dir = "one-by-one/groups/"
marked = "Zur ZntR"

# Candidatos que inician con 2 o 3 TFs desde la particion de 120 grupos
groups_analyzed = [97, 80, 74, 68, 63, 53, 52, 49, 47, 44, 43, 40, 39, 38, 37,
                    36, 34, 32, 31, 30, 29, 27, 26, 24, 23, 21, 20, 19, 18, 15,
                    13, 12, 9]
cmap = get_cmap(len(groups_analyzed))

# Write the header of the report
with open(ff, "w") as f:
    f.write("Singular value (abs)\tSingular value\tThe analized group\tThe corresponding TFs\tThe corresponding eigenvector\n")

plots = []

for g in groups_analyzed:
    partition = analyze_pair(group_anlized=g, gropus_dir=gropus_dir,
            rev=rev, eigenvectors=eigenvectors, fo=ff)
    plots.append(partition)

if plotting:
    fig = plt.figure()
    ax1 = fig.add_subplot(111)

    for i, p in enumerate(plots):
        if " ".join(p['tfs'][0]) == marked:
            width = 6
            mark = "D"
        else:
            width = 2
            mark = ""

        ax1.plot(p['part'], log10(p['n_tfs']) if log else p['n_tfs'],
                    c=cmap(i), linewidth=width, marker=mark,
                    label=" ".join(p['tfs'][0]))

    plt.legend(loc='upper right')
    plt.title("TF pair persistence through partitions of model resolution")
    plt.xlabel("Partition")
    plt.ylabel("Number of TFs (log_10)")
    plt.show()
else:
    import csv

    lists = []
    pn = [' '] + list(plots[0]['part'])
    lists.append(pn)
    for p in plots:
        lists.append([p['tfs'][0]] + list(p['n_tfs']))
        lists.append([p['tfs'][0]] + list(p['tfs']))

    with open(out_file, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(zip(*lists))