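# extract-gcs-curadores_v1.py (13.1 KB)
#
# NOTE: the repository web viewer's navigation text ("Raw Blame History Permalink") was captured along with this file; it is not part of the script.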

import plotly.graph_objects as go
import numpy as np
import os
import pandas as pd

# Column order shared by the SG class-presence ("X") table and its plot.
_SG_X_TABLE_COLUMNS = [
    'ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
    'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
    'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']


def _sg_parse_positional_gcs(gc_phrase, position_classes):
    """Map a pipe-delimited positional phrase to an ordered {class: term} dict.

    Args:
        gc_phrase: Cell value such as ``"E. coli|wt|LB|||37"``; the class of
            each term is given by its position between the ``|`` separators.
            A missing cell (NaN) is treated as a phrase with no terms.
        position_classes: Dict mapping a zero-based position to a class name.

    Returns:
        Dict of class name -> term for every non-empty position, in
        positional order.

    Raises:
        KeyError: If a non-empty term sits at a position that has no entry in
            ``position_classes``.
    """
    if pd.isna(gc_phrase):
        # read_csv yields NaN for an empty cell; there is nothing to split.
        return {}
    gcs_by_class = {}
    # enumerate() gives the real position of each term. Looking the term up
    # with list.index() would return the FIRST occurrence, so a value repeated
    # in two positions would be filed under the wrong class.
    for position, gc in enumerate(gc_phrase.split('|')):
        if gc == '':
            continue
        if position not in position_classes:
            raise KeyError(
                "No growth-condition class defined for position {} "
                "(term {!r}) in phrase {!r}".format(position, gc, gc_phrase))
        gcs_by_class[position_classes[position]] = gc
    return gcs_by_class


def _sg_build_x_table(df_all_gcs_table):
    """Build the class-presence table: one row per phrase, 'X' per class."""
    rows = []
    for phrase, num_classes, num_phrases in zip(
            df_all_gcs_table['gc_phrase'],
            df_all_gcs_table['num_gc_classes'],
            df_all_gcs_table['num_gc_phrases']):
        row = {gc_class.strip(): 'X'
               for gc_class in phrase.split(',') if gc_class.strip()}
        row['num_gc_classes'] = num_classes
        row['num_gc_phrases'] = num_phrases
        rows.append(row)
    # reindex() (rather than df[[...]]) keeps the fixed column layout even
    # when some class never occurs in the data; such columns are left empty.
    return pd.DataFrame(rows).reindex(columns=_SG_X_TABLE_COLUMNS)


def _sg_show_x_table(df_x_table):
    """Render the class-presence table as a plotly table figure."""
    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=list(df_x_table.columns),
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_x_table[column] for column in df_x_table.columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()


def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output,
                             file_all_gcs, file_all_gcs_x,
                             build_x_table=False, show_table=True):
    """Extract control/test growth conditions from the SG curator TSV file.

    Reads a tab-separated file with the columns ``Reference``,
    ``Control_Growth_Condition`` and ``Test_Growth_Condition``. Each condition
    cell is a ``|``-delimited positional phrase. Two files are always written
    to ``path_output``:

    * ``file_output``: two rows per input row (``gc_type`` 'control' then
      'test') with ``pmid``, ``idx`` (input row counter) and one column per
      growth-condition class found.
    * ``file_all_gcs``: one row per distinct class combination ("phrase") and
      ``class_type``, with how many times it occurs (``num_gc_phrases``) and
      how many classes it contains (``num_gc_classes``).

    Args:
        path_input: Directory containing ``file_input``.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-row output TSV file.
        file_all_gcs: Name of the phrase-summary TSV file.
        file_all_gcs_x: Name of the class-presence ("X") TSV file; only
            written when ``build_x_table`` is true.
        build_x_table: When true, also write ``file_all_gcs_x``. Defaults to
            False: this function previously stopped (via ``quit()``) right
            after writing ``file_all_gcs``, so the default keeps the same
            outputs but returns normally instead of ending the interpreter.
        show_table: When ``build_x_table`` is true, also display the table
            with plotly. Ignored otherwise.

    Raises:
        KeyError: If a required column is missing, or a phrase has a term at
            a position with no class mapping.
    """
    # Position of each class inside the '|'-delimited phrase. Positions 4, 6
    # and 7 have no class assigned here.
    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
        3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
        10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    print(df_input_table.head())

    gc_sources = (('control', 'Control_Growth_Condition'),
                  ('test', 'Test_Growth_Condition'))
    # Per gc_type: phrase -> number of occurrences / number of classes.
    hash_all_gcs = {gc_type: {} for gc_type, _ in gc_sources}
    hash_all_gcs_num = {gc_type: {} for gc_type, _ in gc_sources}

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    output_rows = []
    for idx, ind in enumerate(df_input_table.index):
        pmid = str(df_input_table.at[ind, 'Reference'])
        for gc_type, column in gc_sources:
            gcs_by_class = _sg_parse_positional_gcs(
                df_input_table.at[ind, column], hash_gc_classes)
            row = {'pmid': pmid, 'idx': idx, 'gc_type': gc_type}
            row.update(gcs_by_class)
            output_rows.append(row)

            str_gcs = ", ".join(gcs_by_class)
            counts = hash_all_gcs[gc_type]
            counts[str_gcs] = counts.get(str_gcs, 0) + 1
            hash_all_gcs_num[gc_type][str_gcs] = len(gcs_by_class)

    # Classes absent from a row are written as empty cells.
    df_output_table = pd.DataFrame(output_rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    print(hash_all_gcs['control'])
    print(hash_all_gcs['test'])

    # Control phrases first, then test phrases, each in first-seen order.
    summary_rows = [
        [gc_type, phrase, num_phrases, hash_all_gcs_num[gc_type][phrase]]
        for gc_type, _ in gc_sources
        for phrase, num_phrases in hash_all_gcs[gc_type].items()]
    df_all_gcs_table = pd.DataFrame(
        summary_rows,
        columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
    print(df_all_gcs_table)
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    if not build_x_table:
        return

    # NOTE(review): the class-presence table does not carry 'class_type', so
    # control and test phrases are not distinguished in it.
    df_all_gcs_table_x_out = _sg_build_x_table(df_all_gcs_table)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)
    if show_table:
        _sg_show_x_table(df_all_gcs_table_x_out)

# Column order shared by the PL class-presence ("X") table and its plot.
_PL_X_TABLE_COLUMNS = [
    'ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
    'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
    'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']


def _pl_parse_labelled_gcs(gc_phrase):
    """Map a ``class: term | class: term`` phrase to an ordered {class: term}.

    Surrounding whitespace and single quotes are stripped from both the class
    and the term. A missing cell (NaN) and blank segments yield no entries.

    Raises:
        IndexError: If a non-blank segment contains no ':' separator.
    """
    if pd.isna(gc_phrase):
        # read_csv yields NaN for an empty cell; there is nothing to split.
        return {}
    gcs_by_class = {}
    for gc in gc_phrase.split('|'):
        if not gc.strip():
            # Tolerate a leading/trailing or doubled '|'.
            continue
        # Split on the FIRST ':' only, so a term that itself contains ':'
        # is kept whole instead of being truncated.
        list_gc = gc.split(':', 1)
        gc_class = list_gc[0].strip().strip("'")
        gc_term = list_gc[1].strip().strip("'")
        gcs_by_class[gc_class] = gc_term
    return gcs_by_class


def _pl_build_x_table(df_all_gcs_table):
    """Build the class-presence table: one row per phrase, 'X' per class."""
    rows = []
    for phrase, num_classes, num_phrases in zip(
            df_all_gcs_table['gc_phrase'],
            df_all_gcs_table['num_gc_classes'],
            df_all_gcs_table['num_gc_phrases']):
        row = {gc_class.strip(): 'X'
               for gc_class in phrase.split(',') if gc_class.strip()}
        row['num_gc_classes'] = num_classes
        row['num_gc_phrases'] = num_phrases
        rows.append(row)
    # reindex() (rather than df[[...]]) keeps the fixed column layout even
    # when some class never occurs in the data; such columns are left empty.
    # Classes outside the fixed list are dropped, as before.
    return pd.DataFrame(rows).reindex(columns=_PL_X_TABLE_COLUMNS)


def _pl_show_x_table(df_x_table):
    """Render the class-presence table as a plotly table figure."""
    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=list(df_x_table.columns),
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_x_table[column] for column in df_x_table.columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()


def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output,
                             file_all_gcs, file_all_gcs_x, show_table=True):
    """Extract labelled growth conditions from the PL curator TSV file.

    Reads a tab-separated file with the columns ``PMID``, ``Colección`` and
    ``GC``. Each ``GC`` cell holds ``class: term`` pairs separated by ``|``.
    Three files are written to ``path_output``:

    * ``file_output``: one row per input row with ``pmid``, ``coleccion`` and
      one column per growth-condition class found.
    * ``file_all_gcs``: one row per distinct class combination ("phrase") with
      the number of distinct classes in it (``num_gc_classes``) and how many
      input rows use it (``num_gc_phrases``).
    * ``file_all_gcs_x``: the same phrases as a fixed-layout table with an
      'X' under every class the phrase contains.

    Args:
        path_input: Directory containing ``file_input``.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-row output TSV file.
        file_all_gcs: Name of the phrase-summary TSV file.
        file_all_gcs_x: Name of the class-presence ("X") TSV file.
        show_table: When true (the default, matching previous behaviour),
            display the class-presence table with plotly. Pass False for
            headless/batch runs.

    Raises:
        KeyError: If a required input column is missing.
        IndexError: If a ``GC`` segment has no ':' separator.
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    hash_all_gcs = {}      # phrase -> number of input rows using it
    hash_all_gcs_num = {}  # phrase -> number of distinct classes in it

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call.
    output_rows = []
    for ind in df_input_table.index:
        gcs_by_class = _pl_parse_labelled_gcs(df_input_table.at[ind, 'GC'])
        row = {'pmid': str(df_input_table.at[ind, 'PMID']),
               'coleccion': df_input_table.at[ind, 'Colección']}
        row.update(gcs_by_class)
        output_rows.append(row)

        str_gcs = ", ".join(gcs_by_class)
        hash_all_gcs[str_gcs] = hash_all_gcs.get(str_gcs, 0) + 1
        # Count distinct classes so the number always agrees with the phrase,
        # even if a class is repeated within one cell.
        hash_all_gcs_num[str_gcs] = len(gcs_by_class)

    # Classes absent from a row are written as empty cells.
    df_output_table = pd.DataFrame(output_rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    # Built directly with named columns so an empty input still produces a
    # header-only file instead of failing on a column lookup.
    df_all_gcs_table = pd.DataFrame(
        [[phrase, hash_all_gcs_num[phrase], num_phrases]
         for phrase, num_phrases in hash_all_gcs.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    df_all_gcs_table_x_out = _pl_build_x_table(df_all_gcs_table)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)

    if show_table:
        _pl_show_x_table(df_all_gcs_table_x_out)


# Script driver: module-level configuration followed by a single extraction run.
# Input and output directories are the same hard-coded absolute path on the
# original author's machine; edit both before running elsewhere.
path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
# Curator: PL
# File names for the PL data set. The PL run below is disabled (commented
# out), and these four names are reassigned by the SG section that follows.
file_in = "PL&VT_GC-PMID.tsv"
file_out = "PL&VT_GC-PMID-GCs.tsv"
file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
# File names for the SG data set; this is the run that actually executes.
file_in = "GC-Catalog_resumido-sgama.tsv"
file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
file_gcs_x = "GC-Catalog_resumido-sgama-GCs-All-X.tsv"
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)