# extract-gcs-curadores_v2.py (16.3 KB)
#
# (Repository web-viewer residue "Raw Blame History Permalink" kept as a comment so the module parses.)

from plotly.colors import n_colors
import plotly.graph_objects as go
import numpy as np
import os
import pandas as pd

def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    df_output_table = pd.DataFrame()
    # df_output_table_bin = pd.DataFrame()
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    # print(df_input_table.head())
    hash_all_gcs_control = {}
    hash_all_gcs_num_control = {}
    hash_all_gcs_test = {}
    hash_all_gcs_num_test = {}
    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
        3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
        10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    idx = 0
    for ind in df_input_table.index:
        hash_gcs_table = {}
        hash_gcs = {}
        num_gcs = 0
        pmid = df_input_table['Reference'][ind]
        hash_gcs_table['pmid'] = str(pmid)
        # print("Control_Growth_Condition...")
        gcs_control = df_input_table['Control_Growth_Condition'][ind]
        list_gcs_control = gcs_control.split('|')
        hash_gcs_table['IDX'] = idx
        hash_gcs_table['GC_TYPE'] = 'Control'
        for gc in list_gcs_control:
            if gc != '':
                gc_class = hash_gc_classes[list_gcs_control.index(gc)]
                # print("gc_class: {}".format(gc_class))
                hash_gcs_table[gc_class] = gc
                hash_gcs[gc_class] = gc
                num_gcs += 1
        str_gcs = ", ".join(hash_gcs.keys())
        if str_gcs in hash_all_gcs_control:
            hash_all_gcs_control[str_gcs] += 1
        else:
            hash_all_gcs_control[str_gcs] = 1
            hash_all_gcs_num_control[str_gcs] = num_gcs
        df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)

        hash_gcs_table = {}
        hash_gcs = {}
        num_gcs = 0
        hash_gcs_table['pmid'] = str(pmid)
        # print("Test_Growth_Condition...")
        gcs_test = df_input_table['Test_Growth_Condition'][ind]
        list_gcs_test = gcs_test.split('|')
        hash_gcs_table['IDX'] = idx
        hash_gcs_table['GC_TYPE'] = 'Test'
        for gc in list_gcs_test:
            if gc != '':
                gc_class = hash_gc_classes[list_gcs_test.index(gc)]
                # print("gc_class: {}".format(gc_class))
                hash_gcs_table[gc_class] = gc
                hash_gcs[gc_class] = gc
                num_gcs += 1
            # hash_gcs_bin[gc_class] = int(1)
        str_gcs = ", ".join(hash_gcs.keys())
        # print(str_gcs)
        if str_gcs in hash_all_gcs_test:
            hash_all_gcs_test[str_gcs] += 1
        else:
            hash_all_gcs_test[str_gcs] = 1
            hash_all_gcs_num_test[str_gcs] = num_gcs

        df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
        idx += 1

    df_output_table = df_output_table.replace(np.nan, '')
    # print(df_output_table.head())
    df_output_table_csv = df_output_table[['IDX', 'GC_TYPE', 'ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE']].copy()
        # 'AGITATION_SPEED', 'AERATION', 'pH'
    # df.sort_values(by=['Country', 'Continent'],
    #                ascending=[False, True])
    # print(df_output_table_bin)
    df_output_table_csv.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    ### Color table
    columns = list(df_output_table_csv)
    print(columns)
    df_output_table_color = pd.DataFrame()
    for ind in range(df_output_table_csv.index[-1]+1):
        # print("ind: {}".format(ind))
        # https://www.geeksforgeeks.org/iterating-over-rows-and-columns-in-pandas-dataframe/
        if df_output_table_csv['GC_TYPE'][ind] == 'Control':
            hash_output_table_control = {}
            for i in columns:
                # print("i: {}".format(i))
                # printing the element of the column
                # print(df_output_table_csv[i][ind])
                if df_output_table_csv[i][ind] == '':
                    hash_output_table_control[i + '_COLOR'] = 'rgb(239, 243, 255)'
                else:
                    hash_output_table_control[i + '_COLOR'] = 'rgb(189, 215, 231)'
                hash_output_table_control[i] = df_output_table_csv[i][ind]
            hash_output_table_control['IDX'] = df_output_table_csv['IDX'][ind]
            hash_output_table_control['IDX_COLOR'] = 'rgb(189, 215, 231)'
            hash_output_table_control['GC_TYPE'] = df_output_table_csv['GC_TYPE'][ind]
            hash_output_table_control['GC_TYPE_COLOR'] = 'rgb(189, 215, 231)'
            df_output_table_color = df_output_table_color.append(hash_output_table_control, ignore_index=True)
        elif df_output_table_csv['GC_TYPE'][ind] == 'Test':
            hash_output_table_test = {}
            for i in columns:
                # printing the element of the column
                # print(df_output_table_csv[i][ind])
                if df_output_table_csv[i][ind] == '':
                    hash_output_table_test[i + '_COLOR'] = 'rgb(239, 243, 255)'
                elif df_output_table_csv[i][ind] != df_output_table_csv[i][ind-1]:
                    hash_output_table_test[i + '_COLOR'] = 'rgb(107, 174, 214)'
                else:
                    hash_output_table_test[i + '_COLOR'] = 'rgb(189, 215, 245)'
                hash_output_table_test[i] = df_output_table_csv[i][ind]
            hash_output_table_test['IDX'] = df_output_table_csv['IDX'][ind]
            hash_output_table_test['IDX_COLOR'] = 'rgb(189, 215, 245)'
            hash_output_table_test['GC_TYPE'] = df_output_table_csv['GC_TYPE'][ind]
            hash_output_table_test['GC_TYPE_COLOR'] = 'rgb(189, 215, 245)'
            df_output_table_color = df_output_table_color.append(hash_output_table_test, ignore_index=True)

    df_output_table_color[['IDX', 'GC_TYPE', 'ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
       'ORGANISM_COLOR', 'GENETIC_BACKGROUND_COLOR', 'MEDIUM_COLOR',
       'MEDIUM_SUPPLEMENTS_COLOR', 'TEMPERATURE_COLOR', 'OPTICAL_DENSITY_COLOR',
       'GROWTH_PHASE_COLOR'
                           ]].to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)
    # https://plotly.com/python/table/
    # Seleccionar color: https://redketchup.io/color-picker
    colors = n_colors('rgb(255, 200, 200)', 'rgb(200, 0, 0)', 3, colortype='rgb')
    fig = go.Figure(data=[go.Table(
      header=dict(
        values=['<b>IDX', '<b>GC_TYPE</b>', '<b>ORGANISM</b>', '<b>GENETIC_BACKGROUND</b>', '<b>MEDIUM</b>',
        '<b>MEDIUM_SUPPLEMENTS</b>', '<b>TEMPERATURE</b>', '<b>OPTICAL_DENSITY</b>', '<b>GROWTH_PHASE</b>'],
        line_color='black', fill_color='white',
        align='center', font=dict(color='black', size=12)
      ),
      cells=dict(
        values=[df_output_table_color.IDX,
                df_output_table_color.GC_TYPE,
                df_output_table_color.ORGANISM,
                df_output_table_color.GENETIC_BACKGROUND,
                df_output_table_color.MEDIUM,
                df_output_table_color.MEDIUM_SUPPLEMENTS,
                df_output_table_color.TEMPERATURE,
                df_output_table_color.OPTICAL_DENSITY,
                df_output_table_color.GROWTH_PHASE],
          #, df_output_table_color.AGITATION_SPEED,
          #      df_output_table_color.AERATION, df_output_table_color.pH,
          #      df_output_table_color.num_gc_classes, df_output_table_color.num_gc_phrases],
        # line_color=[df.Color],
        fill_color=[df_output_table_color.IDX_COLOR,
                    df_output_table_color.GC_TYPE_COLOR,
                df_output_table_color.ORGANISM_COLOR,
                df_output_table_color.GENETIC_BACKGROUND_COLOR,
                df_output_table_color.MEDIUM_COLOR,
                df_output_table_color.MEDIUM_SUPPLEMENTS_COLOR,
                df_output_table_color.TEMPERATURE_COLOR,
                df_output_table_color.OPTICAL_DENSITY_COLOR,
                df_output_table_color.GROWTH_PHASE_COLOR],
        align='center', font=dict(color='black', size=11)
      ))
    ])
    # fig.show() os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')
    fig.write_image(os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')), height=2500, width=1800)
    quit()


    df_all_gcs = pd.DataFrame()
    df_all_gcs_num = pd.DataFrame()
    print(hash_all_gcs_control)
    print(hash_all_gcs_test)
    # quit()
    for all_gcs, num_gcs in hash_all_gcs_control.items():
        # https://www.geeksforgeeks.org/how-to-append-a-list-as-a-row-to-a-pandas-dataframe-in-python/
        # df = df.append(pd.DataFrame(list, columns=['Name', 'Age', 'City', 'Country']), ignore_index=True)
        df_all_gcs = df_all_gcs.append(pd.DataFrame([['Control', all_gcs, num_gcs]],
            columns=['class_type', 'gc_phrase', 'num_gc_phrases']), ignore_index=True)
    for all_gcs, num_gcs in hash_all_gcs_test.items():
        df_all_gcs = df_all_gcs.append(pd.DataFrame([['Test', all_gcs, num_gcs]],
            columns=['class_type', 'gc_phrase', 'num_gc_phrases']), ignore_index=True)
    # print(df_all_gcs)
    for all_gcs, num_gcs in hash_all_gcs_num_control.items():
        df_all_gcs_num = df_all_gcs_num.append(pd.DataFrame([['Control', all_gcs, num_gcs]],
            columns=['class_type', 'gc_phrase', 'num_gc_classes']), ignore_index=True)
    for all_gcs, num_gcs in hash_all_gcs_num_test.items():
        df_all_gcs_num = df_all_gcs_num.append(pd.DataFrame([['Test', all_gcs, num_gcs]],
            columns=['class_type', 'gc_phrase', 'num_gc_classes']), ignore_index=True)
    # print(df_all_gcs_num)
    # quit()

    df_join = df_all_gcs.join(df_all_gcs_num.set_index(['class_type', 'gc_phrase']), on=['class_type', 'gc_phrase'], lsuffix='_all', rsuffix='_num')
    # print(df_join)
    # quit()

    # df_all_gcs_table = df_all_gcs.set_index('0').join(df_all_gcs_num.set_index('0'))
    # df_all_gcs_table = df_join[['0_all', '1_num', '1_all']].copy()
    # df_all_gcs_table.columns = ['Clases de condiciones (frase)', 'Número de clases', 'Numero de repeticiones de la frase']
    # df_all_gcs_table.columns = ['gc_phrase', 'num_gc_classes', 'num_gc_phrases']
    df_all_gcs_table = df_join.copy()
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    df_output_table = pd.DataFrame()
    # df_output_table_bin = pd.DataFrame()
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    # print(df_input_table.head())
    hash_all_gcs = {}
    hash_all_gcs_num = {}
    for ind in df_input_table.index:
        hash_gcs_table = {}
        hash_gcs = {}
        num_gcs = 0
        # hash_gcs_bin = {}
        pmid = df_input_table['PMID'][ind]
        coleccion = df_input_table['Colección'][ind]
        hash_gcs_table['pmid'] = str(pmid)
        # hash_gcs_bin['pmid'] = str(pmid)
        hash_gcs_table['coleccion'] = coleccion
        gcs = df_input_table['GC'][ind]
        list_gcs = gcs.split('|')
        for gc in list_gcs:
            list_gc = gc.split(':')
            gc_class = list_gc[0]
            gc_class = gc_class.strip()
            gc_class = gc_class.strip("'")
            gc_term = list_gc[1]
            gc_term = gc_term.strip()
            gc_term = gc_term.strip("'")
            hash_gcs_table[gc_class] = gc_term
            hash_gcs[gc_class] = gc_term
            num_gcs += 1
            # hash_gcs_bin[gc_class] = int(1)
        str_gcs = ", ".join(hash_gcs.keys())
        # print(str_gcs)
        if str_gcs in hash_all_gcs:
            hash_all_gcs[str_gcs] += 1
        else:
            hash_all_gcs[str_gcs] = 1
            hash_all_gcs_num[str_gcs] = num_gcs
        df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
        # df_output_table_bin = df_output_table_bin.append(hash_gcs_bin, ignore_index=True)
        # print(df_output_table)
        # quit()
    # df_output_table_bin = df_output_table_bin.replace(np.nan, 0)
    # df_output_table_bin = pd.DataFrame(np.sort(df_output_table_bin.values, axis=0), index=df_output_table_bin.index, columns=df_output_table_bin.columns)
    df_output_table = df_output_table.replace(np.nan, '')
    # df.sort_values(by=['Country', 'Continent'],
    #                ascending=[False, True])
    # print(df_output_table_bin)
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
    # df_output_table_bin.to_csv(os.path.join(path_output, file_bin), sep='\t', index=False, header=True)

    df_all_gcs = pd.DataFrame(list(hash_all_gcs.items()))
    df_all_gcs_num = pd.DataFrame(list(hash_all_gcs_num.items()))
    df_join = df_all_gcs.join(df_all_gcs_num, lsuffix='_all', rsuffix='_num')
    # df_all_gcs_table = df_all_gcs.set_index('0').join(df_all_gcs_num.set_index('0'))
    df_all_gcs_table = df_join[['0_all', '1_num', '1_all']].copy()
    # df_all_gcs_table.columns = ['Clases de condiciones (frase)', 'Número de clases', 'Numero de repeticiones de la frase']
    df_all_gcs_table.columns = ['gc_phrase', 'num_gc_classes', 'num_gc_phrases']
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    df_all_gcs_table_x = pd.DataFrame()
    for ind in df_all_gcs_table.index:
        hash_gcs_table_x = {}
        phrase = df_all_gcs_table['gc_phrase'][ind]
        list_phrase = phrase.split(',')
        for gc_class in list_phrase:
            gc_class = gc_class.strip()
            hash_gcs_table_x[gc_class] = 'X'
        hash_gcs_table_x['num_gc_classes'] = df_all_gcs_table['num_gc_classes'][ind]
        hash_gcs_table_x['num_gc_phrases'] = df_all_gcs_table['num_gc_phrases'][ind]
        df_all_gcs_table_x = df_all_gcs_table_x.append(hash_gcs_table_x, ignore_index=True)
    df_all_gcs_table_x_out = df_all_gcs_table_x[['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
        'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']].copy()
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
      header=dict(
        values=['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
        'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases'],
        line_color='white', fill_color='white',
        align='center', font=dict(color='black', size=12)
      ),
      cells=dict(
        values=[df_all_gcs_table_x_out.ORGANISM, df_all_gcs_table_x_out.GENETIC_BACKGROUND,
                df_all_gcs_table_x_out.MEDIUM, df_all_gcs_table_x_out.MEDIUM_SUPPLEMENTS,
                df_all_gcs_table_x_out.TEMPERATURE, df_all_gcs_table_x_out.OPTICAL_DENSITY,
                df_all_gcs_table_x_out.GROWTH_PHASE, df_all_gcs_table_x_out.AGITATION_SPEED,
                df_all_gcs_table_x_out.AERATION, df_all_gcs_table_x_out.pH,
                df_all_gcs_table_x_out.num_gc_classes, df_all_gcs_table_x_out.num_gc_phrases],
        # line_color=[df.Color], fill_color=[df.Color],
        align='center', font=dict(color='black', size=11)
      ))
    ])
    fig.show()


path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
# Curator: PL
file_in = "PL&VT_GC-PMID.tsv"
file_out = "PL&VT_GC-PMID-GCs.tsv"
file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in = "GC-Catalog_resumido-sgama.tsv"
file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
file_gcs_color = "GC-Catalog_resumido-sgama-GCs-color.tsv"
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_color)