Extracción de GCs de literatura.

cmendezc
Commit 2088c9b2be214c46b7916fd21f609b6d71f247ce 2088c9b2 1 parent ecabc025
Showing 2 changed files with 588 additions and 0 deletions
data-sets/bin/extract-gcs-curadores_v1.py
data-sets/bin/extract-gcs-curadores_v2.py
--- a/data-sets/bin/extract-gcs-curadores_v1.py 0 → 100644
View file @2088c9b
+++ b/data-sets/bin/extract-gcs-curadores_v1.py 0 → 100644
View file @2088c9b
+import plotly.graph_objects as go
+import numpy as np
+import os
+import pandas as pd
+
+def _sg_parse_conditions(raw_conditions, position_to_class):
+    """Turn one pipe-separated growth-condition string into {class: term}.
+
+    Each field is classified by its own position in the string, so two fields
+    that happen to hold the same text are still assigned to their own classes.
+
+    Args:
+        raw_conditions: Cell value read from the input table.
+        position_to_class: Mapping from field position to class name.
+
+    Returns:
+        dict: Class name -> term for every non-empty, mapped field, in field
+        order.
+    """
+    # pandas reads an empty cell as NaN (a float); treat it as "no conditions".
+    if not isinstance(raw_conditions, str):
+        return {}
+    conditions = {}
+    for position, term in enumerate(raw_conditions.split('|')):
+        if term == '':
+            continue
+        gc_class = position_to_class.get(position)
+        if gc_class is None:
+            # NOTE(review): some positions have no class in the mapping; values
+            # found there are skipped -- confirm those fields are meant to be ignored.
+            continue
+        conditions[gc_class] = term
+    return conditions
+
+
+def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
+    """Extract control/test growth conditions from the SG curation table.
+
+    Reads a tab-separated table with the columns 'Reference',
+    'Control_Growth_Condition' and 'Test_Growth_Condition', and writes:
+
+    * file_output: two rows per input row (gc_type 'control' then 'test'),
+      one column per growth-condition class, empty string where absent.
+    * file_all_gcs: one row per distinct combination of classes ("phrase")
+      and gc_type, with the number of classes in the phrase and the number
+      of rows that used it.
+
+    Args:
+        path_input: Directory holding the input table.
+        file_input: Input file name (TSV).
+        path_output: Directory where the outputs are written.
+        file_output: File name for the per-row table.
+        file_all_gcs: File name for the phrase summary.
+        file_all_gcs_x: Unused; kept so existing calls stay valid. The
+            per-class 'X' matrix and Plotly table stage is not produced by
+            this function.
+    """
+    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+    print("Shape df_input_table: {}".format(df_input_table.shape))
+    print(df_input_table.head())
+
+    # Position of each class inside the pipe-separated condition string.
+    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
+        3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
+        10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
+    gc_types = ('control', 'test')
+
+    output_rows = []
+    # Per gc_type: phrase -> number of rows using it / number of classes in it.
+    phrase_counts = {gc_type: {} for gc_type in gc_types}
+    phrase_sizes = {gc_type: {} for gc_type in gc_types}
+
+    records = zip(df_input_table['Reference'],
+                  df_input_table['Control_Growth_Condition'],
+                  df_input_table['Test_Growth_Condition'])
+    for idx, (pmid, raw_control, raw_test) in enumerate(records):
+        for gc_type, raw_conditions in zip(gc_types, (raw_control, raw_test)):
+            conditions = _sg_parse_conditions(raw_conditions, hash_gc_classes)
+            row = {'pmid': str(pmid), 'idx': idx, 'gc_type': gc_type}
+            row.update(conditions)
+            output_rows.append(row)
+
+            phrase = ", ".join(conditions)
+            counts = phrase_counts[gc_type]
+            counts[phrase] = counts.get(phrase, 0) + 1
+            phrase_sizes[gc_type][phrase] = len(conditions)
+
+    # Build the frame once from all rows (DataFrame.append no longer exists in
+    # pandas 2.x and was quadratic anyway); absent classes become ''.
+    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
+    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+
+    # All control phrases first, then all test phrases, each in first-seen order.
+    summary_rows = []
+    for gc_type in gc_types:
+        for phrase, repetitions in phrase_counts[gc_type].items():
+            summary_rows.append({'class_type': gc_type,
+                                 'gc_phrase': phrase,
+                                 'num_gc_phrases': repetitions,
+                                 'num_gc_classes': phrase_sizes[gc_type][phrase]})
+    df_all_gcs_table = pd.DataFrame(
+        summary_rows,
+        columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
+    print(df_all_gcs_table)
+    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
+    # Processing ends here and control returns to the caller instead of
+    # terminating the interpreter.
+
+def _pl_parse_conditions(raw_conditions):
+    """Parse a 'CLASS: term | CLASS: term' string into {class: term}.
+
+    Surrounding whitespace and single quotes are removed from both the class
+    and the term. Only the first ':' separates class from term, so a term
+    may itself contain ':'.
+
+    Args:
+        raw_conditions: Cell value read from the 'GC' column.
+
+    Returns:
+        dict: Class name -> term, in the order the classes appear.
+
+    Raises:
+        ValueError: If a non-blank item has no ':' separator.
+    """
+    # pandas reads an empty cell as NaN (a float); treat it as "no conditions".
+    if not isinstance(raw_conditions, str):
+        return {}
+    conditions = {}
+    for item in raw_conditions.split('|'):
+        if not item.strip():
+            # Tolerate leading, trailing or doubled '|' separators.
+            continue
+        gc_class, separator, gc_term = item.partition(':')
+        if not separator:
+            raise ValueError(
+                "Malformed growth condition (expected CLASS:TERM): {!r}".format(item))
+        conditions[gc_class.strip().strip("'")] = gc_term.strip().strip("'")
+    return conditions
+
+
+def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
+    """Extract growth conditions from the PL curation table.
+
+    Reads a tab-separated table with the columns 'PMID', 'Colección' and
+    'GC', and produces:
+
+    * file_output: one row per input row with one column per class.
+    * file_all_gcs: one row per distinct combination of classes ("phrase"),
+      with the number of classes and the number of rows using it.
+    * file_all_gcs_x: the same phrases as a matrix with an 'X' under every
+      class present in the phrase.
+    * A Plotly table of that matrix, opened with fig.show().
+
+    Args:
+        path_input: Directory holding the input table.
+        file_input: Input file name (TSV).
+        path_output: Directory where the outputs are written.
+        file_output: File name for the per-row table.
+        file_all_gcs: File name for the phrase summary.
+        file_all_gcs_x: File name for the per-class 'X' matrix.
+
+    Raises:
+        ValueError: If an item of the 'GC' column has no ':' separator.
+    """
+    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+    print("Shape df_input_table: {}".format(df_input_table.shape))
+
+    output_rows = []
+    phrase_classes = {}  # phrase -> class names that make up the phrase
+    phrase_counts = {}   # phrase -> number of rows using the phrase
+    records = zip(df_input_table['PMID'], df_input_table['Colección'], df_input_table['GC'])
+    for pmid, coleccion, raw_conditions in records:
+        conditions = _pl_parse_conditions(raw_conditions)
+        row = {'pmid': str(pmid), 'coleccion': coleccion}
+        row.update(conditions)
+        output_rows.append(row)
+
+        phrase = ", ".join(conditions)
+        phrase_classes.setdefault(phrase, list(conditions))
+        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
+
+    # Build the frame once from all rows (DataFrame.append no longer exists in
+    # pandas 2.x and was quadratic anyway); absent classes become ''.
+    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
+    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+
+    summary_rows = [{'gc_phrase': phrase,
+                     'num_gc_classes': len(phrase_classes[phrase]),
+                     'num_gc_phrases': repetitions}
+                    for phrase, repetitions in phrase_counts.items()]
+    df_all_gcs_table = pd.DataFrame(
+        summary_rows, columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
+    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
+
+    table_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+        'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']
+    x_rows = []
+    for phrase, repetitions in phrase_counts.items():
+        x_row = dict.fromkeys(phrase_classes[phrase], 'X')
+        x_row['num_gc_classes'] = len(phrase_classes[phrase])
+        x_row['num_gc_phrases'] = repetitions
+        x_rows.append(x_row)
+    # reindex keeps the fixed column layout even when a class never occurs in
+    # the data (plain column selection would raise KeyError in that case).
+    df_all_gcs_table_x_out = pd.DataFrame(x_rows).reindex(columns=table_columns)
+    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)
+
+    # https://plotly.com/python/table/
+    fig = go.Figure(data=[go.Table(
+        header=dict(
+            values=table_columns,
+            line_color='white', fill_color='white',
+            align='center', font=dict(color='black', size=12)
+        ),
+        cells=dict(
+            values=[df_all_gcs_table_x_out[column] for column in table_columns],
+            align='center', font=dict(color='black', size=11)
+        ))
+    ])
+    fig.show()
+
+
+path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"  # NOTE(review): absolute, machine-specific path; adjust before running elsewhere
+path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"  # same directory as path_in: outputs are written next to the inputs
+# Curator: PL (the call below is commented out, so these file names are overwritten by the SG ones before any use)
+file_in = "PL&VT_GC-PMID.tsv"
+file_out = "PL&VT_GC-PMID-GCs.tsv"
+file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
+file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
+### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
+
+# Curator: SG (the only extraction that runs when this script is executed)
+file_in = "GC-Catalog_resumido-sgama.tsv"
+file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
+file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
+file_gcs_x = "GC-Catalog_resumido-sgama-GCs-All-X.tsv"
+extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
--- a/data-sets/bin/extract-gcs-curadores_v2.py 0 → 100644
View file @2088c9b
+++ b/data-sets/bin/extract-gcs-curadores_v2.py 0 → 100644
View file @2088c9b
+from plotly.colors import n_colors
+import plotly.graph_objects as go
+import numpy as np
+import os
+import pandas as pd
+
+def _sg_parse_conditions(raw_conditions, position_to_class):
+    """Turn one pipe-separated growth-condition string into {class: term}.
+
+    Each field is classified by its own position in the string, so two fields
+    that happen to hold the same text are still assigned to their own classes.
+
+    Args:
+        raw_conditions: Cell value read from the input table.
+        position_to_class: Mapping from field position to class name.
+
+    Returns:
+        dict: Class name -> term for every non-empty, mapped field, in field
+        order.
+    """
+    # pandas reads an empty cell as NaN (a float); treat it as "no conditions".
+    if not isinstance(raw_conditions, str):
+        return {}
+    conditions = {}
+    for position, term in enumerate(raw_conditions.split('|')):
+        if term == '':
+            continue
+        gc_class = position_to_class.get(position)
+        if gc_class is None:
+            # NOTE(review): some positions have no class in the mapping; values
+            # found there are skipped -- confirm those fields are meant to be ignored.
+            continue
+        conditions[gc_class] = term
+    return conditions
+
+
+def _sg_output_row(idx, gc_type, raw_conditions, position_to_class, class_columns):
+    """Build one output row with a value ('' when absent) for every class column."""
+    conditions = _sg_parse_conditions(raw_conditions, position_to_class)
+    row = {'IDX': idx, 'GC_TYPE': gc_type}
+    for gc_class in class_columns:
+        row[gc_class] = conditions.get(gc_class, '')
+    return row
+
+
+def _sg_row_colors(row, control_row=None):
+    """Return {column + '_COLOR': rgb string} for one output row.
+
+    Args:
+        row: Output row (column -> value).
+        control_row: For a Test row, the Control row of the same experiment;
+            None when `row` itself is a Control row.
+
+    Returns:
+        dict: Fill colour per column. Empty cells get the lightest shade,
+        Test cells that differ from their Control get the darkest one, and
+        the 'IDX' / 'GC_TYPE' cells always get the row's base shade.
+    """
+    is_test = control_row is not None
+    # Control and Test rows deliberately use slightly different base shades.
+    base_color = 'rgb(189, 215, 245)' if is_test else 'rgb(189, 215, 231)'
+    colors = {}
+    for column, value in row.items():
+        if column in ('IDX', 'GC_TYPE'):
+            color = base_color
+        elif value == '':
+            color = 'rgb(239, 243, 255)'
+        elif is_test and value != control_row[column]:
+            color = 'rgb(107, 174, 214)'
+        else:
+            color = base_color
+        colors[column + '_COLOR'] = color
+    return colors
+
+
+def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
+    """Extract SG control/test growth conditions and render a colour-coded table.
+
+    Reads a tab-separated table with the columns 'Control_Growth_Condition'
+    and 'Test_Growth_Condition', and writes:
+
+    * file_output: two rows per input row (GC_TYPE 'Control' then 'Test')
+      with the columns IDX, GC_TYPE and one column per reported class.
+    * file_all_gcs_color: the same rows plus one '<class>_COLOR' column per
+      class holding the cell fill colour.
+    * A PNG of the coloured table, named like file_all_gcs_color with
+      '.tsv' replaced by '.png'.
+
+    Args:
+        path_input: Directory holding the input table.
+        file_input: Input file name (TSV).
+        path_output: Directory where the outputs are written.
+        file_output: File name for the per-row table.
+        file_all_gcs: Unused; kept so existing calls stay valid. This
+            function does not write a phrase summary.
+        file_all_gcs_color: File name for the table with colour columns.
+    """
+    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+    print("Shape df_input_table: {}".format(df_input_table.shape))
+
+    # Position of each class inside the pipe-separated condition string.
+    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
+        3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
+        10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
+    # Only these classes are reported; the remaining mapped classes are left out.
+    class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE']
+    output_columns = ['IDX', 'GC_TYPE'] + class_columns
+
+    table_rows = []
+    color_rows = []
+    records = zip(df_input_table['Control_Growth_Condition'],
+                  df_input_table['Test_Growth_Condition'])
+    for idx, (raw_control, raw_test) in enumerate(records):
+        control_row = _sg_output_row(idx, 'Control', raw_control, hash_gc_classes, class_columns)
+        test_row = _sg_output_row(idx, 'Test', raw_test, hash_gc_classes, class_columns)
+        # Each Test row is coloured against the Control row of the same input row.
+        for row, reference in ((control_row, None), (test_row, control_row)):
+            table_rows.append(row)
+            color_row = dict(row)
+            color_row.update(_sg_row_colors(row, reference))
+            color_rows.append(color_row)
+
+    # Explicit column lists keep both tables well-formed even when the input
+    # has no rows or a class never occurs.
+    df_output_table_csv = pd.DataFrame(table_rows, columns=output_columns)
+    df_output_table_csv.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+
+    color_columns = [column + '_COLOR' for column in output_columns]
+    df_output_table_color = pd.DataFrame(color_rows, columns=output_columns + color_columns)
+    # The colour TSV carries the colours of the class columns only.
+    tsv_columns = output_columns + [column + '_COLOR' for column in class_columns]
+    df_output_table_color[tsv_columns].to_csv(
+        os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)
+
+    # https://plotly.com/python/table/
+    # Colour picker: https://redketchup.io/color-picker
+    fig = go.Figure(data=[go.Table(
+        header=dict(
+            values=['<b>{}</b>'.format(column) for column in output_columns],
+            line_color='black', fill_color='white',
+            align='center', font=dict(color='black', size=12)
+        ),
+        cells=dict(
+            values=[df_output_table_color[column] for column in output_columns],
+            fill_color=[df_output_table_color[column] for column in color_columns],
+            align='center', font=dict(color='black', size=11)
+        ))
+    ])
+    fig.write_image(os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')), height=2500, width=1800)
+    # Processing ends here and control returns to the caller instead of
+    # terminating the interpreter.
+
+def _pl_parse_conditions(raw_conditions):
+    """Parse a 'CLASS: term | CLASS: term' string into {class: term}.
+
+    Surrounding whitespace and single quotes are removed from both the class
+    and the term. Only the first ':' separates class from term, so a term
+    may itself contain ':'.
+
+    Args:
+        raw_conditions: Cell value read from the 'GC' column.
+
+    Returns:
+        dict: Class name -> term, in the order the classes appear.
+
+    Raises:
+        ValueError: If a non-blank item has no ':' separator.
+    """
+    # pandas reads an empty cell as NaN (a float); treat it as "no conditions".
+    if not isinstance(raw_conditions, str):
+        return {}
+    conditions = {}
+    for item in raw_conditions.split('|'):
+        if not item.strip():
+            # Tolerate leading, trailing or doubled '|' separators.
+            continue
+        gc_class, separator, gc_term = item.partition(':')
+        if not separator:
+            raise ValueError(
+                "Malformed growth condition (expected CLASS:TERM): {!r}".format(item))
+        conditions[gc_class.strip().strip("'")] = gc_term.strip().strip("'")
+    return conditions
+
+
+def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
+    """Extract growth conditions from the PL curation table.
+
+    Reads a tab-separated table with the columns 'PMID', 'Colección' and
+    'GC', and produces:
+
+    * file_output: one row per input row with one column per class.
+    * file_all_gcs: one row per distinct combination of classes ("phrase"),
+      with the number of classes and the number of rows using it.
+    * file_all_gcs_color: the same phrases as a matrix with an 'X' under
+      every class present in the phrase.
+    * A Plotly table of that matrix, opened with fig.show().
+
+    Args:
+        path_input: Directory holding the input table.
+        file_input: Input file name (TSV).
+        path_output: Directory where the outputs are written.
+        file_output: File name for the per-row table.
+        file_all_gcs: File name for the phrase summary.
+        file_all_gcs_color: File name for the per-class 'X' matrix.
+
+    Raises:
+        ValueError: If an item of the 'GC' column has no ':' separator.
+    """
+    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+    print("Shape df_input_table: {}".format(df_input_table.shape))
+
+    output_rows = []
+    phrase_classes = {}  # phrase -> class names that make up the phrase
+    phrase_counts = {}   # phrase -> number of rows using the phrase
+    records = zip(df_input_table['PMID'], df_input_table['Colección'], df_input_table['GC'])
+    for pmid, coleccion, raw_conditions in records:
+        conditions = _pl_parse_conditions(raw_conditions)
+        row = {'pmid': str(pmid), 'coleccion': coleccion}
+        row.update(conditions)
+        output_rows.append(row)
+
+        phrase = ", ".join(conditions)
+        phrase_classes.setdefault(phrase, list(conditions))
+        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
+
+    # Build the frame once from all rows (DataFrame.append no longer exists in
+    # pandas 2.x and was quadratic anyway); absent classes become ''.
+    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
+    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+
+    summary_rows = [{'gc_phrase': phrase,
+                     'num_gc_classes': len(phrase_classes[phrase]),
+                     'num_gc_phrases': repetitions}
+                    for phrase, repetitions in phrase_counts.items()]
+    df_all_gcs_table = pd.DataFrame(
+        summary_rows, columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
+    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
+
+    table_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+        'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']
+    x_rows = []
+    for phrase, repetitions in phrase_counts.items():
+        x_row = dict.fromkeys(phrase_classes[phrase], 'X')
+        x_row['num_gc_classes'] = len(phrase_classes[phrase])
+        x_row['num_gc_phrases'] = repetitions
+        x_rows.append(x_row)
+    # reindex keeps the fixed column layout even when a class never occurs in
+    # the data (plain column selection would raise KeyError in that case).
+    df_all_gcs_table_x_out = pd.DataFrame(x_rows).reindex(columns=table_columns)
+    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)
+
+    # https://plotly.com/python/table/
+    fig = go.Figure(data=[go.Table(
+        header=dict(
+            values=table_columns,
+            line_color='white', fill_color='white',
+            align='center', font=dict(color='black', size=12)
+        ),
+        cells=dict(
+            values=[df_all_gcs_table_x_out[column] for column in table_columns],
+            align='center', font=dict(color='black', size=11)
+        ))
+    ])
+    fig.show()
+
+
+path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"  # NOTE(review): absolute, machine-specific path; adjust before running elsewhere
+path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"  # same directory as path_in: outputs are written next to the inputs
+# Curator: PL (the call below is commented out, so file_in/file_out/file_gcs are overwritten by the SG values before any use)
+file_in = "PL&VT_GC-PMID.tsv"
+file_out = "PL&VT_GC-PMID-GCs.tsv"
+file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
+file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"  # only referenced by the commented-out PL call
+### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
+
+# Curator: SG (the only extraction that runs when this script is executed)
+file_in = "GC-Catalog_resumido-sgama.tsv"
+file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
+file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
+file_gcs_color = "GC-Catalog_resumido-sgama-GCs-color.tsv"
+extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_color)