Extracción de GCs de literatura.

cmendezc
Commit 2088c9b2be214c46b7916fd21f609b6d71f247ce 2088c9b2 1 parent ecabc025
Showing 2 changed files with 588 additions and 0 deletions
data-sets/bin/extract-gcs-curadores_v1.py
data-sets/bin/extract-gcs-curadores_v2.py
--- a/data-sets/bin/extract-gcs-curadores_v1.py 0 → 100644
View file @2088c9b
+++ b/data-sets/bin/extract-gcs-curadores_v1.py 0 → 100644
View file @2088c9b
+ import plotly.graph_objects as go
+ import numpy as np
+ import os
+ import pandas as pd
+ 
+ def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
+     df_output_table = pd.DataFrame()
+     # df_output_table_bin = pd.DataFrame()
+     df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+     print("Shape df_input_table: {}".format(df_input_table.shape))
+     print(df_input_table.head())
+     hash_all_gcs_control = {}
+     hash_all_gcs_num_control = {}
+     hash_all_gcs_test = {}
+     hash_all_gcs_num_test = {}
+     hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
+         3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
+         10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
+     idx = 0
+     for ind in df_input_table.index:
+         hash_gcs_table = {}
+         hash_gcs = {}
+         num_gcs = 0
+         pmid = df_input_table['Reference'][ind]
+         hash_gcs_table['pmid'] = str(pmid)
+         # print("Control_Growth_Condition...")
+         gcs_control = df_input_table['Control_Growth_Condition'][ind]
+         list_gcs_control = gcs_control.split('|')
+         hash_gcs_table['idx'] = idx
+         hash_gcs_table['gc_type'] = 'control'
+         for gc in list_gcs_control:
+             if gc != '':
+                 gc_class = hash_gc_classes[list_gcs_control.index(gc)]
+                 # print("gc_class: {}".format(gc_class))
+                 hash_gcs_table[gc_class] = gc
+                 hash_gcs[gc_class] = gc
+                 num_gcs += 1
+         str_gcs = ", ".join(hash_gcs.keys())
+         if str_gcs in hash_all_gcs_control:
+             hash_all_gcs_control[str_gcs] += 1
+         else:
+             hash_all_gcs_control[str_gcs] = 1
+             hash_all_gcs_num_control[str_gcs] = num_gcs
+         df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
+ 
+         hash_gcs_table = {}
+         hash_gcs = {}
+         num_gcs = 0
+         hash_gcs_table['pmid'] = str(pmid)
+         # print("Test_Growth_Condition...")
+         gcs_test = df_input_table['Test_Growth_Condition'][ind]
+         list_gcs_test = gcs_test.split('|')
+         hash_gcs_table['idx'] = idx
+         hash_gcs_table['gc_type'] = 'test'
+         for gc in list_gcs_test:
+             if gc != '':
+                 gc_class = hash_gc_classes[list_gcs_test.index(gc)]
+                 # print("gc_class: {}".format(gc_class))
+                 hash_gcs_table[gc_class] = gc
+                 hash_gcs[gc_class] = gc
+                 num_gcs += 1
+             # hash_gcs_bin[gc_class] = int(1)
+         str_gcs = ", ".join(hash_gcs.keys())
+         # print(str_gcs)
+         if str_gcs in hash_all_gcs_test:
+             hash_all_gcs_test[str_gcs] += 1
+         else:
+             hash_all_gcs_test[str_gcs] = 1
+             hash_all_gcs_num_test[str_gcs] = num_gcs
+ 
+         df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
+         idx += 1
+ 
+     df_output_table = df_output_table.replace(np.nan, '')
+     # df.sort_values(by=['Country', 'Continent'],
+     #                ascending=[False, True])
+     # print(df_output_table_bin)
+     df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+ 
+     df_all_gcs = pd.DataFrame()
+     df_all_gcs_num = pd.DataFrame()
+     print(hash_all_gcs_control)
+     print(hash_all_gcs_test)
+     # quit()
+     for all_gcs, num_gcs in hash_all_gcs_control.items():
+         # https://www.geeksforgeeks.org/how-to-append-a-list-as-a-row-to-a-pandas-dataframe-in-python/
+         # df = df.append(pd.DataFrame(list, columns=['Name', 'Age', 'City', 'Country']), ignore_index=True)
+         df_all_gcs = df_all_gcs.append(pd.DataFrame([['control', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_phrases']), ignore_index=True)
+     for all_gcs, num_gcs in hash_all_gcs_test.items():
+         df_all_gcs = df_all_gcs.append(pd.DataFrame([['test', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_phrases']), ignore_index=True)
+     print(df_all_gcs)
+     for all_gcs, num_gcs in hash_all_gcs_num_control.items():
+         df_all_gcs_num = df_all_gcs_num.append(pd.DataFrame([['control', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_classes']), ignore_index=True)
+     for all_gcs, num_gcs in hash_all_gcs_num_test.items():
+         df_all_gcs_num = df_all_gcs_num.append(pd.DataFrame([['test', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_classes']), ignore_index=True)
+     print(df_all_gcs_num)
+     # quit()
+ 
+     df_join = df_all_gcs.join(df_all_gcs_num.set_index(['class_type', 'gc_phrase']), on=['class_type', 'gc_phrase'], lsuffix='_all', rsuffix='_num')
+     print(df_join)
+     # quit()
+ 
+     # df_all_gcs_table = df_all_gcs.set_index('0').join(df_all_gcs_num.set_index('0'))
+     # df_all_gcs_table = df_join[['0_all', '1_num', '1_all']].copy()
+     # df_all_gcs_table.columns = ['Clases de condiciones (frase)', 'Número de clases', 'Numero de repeticiones de la frase']
+     # df_all_gcs_table.columns = ['gc_phrase', 'num_gc_classes', 'num_gc_phrases']
+     df_all_gcs_table = df_join.copy()
+     df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
+     quit()
+ 
+     df_all_gcs_table_x = pd.DataFrame()
+     for ind in df_all_gcs_table.index:
+         hash_gcs_table_x = {}
+         phrase = df_all_gcs_table['gc_phrase'][ind]
+         list_phrase = phrase.split(',')
+         for gc_class in list_phrase:
+             gc_class = gc_class.strip()
+             hash_gcs_table_x[gc_class] = 'X'
+         hash_gcs_table_x['num_gc_classes'] = df_all_gcs_table['num_gc_classes'][ind]
+         hash_gcs_table_x['num_gc_phrases'] = df_all_gcs_table['num_gc_phrases'][ind]
+         df_all_gcs_table_x = df_all_gcs_table_x.append(hash_gcs_table_x, ignore_index=True)
+     df_all_gcs_table_x_out = df_all_gcs_table_x[['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+         'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']].copy()
+     df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)
+ 
+     # https://plotly.com/python/table/
+     fig = go.Figure(data=[go.Table(
+       header=dict(
+         values=['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+         'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases'],
+         line_color='white', fill_color='white',
+         align='center', font=dict(color='black', size=12)
+       ),
+       cells=dict(
+         values=[df_all_gcs_table_x_out.ORGANISM, df_all_gcs_table_x_out.GENETIC_BACKGROUND,
+                 df_all_gcs_table_x_out.MEDIUM, df_all_gcs_table_x_out.MEDIUM_SUPPLEMENTS,
+                 df_all_gcs_table_x_out.TEMPERATURE, df_all_gcs_table_x_out.OPTICAL_DENSITY,
+                 df_all_gcs_table_x_out.GROWTH_PHASE, df_all_gcs_table_x_out.AGITATION_SPEED,
+                 df_all_gcs_table_x_out.AERATION, df_all_gcs_table_x_out.pH,
+                 df_all_gcs_table_x_out.num_gc_classes, df_all_gcs_table_x_out.num_gc_phrases],
+         # line_color=[df.Color], fill_color=[df.Color],
+         align='center', font=dict(color='black', size=11)
+       ))
+     ])
+     fig.show()
+ 
+ def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
+     df_output_table = pd.DataFrame()
+     # df_output_table_bin = pd.DataFrame()
+     df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+     print("Shape df_input_table: {}".format(df_input_table.shape))
+     # print(df_input_table.head())
+     hash_all_gcs = {}
+     hash_all_gcs_num = {}
+     for ind in df_input_table.index:
+         hash_gcs_table = {}
+         hash_gcs = {}
+         num_gcs = 0
+         # hash_gcs_bin = {}
+         pmid = df_input_table['PMID'][ind]
+         coleccion = df_input_table['Colección'][ind]
+         hash_gcs_table['pmid'] = str(pmid)
+         # hash_gcs_bin['pmid'] = str(pmid)
+         hash_gcs_table['coleccion'] = coleccion
+         gcs = df_input_table['GC'][ind]
+         list_gcs = gcs.split('|')
+         for gc in list_gcs:
+             list_gc = gc.split(':')
+             gc_class = list_gc[0]
+             gc_class = gc_class.strip()
+             gc_class = gc_class.strip("'")
+             gc_term = list_gc[1]
+             gc_term = gc_term.strip()
+             gc_term = gc_term.strip("'")
+             hash_gcs_table[gc_class] = gc_term
+             hash_gcs[gc_class] = gc_term
+             num_gcs += 1
+             # hash_gcs_bin[gc_class] = int(1)
+         str_gcs = ", ".join(hash_gcs.keys())
+         # print(str_gcs)
+         if str_gcs in hash_all_gcs:
+             hash_all_gcs[str_gcs] += 1
+         else:
+             hash_all_gcs[str_gcs] = 1
+             hash_all_gcs_num[str_gcs] = num_gcs
+         df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
+         # df_output_table_bin = df_output_table_bin.append(hash_gcs_bin, ignore_index=True)
+         # print(df_output_table)
+         # quit()
+     # df_output_table_bin = df_output_table_bin.replace(np.nan, 0)
+     # df_output_table_bin = pd.DataFrame(np.sort(df_output_table_bin.values, axis=0), index=df_output_table_bin.index, columns=df_output_table_bin.columns)
+     df_output_table = df_output_table.replace(np.nan, '')
+     # df.sort_values(by=['Country', 'Continent'],
+     #                ascending=[False, True])
+     # print(df_output_table_bin)
+     df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+     # df_output_table_bin.to_csv(os.path.join(path_output, file_bin), sep='\t', index=False, header=True)
+ 
+     df_all_gcs = pd.DataFrame(list(hash_all_gcs.items()))
+     df_all_gcs_num = pd.DataFrame(list(hash_all_gcs_num.items()))
+     df_join = df_all_gcs.join(df_all_gcs_num, lsuffix='_all', rsuffix='_num')
+     # df_all_gcs_table = df_all_gcs.set_index('0').join(df_all_gcs_num.set_index('0'))
+     df_all_gcs_table = df_join[['0_all', '1_num', '1_all']].copy()
+     # df_all_gcs_table.columns = ['Clases de condiciones (frase)', 'Número de clases', 'Numero de repeticiones de la frase']
+     df_all_gcs_table.columns = ['gc_phrase', 'num_gc_classes', 'num_gc_phrases']
+     df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
+ 
+     df_all_gcs_table_x = pd.DataFrame()
+     for ind in df_all_gcs_table.index:
+         hash_gcs_table_x = {}
+         phrase = df_all_gcs_table['gc_phrase'][ind]
+         list_phrase = phrase.split(',')
+         for gc_class in list_phrase:
+             gc_class = gc_class.strip()
+             hash_gcs_table_x[gc_class] = 'X'
+         hash_gcs_table_x['num_gc_classes'] = df_all_gcs_table['num_gc_classes'][ind]
+         hash_gcs_table_x['num_gc_phrases'] = df_all_gcs_table['num_gc_phrases'][ind]
+         df_all_gcs_table_x = df_all_gcs_table_x.append(hash_gcs_table_x, ignore_index=True)
+     df_all_gcs_table_x_out = df_all_gcs_table_x[['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+         'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']].copy()
+     df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)
+ 
+     # https://plotly.com/python/table/
+     fig = go.Figure(data=[go.Table(
+       header=dict(
+         values=['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+         'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases'],
+         line_color='white', fill_color='white',
+         align='center', font=dict(color='black', size=12)
+       ),
+       cells=dict(
+         values=[df_all_gcs_table_x_out.ORGANISM, df_all_gcs_table_x_out.GENETIC_BACKGROUND,
+                 df_all_gcs_table_x_out.MEDIUM, df_all_gcs_table_x_out.MEDIUM_SUPPLEMENTS,
+                 df_all_gcs_table_x_out.TEMPERATURE, df_all_gcs_table_x_out.OPTICAL_DENSITY,
+                 df_all_gcs_table_x_out.GROWTH_PHASE, df_all_gcs_table_x_out.AGITATION_SPEED,
+                 df_all_gcs_table_x_out.AERATION, df_all_gcs_table_x_out.pH,
+                 df_all_gcs_table_x_out.num_gc_classes, df_all_gcs_table_x_out.num_gc_phrases],
+         # line_color=[df.Color], fill_color=[df.Color],
+         align='center', font=dict(color='black', size=11)
+       ))
+     ])
+     fig.show()
+ 
+ 
+ path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
+ path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
+ # Curator: PL
+ file_in = "PL&VT_GC-PMID.tsv"
+ file_out = "PL&VT_GC-PMID-GCs.tsv"
+ file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
+ file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
+ ### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
+ 
+ # Curator: SG
+ file_in = "GC-Catalog_resumido-sgama.tsv"
+ file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
+ file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
+ file_gcs_x = "GC-Catalog_resumido-sgama-GCs-All-X.tsv"
+ extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
--- a/data-sets/bin/extract-gcs-curadores_v2.py 0 → 100644
View file @2088c9b
+++ b/data-sets/bin/extract-gcs-curadores_v2.py 0 → 100644
View file @2088c9b
+ from plotly.colors import n_colors
+ import plotly.graph_objects as go
+ import numpy as np
+ import os
+ import pandas as pd
+ 
+ def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
+     df_output_table = pd.DataFrame()
+     # df_output_table_bin = pd.DataFrame()
+     df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+     print("Shape df_input_table: {}".format(df_input_table.shape))
+     # print(df_input_table.head())
+     hash_all_gcs_control = {}
+     hash_all_gcs_num_control = {}
+     hash_all_gcs_test = {}
+     hash_all_gcs_num_test = {}
+     hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
+         3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
+         10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
+     idx = 0
+     for ind in df_input_table.index:
+         hash_gcs_table = {}
+         hash_gcs = {}
+         num_gcs = 0
+         pmid = df_input_table['Reference'][ind]
+         hash_gcs_table['pmid'] = str(pmid)
+         # print("Control_Growth_Condition...")
+         gcs_control = df_input_table['Control_Growth_Condition'][ind]
+         list_gcs_control = gcs_control.split('|')
+         hash_gcs_table['IDX'] = idx
+         hash_gcs_table['GC_TYPE'] = 'Control'
+         for gc in list_gcs_control:
+             if gc != '':
+                 gc_class = hash_gc_classes[list_gcs_control.index(gc)]
+                 # print("gc_class: {}".format(gc_class))
+                 hash_gcs_table[gc_class] = gc
+                 hash_gcs[gc_class] = gc
+                 num_gcs += 1
+         str_gcs = ", ".join(hash_gcs.keys())
+         if str_gcs in hash_all_gcs_control:
+             hash_all_gcs_control[str_gcs] += 1
+         else:
+             hash_all_gcs_control[str_gcs] = 1
+             hash_all_gcs_num_control[str_gcs] = num_gcs
+         df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
+ 
+         hash_gcs_table = {}
+         hash_gcs = {}
+         num_gcs = 0
+         hash_gcs_table['pmid'] = str(pmid)
+         # print("Test_Growth_Condition...")
+         gcs_test = df_input_table['Test_Growth_Condition'][ind]
+         list_gcs_test = gcs_test.split('|')
+         hash_gcs_table['IDX'] = idx
+         hash_gcs_table['GC_TYPE'] = 'Test'
+         for gc in list_gcs_test:
+             if gc != '':
+                 gc_class = hash_gc_classes[list_gcs_test.index(gc)]
+                 # print("gc_class: {}".format(gc_class))
+                 hash_gcs_table[gc_class] = gc
+                 hash_gcs[gc_class] = gc
+                 num_gcs += 1
+             # hash_gcs_bin[gc_class] = int(1)
+         str_gcs = ", ".join(hash_gcs.keys())
+         # print(str_gcs)
+         if str_gcs in hash_all_gcs_test:
+             hash_all_gcs_test[str_gcs] += 1
+         else:
+             hash_all_gcs_test[str_gcs] = 1
+             hash_all_gcs_num_test[str_gcs] = num_gcs
+ 
+         df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
+         idx += 1
+ 
+     df_output_table = df_output_table.replace(np.nan, '')
+     # print(df_output_table.head())
+     df_output_table_csv = df_output_table[['IDX', 'GC_TYPE', 'ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE']].copy()
+         # 'AGITATION_SPEED', 'AERATION', 'pH'
+     # df.sort_values(by=['Country', 'Continent'],
+     #                ascending=[False, True])
+     # print(df_output_table_bin)
+     df_output_table_csv.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+ 
+     ### Color table
+     columns = list(df_output_table_csv)
+     print(columns)
+     df_output_table_color = pd.DataFrame()
+     for ind in range(df_output_table_csv.index[-1]+1):
+         # print("ind: {}".format(ind))
+         # https://www.geeksforgeeks.org/iterating-over-rows-and-columns-in-pandas-dataframe/
+         if df_output_table_csv['GC_TYPE'][ind] == 'Control':
+             hash_output_table_control = {}
+             for i in columns:
+                 # print("i: {}".format(i))
+                 # printing the element of the column
+                 # print(df_output_table_csv[i][ind])
+                 if df_output_table_csv[i][ind] == '':
+                     hash_output_table_control[i + '_COLOR'] = 'rgb(239, 243, 255)'
+                 else:
+                     hash_output_table_control[i + '_COLOR'] = 'rgb(189, 215, 231)'
+                 hash_output_table_control[i] = df_output_table_csv[i][ind]
+             hash_output_table_control['IDX'] = df_output_table_csv['IDX'][ind]
+             hash_output_table_control['IDX_COLOR'] = 'rgb(189, 215, 231)'
+             hash_output_table_control['GC_TYPE'] = df_output_table_csv['GC_TYPE'][ind]
+             hash_output_table_control['GC_TYPE_COLOR'] = 'rgb(189, 215, 231)'
+             df_output_table_color = df_output_table_color.append(hash_output_table_control, ignore_index=True)
+         elif df_output_table_csv['GC_TYPE'][ind] == 'Test':
+             hash_output_table_test = {}
+             for i in columns:
+                 # printing the element of the column
+                 # print(df_output_table_csv[i][ind])
+                 if df_output_table_csv[i][ind] == '':
+                     hash_output_table_test[i + '_COLOR'] = 'rgb(239, 243, 255)'
+                 elif df_output_table_csv[i][ind] != df_output_table_csv[i][ind-1]:
+                     hash_output_table_test[i + '_COLOR'] = 'rgb(107, 174, 214)'
+                 else:
+                     hash_output_table_test[i + '_COLOR'] = 'rgb(189, 215, 245)'
+                 hash_output_table_test[i] = df_output_table_csv[i][ind]
+             hash_output_table_test['IDX'] = df_output_table_csv['IDX'][ind]
+             hash_output_table_test['IDX_COLOR'] = 'rgb(189, 215, 245)'
+             hash_output_table_test['GC_TYPE'] = df_output_table_csv['GC_TYPE'][ind]
+             hash_output_table_test['GC_TYPE_COLOR'] = 'rgb(189, 215, 245)'
+             df_output_table_color = df_output_table_color.append(hash_output_table_test, ignore_index=True)
+ 
+     df_output_table_color[['IDX', 'GC_TYPE', 'ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+        'ORGANISM_COLOR', 'GENETIC_BACKGROUND_COLOR', 'MEDIUM_COLOR',
+        'MEDIUM_SUPPLEMENTS_COLOR', 'TEMPERATURE_COLOR', 'OPTICAL_DENSITY_COLOR',
+        'GROWTH_PHASE_COLOR'
+                            ]].to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)
+     # https://plotly.com/python/table/
+     # Seleccionar color: https://redketchup.io/color-picker
+     colors = n_colors('rgb(255, 200, 200)', 'rgb(200, 0, 0)', 3, colortype='rgb')
+     fig = go.Figure(data=[go.Table(
+       header=dict(
+         values=['<b>IDX', '<b>GC_TYPE</b>', '<b>ORGANISM</b>', '<b>GENETIC_BACKGROUND</b>', '<b>MEDIUM</b>',
+         '<b>MEDIUM_SUPPLEMENTS</b>', '<b>TEMPERATURE</b>', '<b>OPTICAL_DENSITY</b>', '<b>GROWTH_PHASE</b>'],
+         line_color='black', fill_color='white',
+         align='center', font=dict(color='black', size=12)
+       ),
+       cells=dict(
+         values=[df_output_table_color.IDX,
+                 df_output_table_color.GC_TYPE,
+                 df_output_table_color.ORGANISM,
+                 df_output_table_color.GENETIC_BACKGROUND,
+                 df_output_table_color.MEDIUM,
+                 df_output_table_color.MEDIUM_SUPPLEMENTS,
+                 df_output_table_color.TEMPERATURE,
+                 df_output_table_color.OPTICAL_DENSITY,
+                 df_output_table_color.GROWTH_PHASE],
+           #, df_output_table_color.AGITATION_SPEED,
+           #      df_output_table_color.AERATION, df_output_table_color.pH,
+           #      df_output_table_color.num_gc_classes, df_output_table_color.num_gc_phrases],
+         # line_color=[df.Color],
+         fill_color=[df_output_table_color.IDX_COLOR,
+                     df_output_table_color.GC_TYPE_COLOR,
+                 df_output_table_color.ORGANISM_COLOR,
+                 df_output_table_color.GENETIC_BACKGROUND_COLOR,
+                 df_output_table_color.MEDIUM_COLOR,
+                 df_output_table_color.MEDIUM_SUPPLEMENTS_COLOR,
+                 df_output_table_color.TEMPERATURE_COLOR,
+                 df_output_table_color.OPTICAL_DENSITY_COLOR,
+                 df_output_table_color.GROWTH_PHASE_COLOR],
+         align='center', font=dict(color='black', size=11)
+       ))
+     ])
+     # fig.show() os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')
+     fig.write_image(os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')), height=2500, width=1800)
+     quit()
+ 
+ 
+     df_all_gcs = pd.DataFrame()
+     df_all_gcs_num = pd.DataFrame()
+     print(hash_all_gcs_control)
+     print(hash_all_gcs_test)
+     # quit()
+     for all_gcs, num_gcs in hash_all_gcs_control.items():
+         # https://www.geeksforgeeks.org/how-to-append-a-list-as-a-row-to-a-pandas-dataframe-in-python/
+         # df = df.append(pd.DataFrame(list, columns=['Name', 'Age', 'City', 'Country']), ignore_index=True)
+         df_all_gcs = df_all_gcs.append(pd.DataFrame([['Control', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_phrases']), ignore_index=True)
+     for all_gcs, num_gcs in hash_all_gcs_test.items():
+         df_all_gcs = df_all_gcs.append(pd.DataFrame([['Test', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_phrases']), ignore_index=True)
+     # print(df_all_gcs)
+     for all_gcs, num_gcs in hash_all_gcs_num_control.items():
+         df_all_gcs_num = df_all_gcs_num.append(pd.DataFrame([['Control', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_classes']), ignore_index=True)
+     for all_gcs, num_gcs in hash_all_gcs_num_test.items():
+         df_all_gcs_num = df_all_gcs_num.append(pd.DataFrame([['Test', all_gcs, num_gcs]],
+             columns=['class_type', 'gc_phrase', 'num_gc_classes']), ignore_index=True)
+     # print(df_all_gcs_num)
+     # quit()
+ 
+     df_join = df_all_gcs.join(df_all_gcs_num.set_index(['class_type', 'gc_phrase']), on=['class_type', 'gc_phrase'], lsuffix='_all', rsuffix='_num')
+     # print(df_join)
+     # quit()
+ 
+     # df_all_gcs_table = df_all_gcs.set_index('0').join(df_all_gcs_num.set_index('0'))
+     # df_all_gcs_table = df_join[['0_all', '1_num', '1_all']].copy()
+     # df_all_gcs_table.columns = ['Clases de condiciones (frase)', 'Número de clases', 'Numero de repeticiones de la frase']
+     # df_all_gcs_table.columns = ['gc_phrase', 'num_gc_classes', 'num_gc_phrases']
+     df_all_gcs_table = df_join.copy()
+     df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
+ 
+ def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
+     df_output_table = pd.DataFrame()
+     # df_output_table_bin = pd.DataFrame()
+     df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
+     print("Shape df_input_table: {}".format(df_input_table.shape))
+     # print(df_input_table.head())
+     hash_all_gcs = {}
+     hash_all_gcs_num = {}
+     for ind in df_input_table.index:
+         hash_gcs_table = {}
+         hash_gcs = {}
+         num_gcs = 0
+         # hash_gcs_bin = {}
+         pmid = df_input_table['PMID'][ind]
+         coleccion = df_input_table['Colección'][ind]
+         hash_gcs_table['pmid'] = str(pmid)
+         # hash_gcs_bin['pmid'] = str(pmid)
+         hash_gcs_table['coleccion'] = coleccion
+         gcs = df_input_table['GC'][ind]
+         list_gcs = gcs.split('|')
+         for gc in list_gcs:
+             list_gc = gc.split(':')
+             gc_class = list_gc[0]
+             gc_class = gc_class.strip()
+             gc_class = gc_class.strip("'")
+             gc_term = list_gc[1]
+             gc_term = gc_term.strip()
+             gc_term = gc_term.strip("'")
+             hash_gcs_table[gc_class] = gc_term
+             hash_gcs[gc_class] = gc_term
+             num_gcs += 1
+             # hash_gcs_bin[gc_class] = int(1)
+         str_gcs = ", ".join(hash_gcs.keys())
+         # print(str_gcs)
+         if str_gcs in hash_all_gcs:
+             hash_all_gcs[str_gcs] += 1
+         else:
+             hash_all_gcs[str_gcs] = 1
+             hash_all_gcs_num[str_gcs] = num_gcs
+         df_output_table = df_output_table.append(hash_gcs_table, ignore_index=True)
+         # df_output_table_bin = df_output_table_bin.append(hash_gcs_bin, ignore_index=True)
+         # print(df_output_table)
+         # quit()
+     # df_output_table_bin = df_output_table_bin.replace(np.nan, 0)
+     # df_output_table_bin = pd.DataFrame(np.sort(df_output_table_bin.values, axis=0), index=df_output_table_bin.index, columns=df_output_table_bin.columns)
+     df_output_table = df_output_table.replace(np.nan, '')
+     # df.sort_values(by=['Country', 'Continent'],
+     #                ascending=[False, True])
+     # print(df_output_table_bin)
+     df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)
+     # df_output_table_bin.to_csv(os.path.join(path_output, file_bin), sep='\t', index=False, header=True)
+ 
+     df_all_gcs = pd.DataFrame(list(hash_all_gcs.items()))
+     df_all_gcs_num = pd.DataFrame(list(hash_all_gcs_num.items()))
+     df_join = df_all_gcs.join(df_all_gcs_num, lsuffix='_all', rsuffix='_num')
+     # df_all_gcs_table = df_all_gcs.set_index('0').join(df_all_gcs_num.set_index('0'))
+     df_all_gcs_table = df_join[['0_all', '1_num', '1_all']].copy()
+     # df_all_gcs_table.columns = ['Clases de condiciones (frase)', 'Número de clases', 'Numero de repeticiones de la frase']
+     df_all_gcs_table.columns = ['gc_phrase', 'num_gc_classes', 'num_gc_phrases']
+     df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
+ 
+     df_all_gcs_table_x = pd.DataFrame()
+     for ind in df_all_gcs_table.index:
+         hash_gcs_table_x = {}
+         phrase = df_all_gcs_table['gc_phrase'][ind]
+         list_phrase = phrase.split(',')
+         for gc_class in list_phrase:
+             gc_class = gc_class.strip()
+             hash_gcs_table_x[gc_class] = 'X'
+         hash_gcs_table_x['num_gc_classes'] = df_all_gcs_table['num_gc_classes'][ind]
+         hash_gcs_table_x['num_gc_phrases'] = df_all_gcs_table['num_gc_phrases'][ind]
+         df_all_gcs_table_x = df_all_gcs_table_x.append(hash_gcs_table_x, ignore_index=True)
+     df_all_gcs_table_x_out = df_all_gcs_table_x[['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+         'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']].copy()
+     df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)
+ 
+     # https://plotly.com/python/table/
+     fig = go.Figure(data=[go.Table(
+       header=dict(
+         values=['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
+         'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
+         'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases'],
+         line_color='white', fill_color='white',
+         align='center', font=dict(color='black', size=12)
+       ),
+       cells=dict(
+         values=[df_all_gcs_table_x_out.ORGANISM, df_all_gcs_table_x_out.GENETIC_BACKGROUND,
+                 df_all_gcs_table_x_out.MEDIUM, df_all_gcs_table_x_out.MEDIUM_SUPPLEMENTS,
+                 df_all_gcs_table_x_out.TEMPERATURE, df_all_gcs_table_x_out.OPTICAL_DENSITY,
+                 df_all_gcs_table_x_out.GROWTH_PHASE, df_all_gcs_table_x_out.AGITATION_SPEED,
+                 df_all_gcs_table_x_out.AERATION, df_all_gcs_table_x_out.pH,
+                 df_all_gcs_table_x_out.num_gc_classes, df_all_gcs_table_x_out.num_gc_phrases],
+         # line_color=[df.Color], fill_color=[df.Color],
+         align='center', font=dict(color='black', size=11)
+       ))
+     ])
+     fig.show()
+ 
+ 
+ path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
+ path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
+ # Curator: PL
+ file_in = "PL&VT_GC-PMID.tsv"
+ file_out = "PL&VT_GC-PMID-GCs.tsv"
+ file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
+ file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
+ ### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
+ 
+ # Curator: SG
+ file_in = "GC-Catalog_resumido-sgama.tsv"
+ file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
+ file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
+ file_gcs_color = "GC-Catalog_resumido-sgama-GCs-color.tsv"
+ extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_color)