# cmendezc
#
# Extracción de GCs de literatura (extraction of growth conditions, GCs, from the literature).

import plotly.graph_objects as go
import numpy as np
import os
import pandas as pd
def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Tabulate the growth conditions (GCs) found in the SG curator catalog.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each
    input row must provide the columns ``Reference``,
    ``Control_Growth_Condition`` and ``Test_Growth_Condition``. The two
    condition columns hold ``|``-separated fields; the *position* of a
    non-empty field selects its GC class (see ``hash_gc_classes``).

    Two tab-separated files are written into ``path_output``:

    * ``file_output``: for every input row a ``control`` row followed by a
      ``test`` row, with the columns ``pmid``, ``idx``, ``gc_type`` and one
      column per GC class that occurs (first-seen order). Missing values are
      written as empty strings.
    * ``file_all_gcs``: one row per distinct "phrase" (the comma-joined list
      of GC classes present in a condition) and condition type, with the
      columns ``class_type``, ``gc_phrase``, ``num_gc_phrases`` (how often
      the phrase occurs) and ``num_gc_classes`` (classes in the phrase).

    Changes with respect to the previous implementation:

    * Rows are collected in lists and turned into data frames once;
      ``DataFrame.append`` no longer exists in pandas >= 2.0.
    * The class of a field is taken from its position via ``enumerate``.
      The former ``list.index(value)`` lookup returned the position of the
      first equal value, so a value repeated in two fields was assigned to
      the wrong class.
    * The function used to call ``quit()`` after writing ``file_all_gcs``,
      terminating the interpreter; everything after that call (the "X"
      table and the plotly figure) was unreachable. It now simply returns
      at that point and the unreachable code has been removed.

    Args:
        path_input: Directory containing ``file_input``.
        file_input: Name of the tab-separated input file.
        path_output: Directory the output files are written to.
        file_output: Name of the per-condition output table.
        file_all_gcs: Name of the phrase summary table.
        file_all_gcs_x: Kept for interface compatibility; no file with this
            name is written (it never was, see above).

    Returns:
        None.

    Raises:
        KeyError: If a required column is missing, or a non-empty field sits
            at a position that has no GC class (4, 6, 7 or beyond 12).
        AttributeError: If a condition cell is not a string (for instance an
            empty cell, which pandas reads as NaN).
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    print(df_input_table.head())

    # Field position inside a '|'-separated condition -> GC class.
    # Positions 4, 6 and 7 intentionally have no entry in the original table.
    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                       3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
                       10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    # (label written to the tables, input column holding the condition)
    gc_columns = (('control', 'Control_Growth_Condition'),
                  ('test', 'Test_Growth_Condition'))

    phrase_counts = {gc_type: {} for gc_type, _ in gc_columns}
    phrase_sizes = {gc_type: {} for gc_type, _ in gc_columns}
    output_rows = []
    for idx, ind in enumerate(df_input_table.index):
        pmid = df_input_table['Reference'][ind]
        for gc_type, column in gc_columns:
            row = {'pmid': str(pmid), 'idx': idx, 'gc_type': gc_type}
            classes = []
            fields = df_input_table[column][ind].split('|')
            for position, gc in enumerate(fields):
                if gc != '':
                    gc_class = hash_gc_classes[position]
                    row[gc_class] = gc
                    classes.append(gc_class)
            str_gcs = ", ".join(classes)
            counts = phrase_counts[gc_type]
            counts[str_gcs] = counts.get(str_gcs, 0) + 1
            phrase_sizes[gc_type][str_gcs] = len(classes)
            output_rows.append(row)

    # Classes absent from a condition show up as NaN; write them as ''.
    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    print(phrase_counts['control'])
    print(phrase_counts['test'])

    # Control phrases first, then test phrases, each in first-seen order.
    summary_rows = [
        [gc_type, phrase, count, phrase_sizes[gc_type][phrase]]
        for gc_type, _ in gc_columns
        for phrase, count in phrase_counts[gc_type].items()
    ]
    df_all_gcs_table = pd.DataFrame(
        summary_rows,
        columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
    print(df_all_gcs_table)
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
    return None
def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Tabulate the growth conditions (GCs) found in the PL curator file.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each
    input row must provide the columns ``PMID``, ``Colección`` and ``GC``.
    A ``GC`` cell holds ``|``-separated ``class: term`` entries; surrounding
    blanks and single quotes are stripped from both parts.

    Three tab-separated files are written into ``path_output``:

    * ``file_output``: one row per input row with ``pmid``, ``coleccion``
      and one column per GC class that occurs (first-seen order); missing
      values are written as empty strings.
    * ``file_all_gcs``: one row per distinct "phrase" (comma-joined GC
      classes of a row) with ``gc_phrase``, ``num_gc_classes`` and
      ``num_gc_phrases`` (number of rows showing that phrase).
    * ``file_all_gcs_x``: the same phrases as a presence matrix with an
      ``X`` under every class of the phrase, restricted to the ten fixed
      class columns plus the two counters.

    Finally the presence matrix is displayed as a plotly table
    (``fig.show()``).

    Changes with respect to the previous implementation:

    * Rows are collected in lists and turned into data frames once;
      ``DataFrame.append`` no longer exists in pandas >= 2.0.
    * An entry is split at its first colon only, so a term that itself
      contains ``:`` is no longer truncated.
    * The presence matrix is built with ``reindex``: a class column that
      never occurs is written empty instead of raising ``KeyError``.
    * The summary table is built directly from the two dictionaries instead
      of a positional join of two separately built frames.

    Args:
        path_input: Directory containing ``file_input``.
        file_input: Name of the tab-separated input file.
        path_output: Directory the output files are written to.
        file_output: Name of the per-row output table.
        file_all_gcs: Name of the phrase summary table.
        file_all_gcs_x: Name of the presence-matrix table.

    Returns:
        None.

    Raises:
        KeyError: If a required input column is missing.
        IndexError: If a ``GC`` entry contains no colon.
        AttributeError: If a ``GC`` cell is not a string (for instance an
            empty cell, which pandas reads as NaN).
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    hash_all_gcs = {}      # phrase -> number of rows showing it
    hash_all_gcs_num = {}  # phrase -> number of entries of the last such row
    output_rows = []
    for ind in df_input_table.index:
        row = {'pmid': str(df_input_table['PMID'][ind]),
               'coleccion': df_input_table['Colección'][ind]}
        hash_gcs = {}
        num_gcs = 0
        for gc in df_input_table['GC'][ind].split('|'):
            # Split at the first colon only: the term may contain colons.
            list_gc = gc.split(':', 1)
            gc_class = list_gc[0].strip().strip("'")
            gc_term = list_gc[1].strip().strip("'")
            row[gc_class] = gc_term
            hash_gcs[gc_class] = gc_term
            num_gcs += 1
        str_gcs = ", ".join(hash_gcs.keys())
        hash_all_gcs[str_gcs] = hash_all_gcs.get(str_gcs, 0) + 1
        hash_all_gcs_num[str_gcs] = num_gcs
        output_rows.append(row)

    # Classes absent from a row show up as NaN; write them as ''.
    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    df_all_gcs_table = pd.DataFrame(
        [[phrase, hash_all_gcs_num[phrase], count] for phrase, count in hash_all_gcs.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    x_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                 'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                 'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']
    x_rows = []
    for phrase, count in hash_all_gcs.items():
        x_row = {gc_class.strip(): 'X' for gc_class in phrase.split(',')}
        x_row['num_gc_classes'] = hash_all_gcs_num[phrase]
        x_row['num_gc_phrases'] = count
        x_rows.append(x_row)
    # reindex keeps exactly the fixed columns: unknown classes are dropped,
    # classes that never occur become empty columns.
    df_all_gcs_table_x_out = pd.DataFrame(x_rows).reindex(columns=x_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=x_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in x_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
# Input and output live in the same curator data-set directory.
path_in = path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (the call is currently disabled)
file_in, file_out, file_gcs, file_gcs_x = (
    "PL&VT_GC-PMID.tsv",
    "PL&VT_GC-PMID-GCs.tsv",
    "PL&VT_GC-PMID-GCs-All.tsv",
    "PL&VT_GC-PMID-GCs-All-X.tsv",
)
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG (overrides the PL file names set above)
file_in, file_out, file_gcs, file_gcs_x = (
    "GC-Catalog_resumido-sgama.tsv",
    "GC-Catalog_resumido-sgama-GCs.tsv",
    "GC-Catalog_resumido-sgama-GCs-All.tsv",
    "GC-Catalog_resumido-sgama-GCs-All-X.tsv",
)
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
from plotly.colors import n_colors
import plotly.graph_objects as go
import numpy as np
import os
import pandas as pd
def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Build the colour-coded control/test GC table of the SG curator catalog.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each
    input row must provide the columns ``Reference``,
    ``Control_Growth_Condition`` and ``Test_Growth_Condition``. The two
    condition columns hold ``|``-separated fields; the *position* of a
    non-empty field selects its GC class (see ``hash_gc_classes``).

    Outputs, all inside ``path_output``:

    * ``file_output``: for every input row a ``Control`` row followed by a
      ``Test`` row with the columns ``IDX``, ``GC_TYPE`` and the seven
      displayed GC classes (``ORGANISM`` ... ``GROWTH_PHASE``); missing
      values are written as empty strings.
    * ``file_all_gcs_color``: the same table plus one ``<class>_COLOR``
      column per displayed GC class.
    * An image of the coloured table rendered with plotly, named like
      ``file_all_gcs_color`` with ``.tsv`` replaced by ``.png``.

    Cell colours: an empty cell gets ``rgb(239, 243, 255)``; a filled
    control cell ``rgb(189, 215, 231)``; a filled test cell
    ``rgb(107, 174, 214)`` when its value differs from the control row just
    above it and ``rgb(189, 215, 245)`` otherwise. The ``IDX`` and
    ``GC_TYPE`` cells always use the row's base colour.

    Changes with respect to the previous implementation:

    * Rows are collected in lists and turned into data frames once;
      ``DataFrame.append`` no longer exists in pandas >= 2.0.
    * The class of a field is taken from its position via ``enumerate``.
      The former ``list.index(value)`` lookup returned the position of the
      first equal value, so a value repeated in two fields was assigned to
      the wrong class.
    * The displayed columns are selected with ``reindex``, so a class that
      never occurs yields an empty column instead of ``KeyError``, and an
      input without rows no longer fails with ``IndexError``.
    * The first header cell now closes its ``<b>`` tag like the others, and
      the unused ``colors`` local was removed.
    * The function used to call ``quit()`` after writing the image,
      terminating the interpreter; the phrase-summary code after that call
      was unreachable. It now simply returns and the unreachable code has
      been removed.

    Args:
        path_input: Directory containing ``file_input``.
        file_input: Name of the tab-separated input file.
        path_output: Directory the output files are written to.
        file_output: Name of the plain control/test table.
        file_all_gcs: Kept for interface compatibility; no file with this
            name is written (it never was, see above).
        file_all_gcs_color: Name of the table with colour columns; also the
            base name of the rendered image.

    Returns:
        None.

    Raises:
        KeyError: If a required column is missing, or a non-empty field sits
            at a position that has no GC class (4, 6, 7 or beyond 12).
        AttributeError: If a condition cell is not a string (for instance an
            empty cell, which pandas reads as NaN).
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    # Field position inside a '|'-separated condition -> GC class.
    # Positions 4, 6 and 7 intentionally have no entry in the original table.
    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                       3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
                       10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    # Only these classes are displayed; AGITATION_SPEED, AERATION and pH are
    # parsed but left out of the tables, as before.
    class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                     'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE']
    table_columns = ['IDX', 'GC_TYPE'] + class_columns
    # (label written to the tables, input column holding the condition)
    gc_columns = (('Control', 'Control_Growth_Condition'),
                  ('Test', 'Test_Growth_Condition'))

    output_rows = []
    for idx, ind in enumerate(df_input_table.index):
        pmid = df_input_table['Reference'][ind]
        for gc_type, column in gc_columns:
            row = {'pmid': str(pmid), 'IDX': idx, 'GC_TYPE': gc_type}
            fields = df_input_table[column][ind].split('|')
            for position, gc in enumerate(fields):
                if gc != '':
                    row[hash_gc_classes[position]] = gc
            output_rows.append(row)

    # Keep the displayed columns only; absent classes become ''.
    df_output_table_csv = (pd.DataFrame(output_rows)
                           .reindex(columns=table_columns)
                           .replace(np.nan, ''))
    df_output_table_csv.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    ### Color table
    print(table_columns)
    color_empty = 'rgb(239, 243, 255)'
    color_control = 'rgb(189, 215, 231)'
    color_test_same = 'rgb(189, 215, 245)'
    color_test_changed = 'rgb(107, 174, 214)'

    records = df_output_table_csv.to_dict('records')
    color_rows = []
    for position, record in enumerate(records):
        is_test = record['GC_TYPE'] == 'Test'
        colored = {}
        for column in table_columns:
            value = record[column]
            if value == '':
                color = color_empty
            elif not is_test:
                color = color_control
            elif value != records[position - 1][column]:
                # A test row always follows its control row, so the previous
                # record is the control condition of the same input row.
                color = color_test_changed
            else:
                color = color_test_same
            colored[column] = value
            colored[column + '_COLOR'] = color
        # IDX and GC_TYPE are never highlighted as "changed".
        base_color = color_test_same if is_test else color_control
        colored['IDX_COLOR'] = base_color
        colored['GC_TYPE_COLOR'] = base_color
        color_rows.append(colored)

    color_columns = [column + '_COLOR' for column in table_columns]
    df_output_table_color = pd.DataFrame(color_rows).reindex(columns=table_columns + color_columns)
    exported_columns = table_columns + [column + '_COLOR' for column in class_columns]
    df_output_table_color[exported_columns].to_csv(
        os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    # Colour picker: https://redketchup.io/color-picker
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=['<b>{}</b>'.format(column) for column in table_columns],
            line_color='black', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_output_table_color[column] for column in table_columns],
            fill_color=[df_output_table_color[column] for column in color_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.write_image(os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')), height=2500, width=1800)
    return None
def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Tabulate the growth conditions (GCs) found in the PL curator file.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each
    input row must provide the columns ``PMID``, ``Colección`` and ``GC``.
    A ``GC`` cell holds ``|``-separated ``class: term`` entries; surrounding
    blanks and single quotes are stripped from both parts.

    Three tab-separated files are written into ``path_output``:

    * ``file_output``: one row per input row with ``pmid``, ``coleccion``
      and one column per GC class that occurs (first-seen order); missing
      values are written as empty strings.
    * ``file_all_gcs``: one row per distinct "phrase" (comma-joined GC
      classes of a row) with ``gc_phrase``, ``num_gc_classes`` and
      ``num_gc_phrases`` (number of rows showing that phrase).
    * ``file_all_gcs_color``: the same phrases as a presence matrix with an
      ``X`` under every class of the phrase, restricted to the ten fixed
      class columns plus the two counters (no colour information is
      written, despite the parameter name).

    Finally the presence matrix is displayed as a plotly table
    (``fig.show()``).

    Changes with respect to the previous implementation:

    * Rows are collected in lists and turned into data frames once;
      ``DataFrame.append`` no longer exists in pandas >= 2.0.
    * An entry is split at its first colon only, so a term that itself
      contains ``:`` is no longer truncated.
    * The presence matrix is built with ``reindex``: a class column that
      never occurs is written empty instead of raising ``KeyError``.
    * The summary table is built directly from the two dictionaries instead
      of a positional join of two separately built frames.

    Args:
        path_input: Directory containing ``file_input``.
        file_input: Name of the tab-separated input file.
        path_output: Directory the output files are written to.
        file_output: Name of the per-row output table.
        file_all_gcs: Name of the phrase summary table.
        file_all_gcs_color: Name of the presence-matrix table.

    Returns:
        None.

    Raises:
        KeyError: If a required input column is missing.
        IndexError: If a ``GC`` entry contains no colon.
        AttributeError: If a ``GC`` cell is not a string (for instance an
            empty cell, which pandas reads as NaN).
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    hash_all_gcs = {}      # phrase -> number of rows showing it
    hash_all_gcs_num = {}  # phrase -> number of entries of the last such row
    output_rows = []
    for ind in df_input_table.index:
        row = {'pmid': str(df_input_table['PMID'][ind]),
               'coleccion': df_input_table['Colección'][ind]}
        hash_gcs = {}
        num_gcs = 0
        for gc in df_input_table['GC'][ind].split('|'):
            # Split at the first colon only: the term may contain colons.
            list_gc = gc.split(':', 1)
            gc_class = list_gc[0].strip().strip("'")
            gc_term = list_gc[1].strip().strip("'")
            row[gc_class] = gc_term
            hash_gcs[gc_class] = gc_term
            num_gcs += 1
        str_gcs = ", ".join(hash_gcs.keys())
        hash_all_gcs[str_gcs] = hash_all_gcs.get(str_gcs, 0) + 1
        hash_all_gcs_num[str_gcs] = num_gcs
        output_rows.append(row)

    # Classes absent from a row show up as NaN; write them as ''.
    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    df_all_gcs_table = pd.DataFrame(
        [[phrase, hash_all_gcs_num[phrase], count] for phrase, count in hash_all_gcs.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    x_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                 'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                 'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']
    x_rows = []
    for phrase, count in hash_all_gcs.items():
        x_row = {gc_class.strip(): 'X' for gc_class in phrase.split(',')}
        x_row['num_gc_classes'] = hash_all_gcs_num[phrase]
        x_row['num_gc_phrases'] = count
        x_rows.append(x_row)
    # reindex keeps exactly the fixed columns: unknown classes are dropped,
    # classes that never occur become empty columns.
    df_all_gcs_table_x_out = pd.DataFrame(x_rows).reindex(columns=x_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=x_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in x_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
# Input and output live in the same curator data-set directory.
path_in = path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (the call is currently disabled)
file_in, file_out, file_gcs, file_gcs_x = (
    "PL&VT_GC-PMID.tsv",
    "PL&VT_GC-PMID-GCs.tsv",
    "PL&VT_GC-PMID-GCs-All.tsv",
    "PL&VT_GC-PMID-GCs-All-X.tsv",
)
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG (overrides the PL input/output/summary names set above)
file_in, file_out, file_gcs, file_gcs_color = (
    "GC-Catalog_resumido-sgama.tsv",
    "GC-Catalog_resumido-sgama-GCs.tsv",
    "GC-Catalog_resumido-sgama-GCs-All.tsv",
    "GC-Catalog_resumido-sgama-GCs-color.tsv",
)
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_color)