Showing
2 changed files
with
588 additions
and
0 deletions
data-sets/bin/extract-gcs-curadores_v1.py
0 → 100644
| 1 | +import plotly.graph_objects as go | ||
| 2 | +import numpy as np | ||
| 3 | +import os | ||
| 4 | +import pandas as pd | ||
| 5 | + | ||
def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Flatten curator SG's control/test growth conditions into per-class tables.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each input
    row provides a ``Reference`` column plus ``Control_Growth_Condition`` and
    ``Test_Growth_Condition`` columns; the latter two hold '|'-separated
    fields whose *position* selects the growth-condition class.

    Two tab-separated files are written to ``path_output``:

    * ``file_output``: two rows per input row (control first, then test) with
      the columns ``pmid``, ``idx``, ``gc_type`` and one column per class seen.
    * ``file_all_gcs``: one row per (``class_type``, ``gc_phrase``) pair, where
      the phrase is the comma-separated list of classes present in a
      condition, ``num_gc_phrases`` is how often that phrase occurred and
      ``num_gc_classes`` is the number of classes in the phrase.

    ``file_all_gcs_x`` is accepted for interface compatibility only: the
    original implementation terminated the interpreter with ``quit()`` before
    the X-matrix export could run, so that export stays disabled. The function
    now simply returns instead of exiting the process.

    Raises:
        KeyError: if a non-empty field sits at a position that has no class
            assigned (4, 6, 7, or anything above 12).
    """
    # Position of each growth-condition class inside the '|'-separated string.
    gc_classes_by_position = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                              3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY',
                              9: 'GROWTH_PHASE', 10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}

    def parse_condition(raw_condition):
        """Return ``{gc_class: value}`` for the non-empty fields of one condition."""
        # An empty cell is read by pandas as NaN (a float), which has no fields.
        if not isinstance(raw_condition, str):
            return {}
        # The position must come from enumerate(): list.index(value) returns
        # the FIRST equal value and therefore misplaces repeated values.
        return {gc_classes_by_position[position]: value
                for position, value in enumerate(raw_condition.split('|'))
                if value != ''}

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    print(df_input_table.head())

    condition_columns = (('control', 'Control_Growth_Condition'),
                         ('test', 'Test_Growth_Condition'))
    output_rows = []
    # gc_type -> phrase -> [number of repetitions, number of classes]
    phrase_stats = {'control': {}, 'test': {}}
    for idx, ind in enumerate(df_input_table.index):
        pmid = str(df_input_table['Reference'][ind])
        for gc_type, column in condition_columns:
            gcs = parse_condition(df_input_table[column][ind])
            row = {'pmid': pmid, 'idx': idx, 'gc_type': gc_type}
            row.update(gcs)
            output_rows.append(row)
            stats = phrase_stats[gc_type].setdefault(", ".join(gcs), [0, len(gcs)])
            stats[0] += 1

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and
    # was quadratic when called row by row.
    df_output_table = pd.DataFrame(output_rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    summary_rows = [[gc_type, phrase, repetitions, num_classes]
                    for gc_type, _ in condition_columns
                    for phrase, (repetitions, num_classes) in phrase_stats[gc_type].items()]
    df_all_gcs_table = pd.DataFrame(
        summary_rows, columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
    print(df_all_gcs_table)
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
| 152 | + | ||
def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Flatten curator PL's growth conditions and summarise the class combinations.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each input
    row provides the columns ``PMID``, ``Colección`` and ``GC``; ``GC`` holds
    '|'-separated ``class: term`` items (surrounding whitespace and single
    quotes are stripped from both parts).

    Three tab-separated files are written to ``path_output``:

    * ``file_output``: one row per input row with ``pmid``, ``coleccion`` and
      one column per growth-condition class seen.
    * ``file_all_gcs``: one row per phrase (comma-separated list of the
      classes present in a row) with ``num_gc_classes`` (distinct classes in
      the phrase) and ``num_gc_phrases`` (rows sharing the phrase).
    * ``file_all_gcs_x``: the same phrases as a matrix with an ``X`` under
      every class that belongs to the phrase.

    Finally the matrix is displayed as a plotly table.

    Raises:
        ValueError: if a non-blank ``GC`` item has no ':' separator.
    """
    matrix_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                      'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                      'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    output_rows = []
    # phrase -> [number of repetitions, number of classes]
    phrase_stats = {}
    for ind in df_input_table.index:
        raw_gcs = df_input_table['GC'][ind]
        # An empty cell is read by pandas as NaN (a float), which has no items.
        fragments = raw_gcs.split('|') if isinstance(raw_gcs, str) else []
        gcs = {}
        for fragment in fragments:
            if not fragment.strip():
                continue  # e.g. a trailing '|'
            # Split on the first ':' only so terms that contain a colon survive intact.
            gc_class, separator, gc_term = fragment.partition(':')
            if not separator:
                raise ValueError("Growth condition without ':' separator: {!r}".format(fragment))
            gcs[gc_class.strip().strip("'")] = gc_term.strip().strip("'")

        row = {'pmid': str(df_input_table['PMID'][ind]),
               'coleccion': df_input_table['Colección'][ind]}
        row.update(gcs)
        output_rows.append(row)
        stats = phrase_stats.setdefault(", ".join(gcs), [0, len(gcs)])
        stats[0] += 1

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    df_output_table = pd.DataFrame(output_rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    df_all_gcs_table = pd.DataFrame(
        [[phrase, num_classes, repetitions]
         for phrase, (repetitions, num_classes) in phrase_stats.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    matrix_rows = []
    for phrase, (repetitions, num_classes) in phrase_stats.items():
        matrix_row = {gc_class.strip(): 'X' for gc_class in phrase.split(',')}
        matrix_row['num_gc_classes'] = num_classes
        matrix_row['num_gc_phrases'] = repetitions
        matrix_rows.append(matrix_row)
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection would raise KeyError in that case.
    df_all_gcs_table_x_out = pd.DataFrame(matrix_rows).reindex(columns=matrix_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=matrix_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in matrix_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
| 251 | + | ||
| 252 | + | ||
# Input and output files share the curators' data-set directory.
path_in = path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (the call is currently disabled; the names are reassigned below)
file_in, file_out, file_gcs, file_gcs_x = (
    "PL&VT_GC-PMID.tsv",
    "PL&VT_GC-PMID-GCs.tsv",
    "PL&VT_GC-PMID-GCs-All.tsv",
    "PL&VT_GC-PMID-GCs-All-X.tsv",
)
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in, file_out, file_gcs, file_gcs_x = (
    "GC-Catalog_resumido-sgama.tsv",
    "GC-Catalog_resumido-sgama-GCs.tsv",
    "GC-Catalog_resumido-sgama-GCs-All.tsv",
    "GC-Catalog_resumido-sgama-GCs-All-X.tsv",
)
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
data-sets/bin/extract-gcs-curadores_v2.py
0 → 100644
| 1 | +from plotly.colors import n_colors | ||
| 2 | +import plotly.graph_objects as go | ||
| 3 | +import numpy as np | ||
| 4 | +import os | ||
| 5 | +import pandas as pd | ||
| 6 | + | ||
def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Flatten curator SG's control/test growth conditions and render them colour-coded.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each input
    row provides a ``Reference`` column plus ``Control_Growth_Condition`` and
    ``Test_Growth_Condition`` columns; the latter two hold '|'-separated
    fields whose *position* selects the growth-condition class.

    Written to ``path_output``:

    * ``file_output``: two rows per input row (Control first, then Test) with
      ``IDX``, ``GC_TYPE`` and the seven class columns listed in
      ``gc_class_columns`` below.
    * ``file_all_gcs_color``: the same table plus one ``<class>_COLOR`` column
      per class column. Empty cells get the lightest colour, Test cells whose
      value differs from the Control row right above get the highlight
      colour, and all other cells get the base colour of their row type.
    * the same file name with ``.tsv`` replaced by ``.png``: the table
      rendered by plotly using those fill colours.

    ``file_all_gcs`` is accepted for interface compatibility only: the
    original implementation terminated the interpreter with ``quit()`` before
    the phrase-summary export could run, so that export stays disabled. The
    function now simply returns instead of exiting the process.

    Raises:
        KeyError: if a non-empty field sits at a position that has no class
            assigned (4, 6, 7, or anything above 12).
    """
    # Position of each growth-condition class inside the '|'-separated string.
    gc_classes_by_position = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                              3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY',
                              9: 'GROWTH_PHASE', 10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    # Classes shown in the exported tables (AGITATION_SPEED, AERATION and pH are left out).
    gc_class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM', 'MEDIUM_SUPPLEMENTS',
                        'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE']
    table_columns = ['IDX', 'GC_TYPE'] + gc_class_columns
    color_columns = [column + '_COLOR' for column in table_columns]

    # Colour picker used for these values: https://redketchup.io/color-picker
    color_empty = 'rgb(239, 243, 255)'
    color_changed = 'rgb(107, 174, 214)'
    color_base = {'Control': 'rgb(189, 215, 231)', 'Test': 'rgb(189, 215, 245)'}

    def parse_condition(raw_condition):
        """Return ``{gc_class: value}`` for the non-empty fields of one condition."""
        # An empty cell is read by pandas as NaN (a float), which has no fields.
        if not isinstance(raw_condition, str):
            return {}
        # The position must come from enumerate(): list.index(value) returns
        # the FIRST equal value and therefore misplaces repeated values.
        return {gc_classes_by_position[position]: value
                for position, value in enumerate(raw_condition.split('|'))
                if value != ''}

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    output_rows = []
    for idx, ind in enumerate(df_input_table.index):
        pmid = str(df_input_table['Reference'][ind])
        for gc_type, column in (('Control', 'Control_Growth_Condition'),
                                ('Test', 'Test_Growth_Condition')):
            row = {'pmid': pmid, 'IDX': idx, 'GC_TYPE': gc_type}
            row.update(parse_condition(df_input_table[column][ind]))
            output_rows.append(row)

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    df_output_table = pd.DataFrame(output_rows).fillna('')
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection would raise KeyError in that case.
    df_output_table_csv = df_output_table.reindex(columns=table_columns, fill_value='')
    df_output_table_csv.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    ### Color table
    color_rows = []
    previous_row = None
    for row in df_output_table_csv.to_dict('records'):
        is_test = row['GC_TYPE'] == 'Test'
        base = color_base[row['GC_TYPE']]
        color_row = dict(row)
        for column in gc_class_columns:
            value = row[column]
            if value == '':
                color = color_empty
            elif is_test and value != previous_row[column]:
                # Rows alternate Control/Test, so the previous row is the
                # Control condition this Test condition is compared against.
                color = color_changed
            else:
                color = base
            color_row[column + '_COLOR'] = color
        color_row['IDX_COLOR'] = base
        color_row['GC_TYPE_COLOR'] = base
        color_rows.append(color_row)
        previous_row = row

    df_output_table_color = pd.DataFrame(color_rows).reindex(columns=table_columns + color_columns)
    exported_columns = table_columns + [column + '_COLOR' for column in gc_class_columns]
    df_output_table_color[exported_columns].to_csv(
        os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            # Every header is wrapped in a complete <b>…</b> pair (the original
            # left the IDX header's tag unclosed).
            values=['<b>{}</b>'.format(column) for column in table_columns],
            line_color='black', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_output_table_color[column] for column in table_columns],
            fill_color=[df_output_table_color[column] for column in color_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.write_image(os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')), height=2500, width=1800)
| 206 | + | ||
def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Flatten curator PL's growth conditions and summarise the class combinations.

    Reads the tab-separated file ``file_input`` from ``path_input``. Each input
    row provides the columns ``PMID``, ``Colección`` and ``GC``; ``GC`` holds
    '|'-separated ``class: term`` items (surrounding whitespace and single
    quotes are stripped from both parts).

    Three tab-separated files are written to ``path_output``:

    * ``file_output``: one row per input row with ``pmid``, ``coleccion`` and
      one column per growth-condition class seen.
    * ``file_all_gcs``: one row per phrase (comma-separated list of the
      classes present in a row) with ``num_gc_classes`` (distinct classes in
      the phrase) and ``num_gc_phrases`` (rows sharing the phrase).
    * ``file_all_gcs_color``: despite the parameter name, this receives the
      phrases as a matrix with an ``X`` under every class that belongs to the
      phrase; no colour information is produced here.

    Finally the matrix is displayed as a plotly table.

    Raises:
        ValueError: if a non-blank ``GC`` item has no ':' separator.
    """
    matrix_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                      'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                      'AGITATION_SPEED', 'AERATION', 'pH', 'num_gc_classes', 'num_gc_phrases']

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    output_rows = []
    # phrase -> [number of repetitions, number of classes]
    phrase_stats = {}
    for ind in df_input_table.index:
        raw_gcs = df_input_table['GC'][ind]
        # An empty cell is read by pandas as NaN (a float), which has no items.
        fragments = raw_gcs.split('|') if isinstance(raw_gcs, str) else []
        gcs = {}
        for fragment in fragments:
            if not fragment.strip():
                continue  # e.g. a trailing '|'
            # Split on the first ':' only so terms that contain a colon survive intact.
            gc_class, separator, gc_term = fragment.partition(':')
            if not separator:
                raise ValueError("Growth condition without ':' separator: {!r}".format(fragment))
            gcs[gc_class.strip().strip("'")] = gc_term.strip().strip("'")

        row = {'pmid': str(df_input_table['PMID'][ind]),
               'coleccion': df_input_table['Colección'][ind]}
        row.update(gcs)
        output_rows.append(row)
        stats = phrase_stats.setdefault(", ".join(gcs), [0, len(gcs)])
        stats[0] += 1

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    df_output_table = pd.DataFrame(output_rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    df_all_gcs_table = pd.DataFrame(
        [[phrase, num_classes, repetitions]
         for phrase, (repetitions, num_classes) in phrase_stats.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    matrix_rows = []
    for phrase, (repetitions, num_classes) in phrase_stats.items():
        matrix_row = {gc_class.strip(): 'X' for gc_class in phrase.split(',')}
        matrix_row['num_gc_classes'] = num_classes
        matrix_row['num_gc_phrases'] = repetitions
        matrix_rows.append(matrix_row)
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection would raise KeyError in that case.
    df_all_gcs_table_x_out = pd.DataFrame(matrix_rows).reindex(columns=matrix_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=matrix_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in matrix_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
| 305 | + | ||
| 306 | + | ||
# Input and output files share the curators' data-set directory.
path_in = path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (the call is currently disabled)
file_in, file_out, file_gcs, file_gcs_x = (
    "PL&VT_GC-PMID.tsv",
    "PL&VT_GC-PMID-GCs.tsv",
    "PL&VT_GC-PMID-GCs-All.tsv",
    "PL&VT_GC-PMID-GCs-All-X.tsv",
)
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in, file_out, file_gcs, file_gcs_color = (
    "GC-Catalog_resumido-sgama.tsv",
    "GC-Catalog_resumido-sgama-GCs.tsv",
    "GC-Catalog_resumido-sgama-GCs-All.tsv",
    "GC-Catalog_resumido-sgama-GCs-color.tsv",
)
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_color)
-
Please register or login to post a comment