Showing
2 changed files
with
588 additions
and
0 deletions
data-sets/bin/extract-gcs-curadores_v1.py
0 → 100644
1 | +import plotly.graph_objects as go | ||
2 | +import numpy as np | ||
3 | +import os | ||
4 | +import pandas as pd | ||
5 | + | ||
def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Flatten the control/test growth conditions of a TSV into one row each.

    The input TSV is read from ``path_input/file_input`` and must provide the
    columns ``Reference``, ``Control_Growth_Condition`` and
    ``Test_Growth_Condition``.  Each condition cell is a ``|``-separated
    string whose field *position* selects the growth-condition class (see
    ``hash_gc_classes`` below).

    Two TSV files are written to ``path_output``:

    * ``file_output``: one row per (input row, control/test) with the columns
      ``pmid``, ``idx``, ``gc_type`` and one column per class found.
    * ``file_all_gcs``: one row per distinct combination of classes
      ("phrase") and condition type, with the number of rows using that
      phrase (``num_gc_phrases``) and the number of classes in it
      (``num_gc_classes``).

    ``file_all_gcs_x`` is accepted for backward compatibility only; nothing
    is written to it.  The code that used it sat behind a ``quit()`` call and
    was never reached, so it has been dropped together with the ``quit()``
    (the function now simply returns instead of ending the interpreter).

    Raises:
        KeyError: if a non-empty field sits at a position that has no entry
            in ``hash_gc_classes`` (positions 4, 6 and 7, or beyond 12).
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    print(df_input_table.head())

    # Field position inside the '|'-separated string -> class name.
    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                       3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
                       10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    condition_columns = (('control', 'Control_Growth_Condition'),
                         ('test', 'Test_Growth_Condition'))
    # Per condition type: phrase -> number of rows / number of classes.
    hash_all_gcs = {'control': {}, 'test': {}}
    hash_all_gcs_num = {'control': {}, 'test': {}}

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x and copied the whole
    # frame on every call.
    rows = []
    for idx, ind in enumerate(df_input_table.index):
        pmid = str(df_input_table['Reference'][ind])
        for gc_type, column in condition_columns:
            hash_gcs_table = {'pmid': pmid, 'idx': idx, 'gc_type': gc_type}
            hash_gcs = {}
            # enumerate() gives the real field position.  Looking the value
            # up with list.index() returned the FIRST field holding the same
            # text, so repeated values were assigned to the wrong class.
            for position, gc in enumerate(df_input_table[column][ind].split('|')):
                if gc == '':
                    continue
                gc_class = hash_gc_classes[position]
                hash_gcs_table[gc_class] = gc
                hash_gcs[gc_class] = gc
            str_gcs = ", ".join(hash_gcs)
            hash_all_gcs[gc_type][str_gcs] = hash_all_gcs[gc_type].get(str_gcs, 0) + 1
            hash_all_gcs_num[gc_type][str_gcs] = len(hash_gcs)
            rows.append(hash_gcs_table)

    # Classes missing from a row become empty cells in the TSV.
    df_output_table = pd.DataFrame(rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    print(hash_all_gcs['control'])
    print(hash_all_gcs['test'])
    df_all_gcs_table = pd.DataFrame(
        [(gc_type, phrase, count, hash_all_gcs_num[gc_type][phrase])
         for gc_type, _ in condition_columns
         for phrase, count in hash_all_gcs[gc_type].items()],
        columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
    print(df_all_gcs_table)
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
152 | + | ||
def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Expand the ``GC`` column of a TSV into per-class tables.

    The input TSV is read from ``path_input/file_input`` and must provide the
    columns ``PMID``, ``Colección`` and ``GC``.  A ``GC`` cell is a
    ``|``-separated list of ``class: term`` entries; surrounding blanks and
    single quotes are stripped from both parts.

    Three TSV files are written to ``path_output``:

    * ``file_output``: one row per input row with ``pmid``, ``coleccion``
      and one column per class found in that row.
    * ``file_all_gcs``: one row per distinct combination of classes
      ("phrase") with ``gc_phrase``, ``num_gc_classes`` (entries parsed in
      the last row seen with that phrase) and ``num_gc_phrases`` (rows
      sharing the phrase).
    * ``file_all_gcs_x``: the same phrases as a presence matrix, with ``X``
      under each class of the phrase, restricted to the ten class columns
      listed in ``gc_class_columns``.

    Finally the presence matrix is rendered as a plotly table and shown with
    ``fig.show()``.

    Raises:
        IndexError: if a non-blank entry of ``GC`` contains no ``:``.
    """
    gc_class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                        'AGITATION_SPEED', 'AERATION', 'pH']
    table_columns = gc_class_columns + ['num_gc_classes', 'num_gc_phrases']

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    hash_all_gcs = {}      # phrase -> number of rows with that phrase
    hash_all_gcs_num = {}  # phrase -> number of entries parsed for it
    phrase_classes = {}    # phrase -> class names, in order of appearance
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for ind in df_input_table.index:
        hash_gcs_table = {'pmid': str(df_input_table['PMID'][ind]),
                          'coleccion': df_input_table['Colección'][ind]}
        hash_gcs = {}
        num_gcs = 0
        for gc in df_input_table['GC'][ind].split('|'):
            if not gc.strip():
                # Blank entry, e.g. produced by a trailing '|'.
                continue
            # Split on the first ':' only, so a term that itself contains a
            # colon is kept whole instead of being cut at the second colon.
            list_gc = gc.split(':', 1)
            gc_class = list_gc[0].strip().strip("'")
            gc_term = list_gc[1].strip().strip("'")
            hash_gcs_table[gc_class] = gc_term
            hash_gcs[gc_class] = gc_term
            num_gcs += 1
        str_gcs = ", ".join(hash_gcs)
        hash_all_gcs[str_gcs] = hash_all_gcs.get(str_gcs, 0) + 1
        hash_all_gcs_num[str_gcs] = num_gcs
        phrase_classes[str_gcs] = tuple(hash_gcs)
        rows.append(hash_gcs_table)

    # Classes missing from a row become empty cells in the TSV.
    df_output_table = pd.DataFrame(rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    df_all_gcs_table = pd.DataFrame(
        [(phrase, hash_all_gcs_num[phrase], count) for phrase, count in hash_all_gcs.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    x_rows = []
    for phrase, count in hash_all_gcs.items():
        hash_gcs_table_x = {gc_class: 'X' for gc_class in phrase_classes[phrase]}
        hash_gcs_table_x['num_gc_classes'] = hash_all_gcs_num[phrase]
        hash_gcs_table_x['num_gc_phrases'] = count
        x_rows.append(hash_gcs_table_x)
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection raised KeyError in that case.
    df_all_gcs_table_x_out = pd.DataFrame(x_rows).reindex(columns=table_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=table_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in table_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
251 | + | ||
252 | + | ||
# Directory the input TSV is read from and directory the outputs are written
# to (here the same one).
# NOTE(review): absolute paths on one machine; adjust before running elsewhere.
path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
# Curator: PL
# These four names are only used by the commented-out call below; the
# "Curator: SG" assignments further down overwrite all of them.
file_in = "PL&VT_GC-PMID.tsv"
file_out = "PL&VT_GC-PMID-GCs.tsv"
file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in = "GC-Catalog_resumido-sgama.tsv"
file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
file_gcs_x = "GC-Catalog_resumido-sgama-GCs-All-X.tsv"
# Runs as soon as the file is executed (there is no __main__ guard).
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
data-sets/bin/extract-gcs-curadores_v2.py
0 → 100644
1 | +from plotly.colors import n_colors | ||
2 | +import plotly.graph_objects as go | ||
3 | +import numpy as np | ||
4 | +import os | ||
5 | +import pandas as pd | ||
6 | + | ||
def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Build a colour-coded control/test growth-condition table.

    The input TSV is read from ``path_input/file_input`` and must provide the
    columns ``Control_Growth_Condition`` and ``Test_Growth_Condition``.  Each
    cell is a ``|``-separated string whose field *position* selects the
    growth-condition class (see ``hash_gc_classes`` below).  Every input row
    yields a ``Control`` row followed by a ``Test`` row sharing one ``IDX``.

    Written to ``path_output``:

    * ``file_output``: the table restricted to ``table_columns``.
    * ``file_all_gcs_color``: the same table plus one ``<class>_COLOR``
      column per class column.
    * ``file_all_gcs_color`` with ``.tsv`` replaced by ``.png``: the table
      rendered with plotly, each cell filled with its colour.

    Colours: an empty cell gets ``color_empty``; a filled control cell gets
    ``color_control``; a filled test cell gets ``color_test_changed`` when it
    differs from the row right before it (its control row) and
    ``color_test_same`` otherwise.  ``IDX`` and ``GC_TYPE`` always use the
    base colour of their row type.

    ``file_all_gcs`` is accepted for backward compatibility only; nothing is
    written to it.  The summary code that used it sat behind a ``quit()``
    call and was never reached, so it has been dropped together with the
    ``quit()`` (the function now returns instead of ending the interpreter).

    Raises:
        KeyError: if a non-empty field sits at a position that has no entry
            in ``hash_gc_classes`` (positions 4, 6 and 7, or beyond 12).
    """
    # Field position inside the '|'-separated string -> class name.
    hash_gc_classes = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                       3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY', 9: 'GROWTH_PHASE',
                       10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                     'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE']
    table_columns = ['IDX', 'GC_TYPE'] + class_columns
    color_empty = 'rgb(239, 243, 255)'
    color_control = 'rgb(189, 215, 231)'
    color_test_same = 'rgb(189, 215, 245)'
    color_test_changed = 'rgb(107, 174, 214)'

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for idx, ind in enumerate(df_input_table.index):
        for gc_type, column in (('Control', 'Control_Growth_Condition'),
                                ('Test', 'Test_Growth_Condition')):
            row = {'IDX': idx, 'GC_TYPE': gc_type}
            # enumerate() gives the real field position.  Looking the value
            # up with list.index() returned the FIRST field holding the same
            # text, so repeated values were assigned to the wrong class.
            for position, gc in enumerate(df_input_table[column][ind].split('|')):
                if gc != '':
                    row[hash_gc_classes[position]] = gc
            rows.append(row)

    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data (plain column selection raised KeyError) and also copes
    # with an input that has no rows at all.
    df_output_table_csv = pd.DataFrame(rows).reindex(columns=table_columns).fillna('')
    df_output_table_csv.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    ### Color table
    print(table_columns)
    color_rows = []
    previous = None  # row right before the current one (the control of a test row)
    for record in df_output_table_csv.to_dict('records'):
        colored = dict(record)
        is_control = record['GC_TYPE'] == 'Control'
        base_color = color_control if is_control else color_test_same
        for column in class_columns:
            value = record[column]
            if value == '':
                color = color_empty
            elif is_control:
                color = color_control
            elif previous is None or value != previous[column]:
                color = color_test_changed
            else:
                color = color_test_same
            colored[column + '_COLOR'] = color
        colored['IDX_COLOR'] = base_color
        colored['GC_TYPE_COLOR'] = base_color
        color_rows.append(colored)
        previous = record

    class_color_columns = [column + '_COLOR' for column in class_columns]
    df_output_table_color = pd.DataFrame(
        color_rows, columns=table_columns + ['IDX_COLOR', 'GC_TYPE_COLOR'] + class_color_columns)
    df_output_table_color[table_columns + class_color_columns].to_csv(
        os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    # Colour picker used to choose the fills: https://redketchup.io/color-picker
    fig = go.Figure(data=[go.Table(
        header=dict(
            # Every header is wrapped in a complete <b>...</b> pair (the IDX
            # header used to lack its closing tag).
            values=['<b>{}</b>'.format(column) for column in table_columns],
            line_color='black', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_output_table_color[column] for column in table_columns],
            fill_color=[df_output_table_color[column + '_COLOR'] for column in table_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.write_image(os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')), height=2500, width=1800)
206 | + | ||
def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Expand the ``GC`` column of a TSV into per-class tables.

    The input TSV is read from ``path_input/file_input`` and must provide the
    columns ``PMID``, ``Colección`` and ``GC``.  A ``GC`` cell is a
    ``|``-separated list of ``class: term`` entries; surrounding blanks and
    single quotes are stripped from both parts.

    Three TSV files are written to ``path_output``:

    * ``file_output``: one row per input row with ``pmid``, ``coleccion``
      and one column per class found in that row.
    * ``file_all_gcs``: one row per distinct combination of classes
      ("phrase") with ``gc_phrase``, ``num_gc_classes`` (entries parsed in
      the last row seen with that phrase) and ``num_gc_phrases`` (rows
      sharing the phrase).
    * ``file_all_gcs_color``: the same phrases as a presence matrix, with
      ``X`` under each class of the phrase, restricted to the ten class
      columns listed in ``gc_class_columns``.

    Finally the presence matrix is rendered as a plotly table and shown with
    ``fig.show()``.

    Raises:
        IndexError: if a non-blank entry of ``GC`` contains no ``:``.
    """
    gc_class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                        'AGITATION_SPEED', 'AERATION', 'pH']
    table_columns = gc_class_columns + ['num_gc_classes', 'num_gc_phrases']

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    hash_all_gcs = {}      # phrase -> number of rows with that phrase
    hash_all_gcs_num = {}  # phrase -> number of entries parsed for it
    phrase_classes = {}    # phrase -> class names, in order of appearance
    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append no longer exists in pandas 2.x.
    rows = []
    for ind in df_input_table.index:
        hash_gcs_table = {'pmid': str(df_input_table['PMID'][ind]),
                          'coleccion': df_input_table['Colección'][ind]}
        hash_gcs = {}
        num_gcs = 0
        for gc in df_input_table['GC'][ind].split('|'):
            if not gc.strip():
                # Blank entry, e.g. produced by a trailing '|'.
                continue
            # Split on the first ':' only, so a term that itself contains a
            # colon is kept whole instead of being cut at the second colon.
            list_gc = gc.split(':', 1)
            gc_class = list_gc[0].strip().strip("'")
            gc_term = list_gc[1].strip().strip("'")
            hash_gcs_table[gc_class] = gc_term
            hash_gcs[gc_class] = gc_term
            num_gcs += 1
        str_gcs = ", ".join(hash_gcs)
        hash_all_gcs[str_gcs] = hash_all_gcs.get(str_gcs, 0) + 1
        hash_all_gcs_num[str_gcs] = num_gcs
        phrase_classes[str_gcs] = tuple(hash_gcs)
        rows.append(hash_gcs_table)

    # Classes missing from a row become empty cells in the TSV.
    df_output_table = pd.DataFrame(rows).fillna('')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    df_all_gcs_table = pd.DataFrame(
        [(phrase, hash_all_gcs_num[phrase], count) for phrase, count in hash_all_gcs.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    x_rows = []
    for phrase, count in hash_all_gcs.items():
        hash_gcs_table_x = {gc_class: 'X' for gc_class in phrase_classes[phrase]}
        hash_gcs_table_x['num_gc_classes'] = hash_all_gcs_num[phrase]
        hash_gcs_table_x['num_gc_phrases'] = count
        x_rows.append(hash_gcs_table_x)
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection raised KeyError in that case.
    df_all_gcs_table_x_out = pd.DataFrame(x_rows).reindex(columns=table_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=table_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in table_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
305 | + | ||
306 | + | ||
# Directory the input TSV is read from and directory the outputs are written
# to (here the same one).
# NOTE(review): absolute paths on one machine; adjust before running elsewhere.
path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
# Curator: PL
# These names are only used by the commented-out call below; the
# "Curator: SG" assignments further down overwrite file_in, file_out and
# file_gcs.
file_in = "PL&VT_GC-PMID.tsv"
file_out = "PL&VT_GC-PMID-GCs.tsv"
file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in = "GC-Catalog_resumido-sgama.tsv"
file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
file_gcs_color = "GC-Catalog_resumido-sgama-GCs-color.tsv"
# Runs as soon as the file is executed (there is no __main__ guard).
extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_color)
-
Please register or login to post a comment