cmendezc

Extracción de GCs de literatura.

1 +import plotly.graph_objects as go
2 +import numpy as np
3 +import os
4 +import pandas as pd
5 +
def _sg_parse_conditions(raw_conditions, position_to_class):
    """Turn one positional, '|'-separated condition string into ``{class: term}``.

    The class of a term is decided by the position of its field, looked up in
    *position_to_class*.  Empty fields are skipped, and so are fields whose
    position has no entry in the mapping.  A missing cell (NaN/None) yields an
    empty dict.
    """
    if pd.isna(raw_conditions):
        return {}
    parsed = {}
    # enumerate() gives the real field position; list.index(term) would return
    # the first field holding the same text and mislabel repeated terms.
    for position, term in enumerate(str(raw_conditions).split('|')):
        gc_class = position_to_class.get(position)
        if term == '' or gc_class is None:
            continue
        parsed[gc_class] = term
    return parsed


def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Extract curated growth conditions (GCs) from the SG catalog.

    Reads a tab-separated table with the columns ``Reference``,
    ``Control_Growth_Condition`` and ``Test_Growth_Condition`` and writes two
    tab-separated files into *path_output*:

    * *file_output*: two rows per input row (``gc_type`` 'control' and
      'test', sharing the same ``idx``), one column per GC class found.
    * *file_all_gcs*: for every combination of GC classes ("phrase") and
      ``class_type``, how many rows use it (``num_gc_phrases``) and how many
      classes it contains (``num_gc_classes``).

    Args:
        path_input: Directory that holds *file_input*.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-condition output TSV.
        file_all_gcs: Name of the phrase-summary output TSV.
        file_all_gcs_x: Accepted for interface compatibility only; nothing is
            written to it.  The code that used it sat behind an unconditional
            ``quit()`` and never ran.

    Returns:
        None.  The function used to end with ``quit()``, which terminated the
        whole interpreter; it now simply returns after writing both files.
    """
    position_to_class = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                         3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY',
                         9: 'GROWTH_PHASE', 10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    gc_types = ('control', 'test')

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    print(df_input_table.head())

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    rows = []
    phrase_counts = {gc_type: {} for gc_type in gc_types}
    phrase_sizes = {gc_type: {} for gc_type in gc_types}
    records = zip(df_input_table['Reference'],
                  df_input_table['Control_Growth_Condition'],
                  df_input_table['Test_Growth_Condition'])
    for idx, (pmid, raw_control, raw_test) in enumerate(records):
        for gc_type, raw_conditions in zip(gc_types, (raw_control, raw_test)):
            conditions = _sg_parse_conditions(raw_conditions, position_to_class)
            row = {'pmid': str(pmid), 'idx': idx, 'gc_type': gc_type}
            row.update(conditions)
            rows.append(row)
            # The "phrase" is the ordered list of classes present in this row.
            phrase = ", ".join(conditions)
            phrase_counts[gc_type][phrase] = phrase_counts[gc_type].get(phrase, 0) + 1
            phrase_sizes[gc_type][phrase] = len(conditions)

    df_output_table = pd.DataFrame(rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    print(phrase_counts['control'])
    print(phrase_counts['test'])
    summary_rows = [
        {'class_type': gc_type, 'gc_phrase': phrase,
         'num_gc_phrases': count, 'num_gc_classes': phrase_sizes[gc_type][phrase]}
        for gc_type in gc_types
        for phrase, count in phrase_counts[gc_type].items()
    ]
    df_all_gcs_table = pd.DataFrame(
        summary_rows, columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
    print(df_all_gcs_table)
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)
152 +
def _pl_parse_conditions(raw_conditions):
    """Parse one ``'CLASS': 'term' | 'CLASS': 'term'`` cell into ``{class: term}``.

    Surrounding whitespace and single quotes are stripped from both parts.
    Empty fragments (for example after a trailing '|') are skipped and a
    missing cell (NaN/None) yields an empty dict.

    Raises:
        IndexError: If a non-empty fragment contains no ':' separator.
    """
    if pd.isna(raw_conditions):
        return {}
    parsed = {}
    for fragment in str(raw_conditions).split('|'):
        if not fragment.strip():
            continue
        # Split on the first ':' only, so a term that itself contains ':' is kept whole.
        pieces = fragment.split(':', 1)
        gc_class = pieces[0].strip().strip("'")
        gc_term = pieces[1].strip().strip("'")
        parsed[gc_class] = gc_term
    return parsed


def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_x):
    """Extract curated growth conditions (GCs) from the PL table.

    Reads a tab-separated table with the columns ``PMID``, ``Colección`` and
    ``GC``, writes three tab-separated files into *path_output* and finally
    shows the class-presence table as a plotly figure:

    * *file_output*: one row per input row, one column per GC class found.
    * *file_all_gcs*: one row per combination of GC classes ("phrase") with
      the number of distinct classes in it (``num_gc_classes``) and the
      number of input rows using it (``num_gc_phrases``).
    * *file_all_gcs_x*: the same summary with one column per known GC class,
      marked 'X' when the class is part of the phrase.

    Args:
        path_input: Directory that holds *file_input*.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-row output TSV.
        file_all_gcs: Name of the phrase-summary output TSV.
        file_all_gcs_x: Name of the class-presence output TSV.

    Raises:
        IndexError: If a non-empty GC fragment has no ':' separator.
    """
    class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM', 'MEDIUM_SUPPLEMENTS',
                     'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                     'AGITATION_SPEED', 'AERATION', 'pH']
    table_columns = class_columns + ['num_gc_classes', 'num_gc_phrases']

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    rows = []
    phrase_counts = {}
    phrase_classes = {}
    records = zip(df_input_table['PMID'], df_input_table['Colección'], df_input_table['GC'])
    for pmid, coleccion, raw_conditions in records:
        conditions = _pl_parse_conditions(raw_conditions)
        row = {'pmid': str(pmid), 'coleccion': coleccion}
        row.update(conditions)
        rows.append(row)
        # The "phrase" is the ordered list of classes present in this row.
        phrase = ", ".join(conditions)
        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
        phrase_classes[phrase] = tuple(conditions)

    df_output_table = pd.DataFrame(rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    # num_gc_classes is the number of distinct classes in the phrase, so it no
    # longer depends on which row with that phrase happened to come last.
    df_all_gcs_table = pd.DataFrame(
        [(phrase, len(phrase_classes[phrase]), count) for phrase, count in phrase_counts.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    presence_rows = []
    for phrase, count in phrase_counts.items():
        presence = {gc_class: 'X' for gc_class in phrase_classes[phrase]}
        presence['num_gc_classes'] = len(phrase_classes[phrase])
        presence['num_gc_phrases'] = count
        presence_rows.append(presence)
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection would raise KeyError in that case.
    df_all_gcs_table_x_out = pd.DataFrame(presence_rows).reindex(columns=table_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=table_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in table_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
251 +
252 +
# Input and output files live in the same curated data-set directory.
path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (run currently disabled; the file names are kept so it can be re-enabled)
file_in = "PL&VT_GC-PMID.tsv"
file_out = "PL&VT_GC-PMID-GCs.tsv"
file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
# extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in = "GC-Catalog_resumido-sgama.tsv"
file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
file_gcs_x = "GC-Catalog_resumido-sgama-GCs-All-X.tsv"

if __name__ == "__main__":
    # Run the extraction only when executed as a script, so importing this
    # module does not read or write any file.
    extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)
1 +from plotly.colors import n_colors
2 +import plotly.graph_objects as go
3 +import numpy as np
4 +import os
5 +import pandas as pd
6 +
def _sg_parse_conditions(raw_conditions, position_to_class):
    """Turn one positional, '|'-separated condition string into ``{class: term}``.

    The class of a term is decided by the position of its field, looked up in
    *position_to_class*.  Empty fields are skipped, and so are fields whose
    position has no entry in the mapping.  A missing cell (NaN/None) yields an
    empty dict.
    """
    if pd.isna(raw_conditions):
        return {}
    parsed = {}
    # enumerate() gives the real field position; list.index(term) would return
    # the first field holding the same text and mislabel repeated terms.
    for position, term in enumerate(str(raw_conditions).split('|')):
        gc_class = position_to_class.get(position)
        if term == '' or gc_class is None:
            continue
        parsed[gc_class] = term
    return parsed


def _sg_build_color_table(df_values):
    """Return *df_values* plus one ``<column>_COLOR`` fill colour per cell.

    Empty cells get the lightest colour.  Filled cells of a 'Control' row get
    the control colour.  Filled cells of a 'Test' row get the highlight colour
    when they differ from the row directly above, otherwise the plain test
    colour.  ``IDX`` and ``GC_TYPE`` always use the plain colour of their row
    type.  Rows whose ``GC_TYPE`` is neither 'Control' nor 'Test' are left out.
    """
    empty_color = 'rgb(239, 243, 255)'
    control_color = 'rgb(189, 215, 231)'
    test_color = 'rgb(189, 215, 245)'
    test_changed_color = 'rgb(107, 174, 214)'

    columns = list(df_values.columns)
    records = df_values.to_dict('records')
    colored_rows = []
    for position, record in enumerate(records):
        if record['GC_TYPE'] == 'Control':
            is_test, plain_color = False, control_color
        elif record['GC_TYPE'] == 'Test':
            is_test, plain_color = True, test_color
        else:
            continue
        colored = {}
        for column in columns:
            value = record[column]
            if value == '':
                color = empty_color
            elif is_test and value != records[position - 1][column]:
                color = test_changed_color
            else:
                color = plain_color
            colored[column] = value
            colored[column + '_COLOR'] = color
        colored['IDX_COLOR'] = plain_color
        colored['GC_TYPE_COLOR'] = plain_color
        colored_rows.append(colored)
    all_columns = columns + [column + '_COLOR' for column in columns]
    return pd.DataFrame(colored_rows, columns=all_columns)


def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Extract curated growth conditions (GCs) from the SG catalog and colour them.

    Reads a tab-separated table with the columns ``Reference``,
    ``Control_Growth_Condition`` and ``Test_Growth_Condition`` and writes into
    *path_output*:

    * *file_output*: two rows per input row (``GC_TYPE`` 'Control' and
      'Test', sharing the same ``IDX``) with a fixed set of GC class columns.
    * *file_all_gcs_color*: the same table plus one ``<class>_COLOR`` column
      per GC class.
    * A PNG rendering of the coloured table, named like *file_all_gcs_color*
      with '.tsv' replaced by '.png'.

    Args:
        path_input: Directory that holds *file_input*.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-condition output TSV.
        file_all_gcs: Accepted for interface compatibility only; nothing is
            written to it.  The summary code that used it sat behind an
            unconditional ``quit()`` and never ran.
            NOTE(review): confirm whether the phrase summary should be restored.
        file_all_gcs_color: Name of the coloured-table output TSV.

    Returns:
        None.  The function used to end with ``quit()``, which terminated the
        whole interpreter; it now simply returns after writing the image.
    """
    position_to_class = {0: 'ORGANISM', 1: 'GENETIC_BACKGROUND', 2: 'MEDIUM',
                         3: 'MEDIUM_SUPPLEMENTS', 5: 'TEMPERATURE', 8: 'OPTICAL_DENSITY',
                         9: 'GROWTH_PHASE', 10: 'AGITATION_SPEED', 11: 'AERATION', 12: 'pH'}
    class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM', 'MEDIUM_SUPPLEMENTS',
                     'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE']
    table_columns = ['IDX', 'GC_TYPE'] + class_columns

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    rows = []
    records = zip(df_input_table['Reference'],
                  df_input_table['Control_Growth_Condition'],
                  df_input_table['Test_Growth_Condition'])
    for idx, (pmid, raw_control, raw_test) in enumerate(records):
        for gc_type, raw_conditions in (('Control', raw_control), ('Test', raw_test)):
            row = {'pmid': str(pmid), 'IDX': idx, 'GC_TYPE': gc_type}
            row.update(_sg_parse_conditions(raw_conditions, position_to_class))
            rows.append(row)

    df_output_table = pd.DataFrame(rows).replace(np.nan, '')
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection would raise KeyError in that case.
    df_output_table_csv = df_output_table.reindex(columns=table_columns, fill_value='')
    df_output_table_csv.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    # Colour table
    print(table_columns)
    df_output_table_color = _sg_build_color_table(df_output_table_csv)
    color_file_columns = table_columns + [column + '_COLOR' for column in class_columns]
    df_output_table_color[color_file_columns].to_csv(
        os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    # Colour picker used to choose the palette: https://redketchup.io/color-picker
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=['<b>{}</b>'.format(column) for column in table_columns],
            line_color='black', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_output_table_color[column] for column in table_columns],
            fill_color=[df_output_table_color[column + '_COLOR'] for column in table_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.write_image(os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')),
                    height=2500, width=1800)
206 +
def _pl_parse_conditions(raw_conditions):
    """Parse one ``'CLASS': 'term' | 'CLASS': 'term'`` cell into ``{class: term}``.

    Surrounding whitespace and single quotes are stripped from both parts.
    Empty fragments (for example after a trailing '|') are skipped and a
    missing cell (NaN/None) yields an empty dict.

    Raises:
        IndexError: If a non-empty fragment contains no ':' separator.
    """
    if pd.isna(raw_conditions):
        return {}
    parsed = {}
    for fragment in str(raw_conditions).split('|'):
        if not fragment.strip():
            continue
        # Split on the first ':' only, so a term that itself contains ':' is kept whole.
        pieces = fragment.split(':', 1)
        gc_class = pieces[0].strip().strip("'")
        gc_term = pieces[1].strip().strip("'")
        parsed[gc_class] = gc_term
    return parsed


def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output, file_all_gcs, file_all_gcs_color):
    """Extract curated growth conditions (GCs) from the PL table.

    Reads a tab-separated table with the columns ``PMID``, ``Colección`` and
    ``GC``, writes three tab-separated files into *path_output* and finally
    shows the class-presence table as a plotly figure:

    * *file_output*: one row per input row, one column per GC class found.
    * *file_all_gcs*: one row per combination of GC classes ("phrase") with
      the number of distinct classes in it (``num_gc_classes``) and the
      number of input rows using it (``num_gc_phrases``).
    * *file_all_gcs_color*: the same summary with one column per known GC
      class, marked 'X' when the class is part of the phrase.

    Args:
        path_input: Directory that holds *file_input*.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-row output TSV.
        file_all_gcs: Name of the phrase-summary output TSV.
        file_all_gcs_color: Name of the class-presence output TSV.

    Raises:
        IndexError: If a non-empty GC fragment has no ':' separator.
    """
    class_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM', 'MEDIUM_SUPPLEMENTS',
                     'TEMPERATURE', 'OPTICAL_DENSITY', 'GROWTH_PHASE',
                     'AGITATION_SPEED', 'AERATION', 'pH']
    table_columns = class_columns + ['num_gc_classes', 'num_gc_phrases']

    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    rows = []
    phrase_counts = {}
    phrase_classes = {}
    records = zip(df_input_table['PMID'], df_input_table['Colección'], df_input_table['GC'])
    for pmid, coleccion, raw_conditions in records:
        conditions = _pl_parse_conditions(raw_conditions)
        row = {'pmid': str(pmid), 'coleccion': coleccion}
        row.update(conditions)
        rows.append(row)
        # The "phrase" is the ordered list of classes present in this row.
        phrase = ", ".join(conditions)
        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
        phrase_classes[phrase] = tuple(conditions)

    df_output_table = pd.DataFrame(rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output), sep='\t', index=False, header=True)

    # num_gc_classes is the number of distinct classes in the phrase, so it no
    # longer depends on which row with that phrase happened to come last.
    df_all_gcs_table = pd.DataFrame(
        [(phrase, len(phrase_classes[phrase]), count) for phrase, count in phrase_counts.items()],
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs), sep='\t', index=False, header=True)

    presence_rows = []
    for phrase, count in phrase_counts.items():
        presence = {gc_class: 'X' for gc_class in phrase_classes[phrase]}
        presence['num_gc_classes'] = len(phrase_classes[phrase])
        presence['num_gc_phrases'] = count
        presence_rows.append(presence)
    # reindex() keeps the fixed column layout even when a class never occurs
    # in the data; plain column selection would raise KeyError in that case.
    df_all_gcs_table_x_out = pd.DataFrame(presence_rows).reindex(columns=table_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_color), sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(
            values=table_columns,
            line_color='white', fill_color='white',
            align='center', font=dict(color='black', size=12)
        ),
        cells=dict(
            values=[df_all_gcs_table_x_out[column] for column in table_columns],
            align='center', font=dict(color='black', size=11)
        ))
    ])
    fig.show()
305 +
306 +
# Input and output files live in the same curated data-set directory.
path_in = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"
path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (run currently disabled; the file names are kept so it can be re-enabled)
file_in = "PL&VT_GC-PMID.tsv"
file_out = "PL&VT_GC-PMID-GCs.tsv"
file_gcs = "PL&VT_GC-PMID-GCs-All.tsv"
file_gcs_x = "PL&VT_GC-PMID-GCs-All-X.tsv"
# extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in = "GC-Catalog_resumido-sgama.tsv"
file_out = "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs = "GC-Catalog_resumido-sgama-GCs-All.tsv"
file_gcs_color = "GC-Catalog_resumido-sgama-GCs-color.tsv"

if __name__ == "__main__":
    # Run the extraction only when executed as a script, so importing this
    # module does not read or write any file.
    extract_gcs_curadores_SG(path_in, file_in, path_out, file_out, file_gcs, file_gcs_color)