Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
automatic-extraction-growth-conditions
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Graphs
Network
Create a new issue
Commits
Issue Boards
Authored by
cmendezc
2023-03-24 23:28:20 -0600
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
2088c9b2be214c46b7916fd21f609b6d71f247ce
2088c9b2
1 parent
ecabc025
Extracción de GCs de literatura.
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
588 additions
and
0 deletions
data-sets/bin/extract-gcs-curadores_v1.py
data-sets/bin/extract-gcs-curadores_v2.py
data-sets/bin/extract-gcs-curadores_v1.py
0 → 100644
View file @
2088c9b
import
plotly.graph_objects
as
go
import
numpy
as
np
import
os
import
pandas
as
pd
def _sg_conditions_by_class(cell, position_to_class):
    """Return ``{class_name: value}`` for one positional growth-condition cell.

    The cell holds '|'-separated values and the position of a value selects
    its class. Each value is resolved through its own position (not by
    searching the list for the value), so two positions holding the same
    text keep their own classes. Empty values, missing cells (NaN) and
    positions without an entry in ``position_to_class`` are skipped.
    """
    if pd.isna(cell):
        return {}
    conditions = {}
    for position, value in enumerate(str(cell).split('|')):
        gc_class = position_to_class.get(position)
        if value != '' and gc_class is not None:
            conditions[gc_class] = value
    return conditions


def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output,
                             file_all_gcs, file_all_gcs_x,
                             show_presence_table=False):
    """Split the positional control/test growth conditions of a curator TSV.

    Reads ``file_input`` (tab separated, with the columns 'Reference',
    'Control_Growth_Condition' and 'Test_Growth_Condition') and writes:

    * ``file_output``: two rows per input row (control, then test) with one
      column per growth-condition class found in the data.
    * ``file_all_gcs``: one row per distinct combination of classes
      ("phrase") and condition type, with how many times the phrase occurs
      ('num_gc_phrases') and how many classes it holds ('num_gc_classes').

    Args:
        path_input: Directory that contains ``file_input``.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-condition output TSV.
        file_all_gcs: Name of the phrase summary TSV.
        file_all_gcs_x: Name of the class-presence ('X') TSV; only written
            when ``show_presence_table`` is true.
        show_presence_table: When true, also write ``file_all_gcs_x`` and
            display it as a plotly table. Defaults to False because this
            step used to sit after an interpreter-terminating ``quit()``
            and therefore never ran.
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))
    print(df_input_table.head())

    # Position inside the '|'-separated cell -> class name. Positions 4, 6
    # and 7 have no class assigned and are ignored.
    hash_gc_classes = {
        0: 'ORGANISM',
        1: 'GENETIC_BACKGROUND',
        2: 'MEDIUM',
        3: 'MEDIUM_SUPPLEMENTS',
        5: 'TEMPERATURE',
        8: 'OPTICAL_DENSITY',
        9: 'GROWTH_PHASE',
        10: 'AGITATION_SPEED',
        11: 'AERATION',
        12: 'pH',
    }

    gc_types = ('control', 'test')
    phrase_counts = {gc_type: {} for gc_type in gc_types}
    phrase_sizes = {gc_type: {} for gc_type in gc_types}
    output_rows = []
    input_cells = zip(df_input_table['Reference'],
                      df_input_table['Control_Growth_Condition'],
                      df_input_table['Test_Growth_Condition'])
    for idx, (pmid, control_cell, test_cell) in enumerate(input_cells):
        for gc_type, cell in zip(gc_types, (control_cell, test_cell)):
            conditions = _sg_conditions_by_class(cell, hash_gc_classes)
            row = {'pmid': str(pmid), 'idx': idx, 'gc_type': gc_type}
            row.update(conditions)
            output_rows.append(row)
            # The "phrase" is the ordered list of classes present in the row.
            phrase = ", ".join(conditions)
            counts = phrase_counts[gc_type]
            counts[phrase] = counts.get(phrase, 0) + 1
            phrase_sizes[gc_type][phrase] = len(conditions)

    # Build the frame once from the collected rows; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output),
                           sep='\t', index=False, header=True)

    print(phrase_counts['control'])
    print(phrase_counts['test'])
    summary_rows = [
        [gc_type, phrase, count, phrase_sizes[gc_type][phrase]]
        for gc_type in gc_types
        for phrase, count in phrase_counts[gc_type].items()
    ]
    df_all_gcs_table = pd.DataFrame(
        summary_rows,
        columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
    print(df_all_gcs_table)
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs),
                            sep='\t', index=False, header=True)

    if not show_presence_table:
        # Return to the caller instead of terminating the interpreter.
        return

    presence_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY',
                        'GROWTH_PHASE', 'AGITATION_SPEED', 'AERATION', 'pH',
                        'num_gc_classes', 'num_gc_phrases']
    presence_rows = []
    for _, phrase, num_phrases, num_classes in summary_rows:
        presence = {gc_class.strip(): 'X' for gc_class in phrase.split(',')}
        presence['num_gc_classes'] = num_classes
        presence['num_gc_phrases'] = num_phrases
        presence_rows.append(presence)
    # reindex keeps the fixed column layout even when a class never occurs.
    df_all_gcs_table_x_out = pd.DataFrame(presence_rows).reindex(
        columns=presence_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x),
                                  sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(values=presence_columns,
                    line_color='white', fill_color='white',
                    align='center', font=dict(color='black', size=12)),
        cells=dict(values=[df_all_gcs_table_x_out[column]
                           for column in presence_columns],
                   align='center', font=dict(color='black', size=11)))
    ])
    fig.show()
def _pl_conditions_by_class(cell):
    """Return ``{class_name: term}`` for one labelled growth-condition cell.

    The cell holds '|'-separated ``CLASS: term`` fragments, optionally
    wrapped in single quotes. Only the first ':' separates class from term,
    so a term may itself contain ':'. Blank fragments and missing cells
    (NaN) are skipped. When a class is repeated the last term wins.

    Raises:
        ValueError: If a non-blank fragment has no ':' separator.
    """
    if pd.isna(cell):
        return {}
    conditions = {}
    for fragment in str(cell).split('|'):
        if not fragment.strip():
            continue
        gc_class, separator, gc_term = fragment.partition(':')
        if not separator:
            raise ValueError(
                "Malformed growth condition (expected CLASS:term): {!r}".format(fragment))
        conditions[gc_class.strip().strip("'")] = gc_term.strip().strip("'")
    return conditions


def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output,
                             file_all_gcs, file_all_gcs_x):
    """Split the labelled growth conditions of a curator TSV into columns.

    Reads ``file_input`` (tab separated, with the columns 'PMID',
    'Colección' and 'GC') and writes three TSV files, then displays the
    last one as a plotly table:

    * ``file_output``: one row per input row with one column per class.
    * ``file_all_gcs``: one row per distinct combination of classes
      ("phrase") with the number of classes in it ('num_gc_classes') and
      the number of rows that use it ('num_gc_phrases').
    * ``file_all_gcs_x``: the same summary with an 'X' under every class
      that is part of the phrase.

    Args:
        path_input: Directory that contains ``file_input``.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-row output TSV.
        file_all_gcs: Name of the phrase summary TSV.
        file_all_gcs_x: Name of the class-presence ('X') TSV.
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    phrase_counts = {}
    phrase_sizes = {}
    output_rows = []
    input_cells = zip(df_input_table['PMID'],
                      df_input_table['Colección'],
                      df_input_table['GC'])
    for pmid, coleccion, cell in input_cells:
        conditions = _pl_conditions_by_class(cell)
        row = {'pmid': str(pmid), 'coleccion': coleccion}
        row.update(conditions)
        output_rows.append(row)
        # The "phrase" is the ordered list of classes present in the row.
        phrase = ", ".join(conditions)
        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
        # Count distinct classes so the number always matches the phrase.
        phrase_sizes[phrase] = len(conditions)

    # Build the frame once from the collected rows; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output),
                           sep='\t', index=False, header=True)

    summary_rows = [[phrase, phrase_sizes[phrase], count]
                    for phrase, count in phrase_counts.items()]
    df_all_gcs_table = pd.DataFrame(
        summary_rows,
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs),
                            sep='\t', index=False, header=True)

    presence_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY',
                        'GROWTH_PHASE', 'AGITATION_SPEED', 'AERATION', 'pH',
                        'num_gc_classes', 'num_gc_phrases']
    presence_rows = []
    for phrase, num_classes, num_phrases in summary_rows:
        presence = {gc_class.strip(): 'X' for gc_class in phrase.split(',')}
        presence['num_gc_classes'] = num_classes
        presence['num_gc_phrases'] = num_phrases
        presence_rows.append(presence)
    # reindex keeps the fixed column layout even when a class never occurs.
    df_all_gcs_table_x_out = pd.DataFrame(presence_rows).reindex(
        columns=presence_columns)
    df_all_gcs_table_x_out.to_csv(os.path.join(path_output, file_all_gcs_x),
                                  sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(values=presence_columns,
                    line_color='white', fill_color='white',
                    align='center', font=dict(color='black', size=12)),
        cells=dict(values=[df_all_gcs_table_x_out[column]
                           for column in presence_columns],
                   align='center', font=dict(color='black', size=11)))
    ])
    fig.show()
# Input and output files live in the same data-curation directory.
path_in = path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (run currently disabled; the SG names below replace these).
file_in, file_out = "PL&VT_GC-PMID.tsv", "PL&VT_GC-PMID-GCs.tsv"
file_gcs, file_gcs_x = "PL&VT_GC-PMID-GCs-All.tsv", "PL&VT_GC-PMID-GCs-All-X.tsv"
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in, file_out = "GC-Catalog_resumido-sgama.tsv", "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs, file_gcs_x = "GC-Catalog_resumido-sgama-GCs-All.tsv", "GC-Catalog_resumido-sgama-GCs-All-X.tsv"
extract_gcs_curadores_SG(
    path_in, file_in,
    path_out, file_out,
    file_gcs, file_gcs_x,
)
data-sets/bin/extract-gcs-curadores_v2.py
0 → 100644
View file @
2088c9b
from
plotly.colors
import
n_colors
import
plotly.graph_objects
as
go
import
numpy
as
np
import
os
import
pandas
as
pd
def _sg_conditions_by_class(cell, position_to_class):
    """Return ``{class_name: value}`` for one positional growth-condition cell.

    The cell holds '|'-separated values and the position of a value selects
    its class. Each value is resolved through its own position (not by
    searching the list for the value), so two positions holding the same
    text keep their own classes. Empty values, missing cells (NaN) and
    positions without an entry in ``position_to_class`` are skipped.
    """
    if pd.isna(cell):
        return {}
    conditions = {}
    for position, value in enumerate(str(cell).split('|')):
        gc_class = position_to_class.get(position)
        if value != '' and gc_class is not None:
            conditions[gc_class] = value
    return conditions


def _sg_color_rows(records, columns):
    """Return the rows with a ``<column>_COLOR`` fill color for every cell.

    Empty cells get the lightest shade. Control rows use one base shade.
    Test rows use a second base shade, except for cells whose value differs
    from the preceding row (the control of the same pair), which are
    highlighted. 'IDX' and 'GC_TYPE' always take the base shade of the row.
    """
    empty_color = 'rgb(239, 243, 255)'
    control_color = 'rgb(189, 215, 231)'
    test_color = 'rgb(189, 215, 245)'
    changed_color = 'rgb(107, 174, 214)'
    colored_rows = []
    previous = None
    for record in records:
        is_control = record['GC_TYPE'] == 'Control'
        base_color = control_color if is_control else test_color
        colored = {}
        for column in columns:
            value = record[column]
            if value == '':
                color = empty_color
            elif is_control or value == previous[column]:
                color = base_color
            else:
                color = changed_color
            colored[column + '_COLOR'] = color
            colored[column] = value
        colored['IDX_COLOR'] = base_color
        colored['GC_TYPE_COLOR'] = base_color
        colored_rows.append(colored)
        previous = record
    return colored_rows


def extract_gcs_curadores_SG(path_input, file_input, path_output, file_output,
                             file_all_gcs, file_all_gcs_color,
                             write_summary=False):
    """Split control/test growth conditions and render them as a colored table.

    Reads ``file_input`` (tab separated, with the columns 'Reference',
    'Control_Growth_Condition' and 'Test_Growth_Condition') and writes:

    * ``file_output``: two rows per input row (Control, then Test) with the
      columns IDX, GC_TYPE and seven growth-condition classes.
    * ``file_all_gcs_color``: the same table plus one ``<class>_COLOR``
      column per class holding the fill color of the cell.
    * a PNG rendering of the colored table, named like
      ``file_all_gcs_color`` with '.tsv' replaced by '.png'.

    Args:
        path_input: Directory that contains ``file_input``.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-condition output TSV.
        file_all_gcs: Name of the phrase summary TSV; only written when
            ``write_summary`` is true.
        file_all_gcs_color: Name of the colored table TSV.
        write_summary: When true, also write ``file_all_gcs`` (one row per
            distinct class combination and condition type). Defaults to
            False because this step used to sit after an
            interpreter-terminating ``quit()`` and therefore never ran.
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    # Position inside the '|'-separated cell -> class name. Positions 4, 6
    # and 7 have no class assigned and are ignored.
    hash_gc_classes = {
        0: 'ORGANISM',
        1: 'GENETIC_BACKGROUND',
        2: 'MEDIUM',
        3: 'MEDIUM_SUPPLEMENTS',
        5: 'TEMPERATURE',
        8: 'OPTICAL_DENSITY',
        9: 'GROWTH_PHASE',
        10: 'AGITATION_SPEED',
        11: 'AERATION',
        12: 'pH',
    }

    gc_types = ('Control', 'Test')
    phrase_counts = {gc_type: {} for gc_type in gc_types}
    phrase_sizes = {gc_type: {} for gc_type in gc_types}
    output_rows = []
    input_cells = zip(df_input_table['Reference'],
                      df_input_table['Control_Growth_Condition'],
                      df_input_table['Test_Growth_Condition'])
    for idx, (pmid, control_cell, test_cell) in enumerate(input_cells):
        for gc_type, cell in zip(gc_types, (control_cell, test_cell)):
            conditions = _sg_conditions_by_class(cell, hash_gc_classes)
            row = {'pmid': str(pmid), 'IDX': idx, 'GC_TYPE': gc_type}
            row.update(conditions)
            output_rows.append(row)
            # The "phrase" is the ordered list of classes present in the row.
            phrase = ", ".join(conditions)
            counts = phrase_counts[gc_type]
            counts[phrase] = counts.get(phrase, 0) + 1
            phrase_sizes[gc_type][phrase] = len(conditions)

    # Build the frame once from the collected rows; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')

    # 'AGITATION_SPEED', 'AERATION' and 'pH' are left out of the table.
    columns = ['IDX', 'GC_TYPE', 'ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
               'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY',
               'GROWTH_PHASE']
    # reindex keeps the fixed layout even when a class never occurs in the
    # data; such columns are created empty.
    df_output_table_csv = df_output_table.reindex(columns=columns, fill_value='')
    df_output_table_csv.to_csv(os.path.join(path_output, file_output),
                               sep='\t', index=False, header=True)

    ### Color table
    print(columns)
    color_columns = [column + '_COLOR' for column in columns]
    df_output_table_color = pd.DataFrame(
        _sg_color_rows(df_output_table_csv.to_dict('records'), columns)
    ).reindex(columns=columns + color_columns)
    # The TSV carries the colors of the class columns only (not IDX/GC_TYPE).
    df_output_table_color[columns + color_columns[2:]].to_csv(
        os.path.join(path_output, file_all_gcs_color),
        sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    # Color picker: https://redketchup.io/color-picker
    fig = go.Figure(data=[go.Table(
        header=dict(values=['<b>{}</b>'.format(column) for column in columns],
                    line_color='black', fill_color='white',
                    align='center', font=dict(color='black', size=12)),
        cells=dict(values=[df_output_table_color[column] for column in columns],
                   fill_color=[df_output_table_color[column]
                               for column in color_columns],
                   align='center', font=dict(color='black', size=11)))
    ])
    fig.write_image(
        os.path.join(path_output, file_all_gcs_color.replace('.tsv', '.png')),
        height=2500, width=1800)

    if not write_summary:
        # Return to the caller instead of terminating the interpreter.
        return

    print(phrase_counts['Control'])
    print(phrase_counts['Test'])
    summary_rows = [
        [gc_type, phrase, count, phrase_sizes[gc_type][phrase]]
        for gc_type in gc_types
        for phrase, count in phrase_counts[gc_type].items()
    ]
    df_all_gcs_table = pd.DataFrame(
        summary_rows,
        columns=['class_type', 'gc_phrase', 'num_gc_phrases', 'num_gc_classes'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs),
                            sep='\t', index=False, header=True)
def _pl_conditions_by_class(cell):
    """Return ``{class_name: term}`` for one labelled growth-condition cell.

    The cell holds '|'-separated ``CLASS: term`` fragments, optionally
    wrapped in single quotes. Only the first ':' separates class from term,
    so a term may itself contain ':'. Blank fragments and missing cells
    (NaN) are skipped. When a class is repeated the last term wins.

    Raises:
        ValueError: If a non-blank fragment has no ':' separator.
    """
    if pd.isna(cell):
        return {}
    conditions = {}
    for fragment in str(cell).split('|'):
        if not fragment.strip():
            continue
        gc_class, separator, gc_term = fragment.partition(':')
        if not separator:
            raise ValueError(
                "Malformed growth condition (expected CLASS:term): {!r}".format(fragment))
        conditions[gc_class.strip().strip("'")] = gc_term.strip().strip("'")
    return conditions


def extract_gcs_curadores_PL(path_input, file_input, path_output, file_output,
                             file_all_gcs, file_all_gcs_color):
    """Split the labelled growth conditions of a curator TSV into columns.

    Reads ``file_input`` (tab separated, with the columns 'PMID',
    'Colección' and 'GC') and writes three TSV files, then displays the
    last one as a plotly table:

    * ``file_output``: one row per input row with one column per class.
    * ``file_all_gcs``: one row per distinct combination of classes
      ("phrase") with the number of classes in it ('num_gc_classes') and
      the number of rows that use it ('num_gc_phrases').
    * ``file_all_gcs_color``: the same summary with an 'X' under every
      class that is part of the phrase.

    Args:
        path_input: Directory that contains ``file_input``.
        file_input: Name of the input TSV file.
        path_output: Directory where the output files are written.
        file_output: Name of the per-row output TSV.
        file_all_gcs: Name of the phrase summary TSV.
        file_all_gcs_color: Name of the class-presence ('X') TSV.
    """
    df_input_table = pd.read_csv(os.path.join(path_input, file_input), sep="\t")
    print("Shape df_input_table: {}".format(df_input_table.shape))

    phrase_counts = {}
    phrase_sizes = {}
    output_rows = []
    input_cells = zip(df_input_table['PMID'],
                      df_input_table['Colección'],
                      df_input_table['GC'])
    for pmid, coleccion, cell in input_cells:
        conditions = _pl_conditions_by_class(cell)
        row = {'pmid': str(pmid), 'coleccion': coleccion}
        row.update(conditions)
        output_rows.append(row)
        # The "phrase" is the ordered list of classes present in the row.
        phrase = ", ".join(conditions)
        phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1
        # Count distinct classes so the number always matches the phrase.
        phrase_sizes[phrase] = len(conditions)

    # Build the frame once from the collected rows; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when called per row.
    df_output_table = pd.DataFrame(output_rows).replace(np.nan, '')
    df_output_table.to_csv(os.path.join(path_output, file_output),
                           sep='\t', index=False, header=True)

    summary_rows = [[phrase, phrase_sizes[phrase], count]
                    for phrase, count in phrase_counts.items()]
    df_all_gcs_table = pd.DataFrame(
        summary_rows,
        columns=['gc_phrase', 'num_gc_classes', 'num_gc_phrases'])
    df_all_gcs_table.to_csv(os.path.join(path_output, file_all_gcs),
                            sep='\t', index=False, header=True)

    presence_columns = ['ORGANISM', 'GENETIC_BACKGROUND', 'MEDIUM',
                        'MEDIUM_SUPPLEMENTS', 'TEMPERATURE', 'OPTICAL_DENSITY',
                        'GROWTH_PHASE', 'AGITATION_SPEED', 'AERATION', 'pH',
                        'num_gc_classes', 'num_gc_phrases']
    presence_rows = []
    for phrase, num_classes, num_phrases in summary_rows:
        presence = {gc_class.strip(): 'X' for gc_class in phrase.split(',')}
        presence['num_gc_classes'] = num_classes
        presence['num_gc_phrases'] = num_phrases
        presence_rows.append(presence)
    # reindex keeps the fixed column layout even when a class never occurs.
    df_all_gcs_table_x_out = pd.DataFrame(presence_rows).reindex(
        columns=presence_columns)
    df_all_gcs_table_x_out.to_csv(
        os.path.join(path_output, file_all_gcs_color),
        sep='\t', index=False, header=True)

    # https://plotly.com/python/table/
    fig = go.Figure(data=[go.Table(
        header=dict(values=presence_columns,
                    line_color='white', fill_color='white',
                    align='center', font=dict(color='black', size=12)),
        cells=dict(values=[df_all_gcs_table_x_out[column]
                           for column in presence_columns],
                   align='center', font=dict(color='black', size=11)))
    ])
    fig.show()
# Input and output files live in the same data-curation directory.
path_in = path_out = "/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/data_sets_curadores"

# Curator: PL (run currently disabled; the SG names below replace these,
# except file_gcs_x, which the SG section does not reassign).
file_in, file_out = "PL&VT_GC-PMID.tsv", "PL&VT_GC-PMID-GCs.tsv"
file_gcs, file_gcs_x = "PL&VT_GC-PMID-GCs-All.tsv", "PL&VT_GC-PMID-GCs-All-X.tsv"
### extract_gcs_curadores_PL(path_in, file_in, path_out, file_out, file_gcs, file_gcs_x)

# Curator: SG
file_in, file_out = "GC-Catalog_resumido-sgama.tsv", "GC-Catalog_resumido-sgama-GCs.tsv"
file_gcs, file_gcs_color = "GC-Catalog_resumido-sgama-GCs-All.tsv", "GC-Catalog_resumido-sgama-GCs-color.tsv"
extract_gcs_curadores_SG(
    path_in, file_in,
    path_out, file_out,
    file_gcs, file_gcs_color,
)
Please
register
or
login
to post a comment