Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
deep-learning-workshop
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2019-05-08 13:24:56 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
e1a9f20f73425620165f075bed29a948b0bda948
e1a9f20f
1 parent
8f0fa248
Deep Learning Workshop
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
21 additions
and
12 deletions
data-sets/get-hga-training-test-py27.py
data-sets/get-hga-training-test-py27.py
View file @
e1a9f20
...
...
@@ -62,18 +62,20 @@ if __name__ == "__main__":
# each row corresponds to one possible value of each feature.
one_hot_encoder
=
OneHotEncoder
(
categories
=
'auto'
)
input_features
=
[]
sequences
=
[]
# Read file with sequences
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputFile
),
mode
=
"r"
)
as
tabfile
:
df
=
pd
.
read_csv
(
tabfile
,
delimiter
=
'
\t
'
)
print
(
"df: {}"
.
format
(
df
))
sequences
=
df
[
'sequence'
]
labels
=
df
[
'label'
]
print
(
"All rows in df: {}"
.
format
(
len
(
df
.
index
)))
df_filtered
=
df
.
loc
[
df
[
'label'
]
in
[
"exon"
,
"utr"
]]
print
(
"Only exon and utr rows in df: {}"
.
format
(
len
(
df_filtered
.
index
)))
# print("df: {}".format(df))
sequences
=
df_filtered
[
'sequence'
]
labels
=
df_filtered
[
'label'
]
max_exon_length
=
0
max_utr_length
=
0
#
One-hot-encoding
of sequences
#
Getting the max length
of sequences
for
sequence
,
label
in
zip
(
sequences
,
labels
):
if
label
==
"exon"
:
if
len
(
sequence
)
>
max_exon_length
:
...
...
@@ -81,17 +83,24 @@ if __name__ == "__main__":
elif
label
==
"utr"
:
if
len
(
sequence
)
>
max_utr_length
:
max_utr_length
=
len
(
sequence
)
'''
print
(
"Max exon length: {}"
.
format
(
max_exon_length
))
print
(
"Max utr length: {}"
.
format
(
max_utr_length
))
quit
()
# Fill sequence with X char to get max length
# One-hot-encoding of sequences
for
sequence
,
label
in
zip
(
sequences
,
labels
):
if
label
==
"exon"
:
if
len
(
sequence
)
<
max_exon_length
:
sequence
.
ljust
(
max_exon_length
+
len
(
sequence
),
'X'
)
elif
label
==
"utr"
:
if
len
(
sequence
)
<
max_utr_length
:
sequence
.
ljust
(
max_utr_length
+
len
(
sequence
),
'X'
)
integer_encoded
=
integer_encoder
.
fit_transform
(
list
(
sequence
))
integer_encoded
=
np
.
array
(
integer_encoded
)
.
reshape
(
-
1
,
1
)
one_hot_encoded
=
one_hot_encoder
.
fit_transform
(
integer_encoded
)
input_features
.
append
(
one_hot_encoded
.
toarray
())
'''
print
(
"Max exon length: {}"
.
format
(
max_exon_length
))
print
(
"Max utr length: {}"
.
format
(
max_utr_length
))
exit
()
# Print first sequence and one-hot-encoding
np
.
set_printoptions
(
threshold
=
40
)
...
...
Please
register
or
login
to post a comment