Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
deep-learning-workshop
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2019-05-08 18:08:29 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
4d536d405a43ac0f5ddefc59712ab3dd17846e2a
4d536d40
1 parent
dc0e4cbb
Deep Learning Workshop
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
26 additions
and
24 deletions
data-sets/get-hga-sequences-py3.py
data-sets/get-hga-sequences-py3.py
View file @
4d536d4
...
...
@@ -86,35 +86,37 @@ if __name__ == "__main__":
for
row
in
reader
:
# print(row)
filename
=
os
.
path
.
join
(
args
.
fastaPath
,
"Homo_sapiens.GRCh38.dna.chromosome.{}.fa"
.
format
(
row
[
'seqname'
]))
# We use only
sequence
=
get_sequence
(
filename
,
int
(
row
[
'start'
]),
int
(
row
[
'end'
]))
# Features in HGA:
# exon
# feature
# five_prime_utr
# gene
# Selenocysteine
# start_codon
# stop_codon
# three_prime_utr
length
=
int
(
row
[
'end'
])
-
int
(
row
[
'start'
])
+
1
if
row
[
'feature'
]
==
"exon"
:
label
=
row
[
'feature'
]
length_total_exon
+=
length
total_exon
+=
1
elif
row
[
'feature'
]
in
[
"five_prime_utr"
,
"three_prime_utr"
]:
label
=
"utr"
length_total_utr
+=
length
total_utr
+=
1
else
:
label
=
"other"
new_row
=
"{}
\t
{}
\t
{}
\t
{}
\t
{}
\t
{}
\n
"
.
format
(
row
[
'seqname'
],
row
[
'start'
],
row
[
'end'
],
length
,
sequence
,
label
)
list_rows
.
append
(
new_row
)
if
row
[
'feature'
]
in
[
"exon"
,
"five_prime_utr"
,
"three_prime_utr"
]:
# We use only exon, five_prime_utr, and three_prime_utr
sequence
=
get_sequence
(
filename
,
int
(
row
[
'start'
]),
int
(
row
[
'end'
]))
# Features in HGA:
# exon
# feature
# five_prime_utr
# gene
# Selenocysteine
# start_codon
# stop_codon
# three_prime_utr
length
=
int
(
row
[
'end'
])
-
int
(
row
[
'start'
])
+
1
if
row
[
'feature'
]
==
"exon"
:
label
=
row
[
'feature'
]
length_total_exon
+=
length
total_exon
+=
1
elif
row
[
'feature'
]
in
[
"five_prime_utr"
,
"three_prime_utr"
]:
label
=
"utr"
length_total_utr
+=
length
total_utr
+=
1
else
:
label
=
"other"
new_row
=
"{}
\t
{}
\t
{}
\t
{}
\t
{}
\t
{}
\n
"
.
format
(
row
[
'seqname'
],
row
[
'start'
],
row
[
'end'
],
length
,
sequence
,
label
)
list_rows
.
append
(
new_row
)
i
+=
1
if
(
i
%
100
)
==
0
:
print
(
"{} rows processed."
.
format
(
i
))
if
i
==
10000
:
break
print
(
"Count exons {} and utr {}"
.
format
(
total_exon
,
total_utr
))
print
(
"Length media exons {} and utr {}"
.
format
(
length_total_exon
/
total_exon
,
length_total_utr
/
total_utr
))
with
open
(
os
.
path
.
join
(
args
.
outputPath
,
args
.
outputFile
),
mode
=
"w"
)
as
oFile
:
oFile
.
write
(
"seqname
\t
start
\t
end
\t
length
\t
sequence
\t
label
\n
"
)
...
...
Please
register
or
login
to post a comment