Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
deep-learning-workshop
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2019-05-02 22:40:19 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
fd6da844a387cbbfb806b52c61ddb47fa8c976dc
fd6da844
1 parent
aa056f22
Deep Learning Workshop
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
14 additions
and
4 deletions
data-sets/get-hga-sequences.py
data-sets/get-hga-sequences.py
View file @
fd6da84
...
...
@@ -74,6 +74,11 @@ if __name__ == "__main__":
list_rows
=
[]
i
=
0
length
=
0
length_total_exon
=
0
length_total_utr
=
0
total_exon
=
0
total_utr
=
0
# Read HGA csv file
with
open
(
os
.
path
.
join
(
args
.
hgaPath
,
args
.
hgaFile
),
mode
=
"r"
,
encoding
=
"utf-8"
)
as
csvfile
:
reader
=
csv
.
DictReader
(
csvfile
)
...
...
@@ -91,22 +96,27 @@ if __name__ == "__main__":
# start_codon
# stop_codon
# three_prime_utr
length
=
int
(
row
[
'end'
])
-
int
(
row
[
'start'
])
+
1
if
row
[
'feature'
]
==
"exon"
:
label
=
row
[
'feature'
]
length_total_exon
+=
length
total_exon
+=
1
elif
row
[
'feature'
]
in
[
"five_prime_utr"
,
"three_prime_utr"
]:
label
=
"utr"
length_total_utr
+=
length
total_utr
+=
1
else
:
label
=
"other"
new_row
=
"{}
\t
{}
\t
{}
\t
{}
\t
{}
\
n
"
.
format
(
row
[
'seqname'
],
row
[
'start'
],
row
[
'end'
]
,
sequence
,
label
)
new_row
=
"{}
\t
{}
\t
{}
\t
{}
\t
{}
\
t
{}
\n
"
.
format
(
row
[
'seqname'
],
row
[
'start'
],
row
[
'end'
],
length
,
sequence
,
label
)
list_rows
.
append
(
new_row
)
i
+=
1
if
(
i
%
100
)
==
0
:
print
(
"{} rows processed."
.
format
(
i
))
if
i
==
10000
:
break
print
(
"Length media exons {} and utr {}"
.
format
(
length_total_exon
/
total_exon
,
length_total_utr
/
total_utr
))
with
open
(
os
.
path
.
join
(
args
.
outputPath
,
args
.
outputFile
),
mode
=
"w"
)
as
oFile
:
oFile
.
write
(
"seqname
\t
start
\t
end
\t
sequence
\t
label
\n
"
)
oFile
.
write
(
"seqname
\t
start
\t
end
\t
length
\t
sequence
\t
label
\n
"
)
for
elem
in
list_rows
:
oFile
.
write
(
elem
)
...
...
Please
register
or
login
to post a comment