Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
deep-learning-workshop
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2019-04-26 14:51:52 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
f61a4234e1c3fac6f576ac3e54147eeb6e9d6cfc
f61a4234
1 parent
e6cd562a
Deep Learning Workshop
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
170 additions
and
21 deletions
data-sets/get-hga-sequences.py
data-sets/get-hga-training-test.py
data-sets/human-genome-annotation/get-hga-data-set.py
data-sets/get-hga-sequences.py
0 → 100644
View file @
f61a423
# Get sequences by combining Human Genome Annotation data set (csv)
# with FASTA files to obtain sequences corresponding to object in human genome
# using "start" and "end" columns from human-genome-annotation
# Install BioPython: conda install -c conda-forge biopython
# Input files:
# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# Output tab-separated format:
# seqname start end sequence label
# Run:
# c:\Anaconda3\python get-hga-sequences.py
# --feature gene
# --outputFile hga-sequences.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --hgaFile some-rows-example-human-genome-annotation.csv
# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
# c:\Anaconda3\python get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
import
argparse
# from Bio import SeqIO
import
csv
import
os
from
Bio.SeqIO.FastaIO
import
SimpleFastaParser
def get_total_len(filename):
    """Summarise a FASTA file as a record count and a combined sequence length.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A string of the form
        ``"<count> records with total sequence length <total>"``.
    """
    with open(filename) as fasta_handle:
        # One entry per FASTA record; the record titles are not needed.
        lengths = [len(bases) for _header, bases in SimpleFastaParser(fasta_handle)]
    return "{} records with total sequence length {}".format(len(lengths), sum(lengths))
def get_sequence(filename, start, end):
    """Return the bases between ``start`` and ``end`` of a FASTA record.

    ``start`` and ``end`` are interpreted as 1-based, inclusive positions and
    converted to the equivalent 0-based, half-open Python slice.

    NOTE(review): this assumes the annotation CSV keeps GFF/GTF-style
    coordinates (numbering starts at 1, end inclusive) -- confirm against the
    data set.

    Args:
        filename: Path of the FASTA file to read.
        start: Position of the first base to return (1-based, inclusive).
        end: Position of the last base to return (1-based, inclusive).

    Returns:
        The requested subsequence as a string. It is empty when the file holds
        no records; when the file holds several records, the slice taken from
        the last record is returned.
    """
    # 1-based inclusive [start, end] is the 0-based half-open [start - 1, end).
    # Clamp at 0 so a start below 1 cannot become a negative index, which
    # would silently slice from the end of the sequence.
    first = max(start - 1, 0)
    ret_sequence = ""
    with open(filename) as in_handle:
        for _title, seq in SimpleFastaParser(in_handle):
            ret_sequence = seq[first:end]
    return ret_sequence
if __name__ == "__main__":
    # Script entry point: for every row of the Human Genome Annotation CSV,
    # look up the matching chromosome FASTA file, extract the sequence between
    # the row's "start" and "end", and write one tab-separated line per row.
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    # The path/file options all feed os.path.join() below, so they are
    # required: a missing one is reported by argparse instead of surfacing
    # later as a TypeError.
    parser.add_argument('--fastaPath', dest='fastaPath', required=True, help='Path for FASTA files')
    parser.add_argument('--hgaPath', dest='hgaPath', required=True, help='Path for Human Genome Annotation file')
    parser.add_argument('--hgaFile', dest='hgaFile', required=True, help='Human Genome Annotation file')
    parser.add_argument('--outputPath', dest='outputPath', required=True, help='Output path')
    parser.add_argument('--outputFile', dest='outputFile', required=True, help='Output file')
    # Optional: when omitted, no row matches and every row is labelled "other".
    parser.add_argument('--feature', dest='feature', help='Feature (gene, exon)')
    args = parser.parse_args()

    hga_file = os.path.join(args.hgaPath, args.hgaFile)
    output_file = os.path.join(args.outputPath, args.outputFile)

    # Rows are streamed straight to the output file rather than collected in a
    # list first, so memory use does not grow with the number of (potentially
    # very long) sequences. The input is opened first so a missing CSV does
    # not leave an empty output file behind.
    # newline="" is what the csv module expects for files it reads.
    with open(hga_file, mode="r", encoding="utf-8", newline="") as csvfile, \
            open(output_file, mode="w", encoding="utf-8") as oFile:
        oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
        for row in csv.DictReader(csvfile):
            filename = os.path.join(
                args.fastaPath,
                "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
            # NOTE(review): get_sequence() opens and parses the FASTA file on
            # every call, i.e. once per CSV row.
            sequence = get_sequence(filename, int(row['start']), int(row['end']))
            # Binary labelling: the requested feature keeps its name, every
            # other feature becomes "other".
            label = row['feature'] if row['feature'] == args.feature else "other"
            oFile.write("{}\t{}\t{}\t{}\t{}\n".format(
                row['seqname'], row['start'], row['end'], sequence, label))
data-sets/get-hga-training-test.py
0 → 100644
View file @
f61a423
# Get training and test data set for deep learning from sequence data set
# obtained from FASTA and HGA data sets (see script get-hga-sequences.py)
# Input tab-separated format:
# Sequences: hga-sequences.txt
# Output one-hot encoding format:
# Each sequence as a one-hot encoded matrix (one row per base)
# Run:
# c:\Anaconda3\python get-hga-training-test.py
# --inputFile hga-sequences.txt
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
import
argparse
import
pandas
as
pd
import
os
from
sklearn.preprocessing
import
LabelEncoder
,
OneHotEncoder
import
numpy
as
np
from
sklearn.model_selection
import
train_test_split
if __name__ == "__main__":
    # Script entry point: read the tab-separated sequence file, one-hot encode
    # sequences and labels, split them into training and test sets, and write
    # each set to its own file.
    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile', required=True, help='Input file')
    parser.add_argument('--inputPath', dest='inputPath', required=True, help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining', required=True, help='Output training file')
    # Accepted for command-line compatibility; only a training/test split is
    # produced below, so no validation file is written.
    parser.add_argument('--outputValidation', dest='outputValidation', help='Output validation file')
    parser.add_argument('--outputTest', dest='outputTest', required=True, help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath', required=True, help='Output path for training, validation, and testing')
    args = parser.parse_args()

    # Read file with sequences (directory first, then file name).
    with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
    sequences = df['sequence']
    labels = df['label']
    if sequences.empty:
        raise SystemExit("No sequences found in input file")

    # To one-hot encoding adapted from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
    # It is fitted once on every symbol seen in the input, so a given base
    # maps to the same integer (and the same one-hot column) in every sequence.
    alphabet = sorted(set().union(*sequences))
    integer_encoder = LabelEncoder()
    integer_encoder.fit(alphabet)
    # The OneHotEncoder converts an array of integers to a sparse matrix with
    # one column per possible value; fixing the categories keeps the column
    # count identical even for sequences that lack some symbol.
    sequence_encoder = OneHotEncoder(categories=[np.arange(len(alphabet))])

    # One-hot-encoding of sequences
    input_features = []
    for sequence in sequences:
        integer_encoded = integer_encoder.transform(list(sequence)).reshape(-1, 1)
        one_hot_encoded = sequence_encoder.fit_transform(integer_encoded)
        input_features.append(one_hot_encoded.toarray())

    # Print first sequence and one-hot-encoding
    np.set_printoptions(threshold=40)
    # np.stack needs every per-sequence matrix to have the same shape, i.e.
    # all sequences must have the same length.
    input_features = np.stack(input_features)
    print("Example sequence\n-----------------------")
    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
    print('One hot encoding of Sequence #1:\n', input_features[0].T)

    # One-hot-encoding of labels
    label_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = label_encoder.fit_transform(labels).toarray()

    # Print labels and one-hot-encoding
    print('Labels:\n', labels.T)
    print('One-hot encoded labels:\n', input_labels.T)

    # Split one-hot-encoding data into training, and test data sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)

    # Write one tab-separated file per split. Each line holds the flattened
    # one-hot matrix of one sequence followed by its one-hot label; the
    # commented header line records how many columns belong to each part.
    # NOTE(review): confirm this layout matches what the training code expects.
    splits = (
        (args.outputTraining, train_features, train_labels),
        (args.outputTest, test_features, test_labels),
    )
    for out_name, features, targets in splits:
        flat_features = features.reshape(len(features), -1)
        table = np.hstack([flat_features, targets])
        np.savetxt(
            os.path.join(args.outputPath, out_name),
            table,
            fmt="%d",
            delimiter="\t",
            header="features={} labels={}".format(flat_features.shape[1], targets.shape[1]))
data-sets/human-genome-annotation/get-hga-data-set.py
deleted
100644 → 0
View file @
e6cd562
# Get source data set by combining Human Genome Annotation data set (csv)
# with FASTA files to obtain sequences corresponding to object in human genome
# using "start" and "end" columns from human-genome-annotation
# Input files:
# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# Output tab-separated format:
# Start End Sequence Feature
import
argparse
if __name__ == "__main__":
    # Script entry point: parse the command line and echo the FASTA path.
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    # --fastaPath takes a plain string value.
    parser.add_argument('--fastaPath', dest='fastaPath', help='Path for FASTA files')
    args = parser.parse_args()
    # The only option defined on the parser is fastaPath, so that is what
    # gets reported.
    print(args.fastaPath)
Please
register
or
login
to post a comment