Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
deep-learning-workshop
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2019-05-08 12:19:16 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
347df9f6f2aaba364ab22da58a89326a42f7e4ab
347df9f6
1 parent
7ae410f3
Deep Learning Workshop
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
294 additions
and
1 deletions
data-sets/get-hga-sequences-py27.py
data-sets/get-hga-training-test-py27.py
data-sets/get-hga-training-test.py
data-sets/get-hga-sequences-py27.py
0 → 100644
View file @
347df9f
# Get sequences by combining Human Genome Annotation data set (csv)
# with FASTA files to obtain sequences corresponding to object in human genome
# using "start" and "end" columns from human-genome-annotation
# Install BioPython: conda install -c conda-forge biopython
# Input files:
# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# Output tab-separated format:
# Start End Sequence Feature
# Run:
# c:\Anaconda3\python get-hga-data-set.py
# --feature gene
# --outputFile hga-sequences-toy.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --hgaFile some-rows-example-human-genome-annotation.csv
# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences-toy.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
# python3 get-hga-sequences.py
# --feature gene
# --outputFile hga-sequences-toy.txt
# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --hgaFile Homo_sapiens.GRCh38.92.csv
# --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
import
argparse
# from Bio import SeqIO
import
csv
import
os
from
Bio.SeqIO.FastaIO
import
SimpleFastaParser
def get_total_len(filename):
    """Summarize a FASTA file: return '<N> records with total sequence length <L>'.

    Iterates every record in *filename* with Biopython's SimpleFastaParser,
    counting records and accumulating the lengths of their sequences.
    """
    record_count = 0
    sequence_length_sum = 0
    with open(filename) as handle:
        for _title, record_seq in SimpleFastaParser(handle):
            record_count += 1
            sequence_length_sum += len(record_seq)
    # Keep the exact report format expected by callers/logs.
    return "{} records with total sequence length {}".format(
        record_count, sequence_length_sum)
def get_sequence(filename, start, end):
    """Return the subsequence covering annotation coordinates [start, end].

    :param filename: path to a FASTA file (one chromosome per file; if the
        file holds several records, the slice from the LAST record wins —
        unchanged from the original behavior).
    :param start: 1-based inclusive start coordinate (Ensembl annotation).
    :param end: 1-based inclusive end coordinate.
    :return: the sliced sequence, or "" if the file has no records.
    """
    ret_sequence = ""
    with open(filename) as in_handle:
        for _title, seq in SimpleFastaParser(in_handle):
            # Ensembl GTF/GFF coordinates are 1-based and end-inclusive,
            # while Python slices are 0-based and end-exclusive, so the
            # correct slice is seq[start - 1:end].  The previous
            # seq[start:end + 1] returned a window of the right length but
            # shifted one base to the right.
            ret_sequence = seq[start - 1:end]
    return ret_sequence
if __name__ == "__main__":
    # Command-line interface.  NOTE(review): despite the "-py27" file name,
    # open(..., encoding=...) below is Python 3 only — confirm the intended
    # interpreter.
    parser = argparse.ArgumentParser(
        description='Get source data set for Human Genome Annotation.')
    parser.add_argument('--fastaPath', dest='fastaPath',
                        help='Path for FASTA files')
    parser.add_argument('--hgaPath', dest='hgaPath',
                        help='Path for Human Genome Annotation file')
    parser.add_argument('--hgaFile', dest='hgaFile',
                        help='Human Genome Annotation file')
    parser.add_argument('--outputPath', dest='outputPath', help='Output path')
    parser.add_argument('--outputFile', dest='outputFile', help='Output file')
    parser.add_argument('--feature', dest='feature', help='Feature (gene, exon)')
    args = parser.parse_args()

    list_rows = []          # accumulated output lines (written at the end)
    i = 0                   # rows processed so far
    length = 0
    length_total_exon = 0   # summed length of exon features
    length_total_utr = 0    # summed length of 5'/3' UTR features
    total_exon = 0          # number of exon rows
    total_utr = 0           # number of UTR rows

    # Read HGA csv file and fetch the sequence for each annotated interval.
    with open(os.path.join(args.hgaPath, args.hgaFile),
              mode="r", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # One FASTA file per chromosome, named by the 'seqname' column.
            filename = os.path.join(
                args.fastaPath,
                "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
            sequence = get_sequence(filename, int(row['start']), int(row['end']))
            # Annotation coordinates are end-inclusive, hence the +1.
            length = int(row['end']) - int(row['start']) + 1
            # Features observed in HGA: exon, five_prime_utr, gene,
            # Selenocysteine, start_codon, stop_codon, three_prime_utr.
            if row['feature'] == "exon":
                label = row['feature']
                length_total_exon += length
                total_exon += 1
            elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
                # Collapse both UTR kinds into a single 'utr' class.
                label = "utr"
                length_total_utr += length
                total_utr += 1
            else:
                label = "other"
            new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(
                row['seqname'], row['start'], row['end'],
                length, sequence, label)
            list_rows.append(new_row)
            i += 1
            if (i % 100) == 0:
                print("{} rows processed.".format(i))
            # Hard cap: stop after 10000 rows (keeps runtime/output bounded).
            if i == 10000:
                break

    # Guard against ZeroDivisionError when no exon or UTR rows were seen
    # (e.g. a small/filtered input file).
    if total_exon > 0 and total_utr > 0:
        print("Length media exons {} and utr {}".format(
            length_total_exon / total_exon, length_total_utr / total_utr))
    else:
        print("No exon or utr rows found; skipping mean-length report.")

    # Write the tab-separated output with a header line.
    with open(os.path.join(args.outputPath, args.outputFile),
              mode="w") as oFile:
        oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
        for elem in list_rows:
            oFile.write(elem)
data-sets/get-hga-training-test-py27.py
0 → 100644
View file @
347df9f
# Get training and test data set for deep learning from sequence data set
# obtained from FASTA and HGA data sets (see script get-hga-sequences.py)
# Input tab-separated format:
# Sequences: hga-sequences-toy.txt
# Output one-hot encoding format:
# Each sequence as a one-hot encoding WHAT array or matrix
# Run:
# python3 get-hga-training-test.py
# --inputFile hga-sequences-toy.txt
# --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
import
argparse
import
pandas
as
pd
import
os
from
sklearn.preprocessing
import
LabelEncoder
,
OneHotEncoder
import
numpy
as
np
from
sklearn.model_selection
import
train_test_split
from
tensorflow.keras.layers
import
Conv1D
,
Dense
,
MaxPooling1D
,
Flatten
from
tensorflow.keras.models
import
Sequential
import
matplotlib.pyplot
as
plt
from
sklearn.metrics
import
confusion_matrix
import
itertools
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile', help='Input file')
    parser.add_argument('--inputPath', dest='inputPath', help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining',
                        help='Output training file')
    # Fixed copy-pasted help text (previously said "Output training file").
    parser.add_argument('--outputValidation', dest='outputValidation',
                        help='Output validation file')
    parser.add_argument('--outputTest', dest='outputTest',
                        help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath',
                        help='Output path for training, validation, and testing')
    args = parser.parse_args()

    # To one-hot encoding taken from:
    # https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
    integer_encoder = LabelEncoder()
    # The OneHotEncoder converts an array of integers to a sparse matrix where
    # each row corresponds to one possible value of each feature.
    one_hot_encoder = OneHotEncoder(categories='auto')
    input_features = []
    sequences = []

    # Read the tab-separated file with sequences and labels.
    with open(os.path.join(args.inputPath, args.inputFile),
              mode="r", encoding="utf-8") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
        print("df: {}".format(df))
        sequences = df['sequence']
        labels = df['label']

    # Scan for the longest exon/UTR sequence (needed to pick a fixed input
    # length before one-hot encoding).
    max_exon_length = 0
    max_utr_length = 0
    for sequence, label in zip(sequences, labels):
        if label == "exon":
            if len(sequence) > max_exon_length:
                max_exon_length = len(sequence)
        elif label == "utr":
            if len(sequence) > max_utr_length:
                max_utr_length = len(sequence)
        # Per-sequence one-hot encoding, disabled while only gathering length
        # statistics:
        # integer_encoded = integer_encoder.fit_transform(list(sequence))
        # integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        # one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
        # input_features.append(one_hot_encoded.toarray())
    print("Max exon length: {}".format(max_exon_length))
    print("Max utr length: {}".format(max_utr_length))

    # NOTE(review): the script intentionally stops here; everything below is
    # unreachable until this exit() is removed.  When re-enabling it, the
    # encoding block in the loop above must be re-enabled too, otherwise
    # np.stack(input_features) fails on an empty list.
    exit()

    # Print first sequence and one-hot-encoding.
    np.set_printoptions(threshold=40)
    input_features = np.stack(input_features)
    print("Example sequence\n-----------------------")
    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
    print('One hot encoding of Sequence #1:\n', input_features[0].T)

    # One-hot-encoding of labels.
    one_hot_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = one_hot_encoder.fit_transform(labels).toarray()
    print('Labels:\n', labels.T)
    print('One-hot encoded labels:\n', input_labels.T)

    # Split one-hot-encoded data into training and test sets.
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)

    # Model definition: 1-D convolution over one-hot (length, 4) sequences.
    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=12,
                     input_shape=(train_features.shape[1], 4)))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Flatten())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    # NOTE(review): binary_crossentropy with a 2-unit softmax works
    # element-wise in Keras but categorical_crossentropy is the matching loss
    # for one-hot 2-class targets — consider switching (history keys would
    # then be 'categorical_accuracy'/'val_categorical_accuracy').
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['binary_accuracy'])
    model.summary()

    # Model training and validation.
    history = model.fit(train_features, train_labels, epochs=50, verbose=0,
                        validation_split=0.25)

    # Plot training-validation loss.
    plt.figure()
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'])
    # plt.show()
    plt.savefig('training-validation-loss.png')

    # Plot training-validation accuracy.
    plt.figure()
    plt.plot(history.history['binary_accuracy'])
    plt.plot(history.history['val_binary_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'])
    # plt.show()
    plt.savefig('training-validation-binary-accuracy.png')

    # Predict on the held-out test set.
    predicted_labels = model.predict(np.stack(test_features))

    # Confusion matrix: rows are TRUE classes, columns are PREDICTED classes.
    cm = confusion_matrix(np.argmax(test_labels, axis=1),
                          np.argmax(predicted_labels, axis=1))
    print('Confusion matrix:\n', cm)
    # Normalize each row (per true class).
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Plot normalized confusion matrix.
    plt.imshow(cm, cmap=plt.cm.Blues)
    plt.title('Normalized confusion matrix')
    plt.colorbar()
    # Fixed swapped axis labels: imshow puts matrix rows on the y axis, and
    # sklearn's confusion-matrix rows are the true labels.
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.xticks([0, 1])
    plt.yticks([0, 1])
    # Fixed plt.grid('off'): the non-empty string 'off' is truthy, so it
    # actually turned the grid ON; pass False to disable it.
    plt.grid(False)
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > 0.5 else 'black')
    plt.savefig('training-validation-confusion-matrix.png')
data-sets/get-hga-training-test.py
View file @
347df9f
...
...
@@ -14,7 +14,7 @@
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python
3 get-hga-training-test.py --inputFile hga-sequences-toy
.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python
get-hga-training-test.py --inputFile hga-sequences-1000
.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
import
argparse
import
pandas
as
pd
...
...
Please
register
or
login
to post a comment