Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
deep-learning-workshop
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2019-05-08 14:32:02 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
1abd40e73f7051e4fec747dae27559cc7f2e8354
1abd40e7
1 parent
c056ae1d
Deep Learning Workshop
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
208 additions
and
3 deletions
data-sets/get-hga-training-test-py27.py → data-sets/get-hga-training-test-py27-v1.py
data-sets/get-hga-training-test-py27-v2.py
data-sets/get-hga-training-test-py27.py
→
data-sets/get-hga-training-test-py27
-v1
.py
View file @
1abd40e
...
...
@@ -8,7 +8,7 @@
# Each sequence as a one-hot encoding WHAT array or matrix
# Run:
# python3 get-hga-training-test-py27.py
# python3 get-hga-training-test-py27
-v1
.py
# --inputFile hga-sequences-toy.txt
# --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --outputTraining hga-sequences-training.txt
...
...
@@ -18,13 +18,13 @@
# LAVIS
# qlogin
# python get-hga-training-test-py27.py
# python get-hga-training-test-py27
-v1
.py
# --inputFile hga-sequences-1000.txt
# --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py27.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py27
-v1
.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
import
argparse
import
pandas
as
pd
...
...
data-sets/get-hga-training-test-py27-v2.py
0 → 100644
View file @
1abd40e
# Get training and test data set for deep learning from sequence data set
# obtained from FASTA and HGA data sets (see script get-hga-sequences-py3.py)
# Input tab-separated format:
# Sequences: hga-sequences-toy.txt
# Output one-hot encoding format:
# Each sequence as a one-hot encoding WHAT array or matrix
# Run:
# python3 get-hga-training-test-py27-v1.py
# --inputFile hga-sequences-toy.txt
# --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py3.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# LAVIS
# qlogin
# python get-hga-training-test-py27-v1.py
# --inputFile hga-sequences-1000.txt
# --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py27-v1.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
import
argparse
import
pandas
as
pd
import
os
from
sklearn.preprocessing
import
LabelEncoder
,
OneHotEncoder
import
numpy
as
np
from
sklearn.model_selection
import
train_test_split
from
tensorflow.keras.layers
import
Conv1D
,
Dense
,
MaxPooling1D
,
Flatten
from
tensorflow.keras.models
import
Sequential
import
matplotlib.pyplot
as
plt
from
sklearn.metrics
import
confusion_matrix
import
itertools
if
__name__
==
"__main__"
:
parser
=
argparse
.
ArgumentParser
(
description
=
'Get training and test data sets for Human Genome Annotation.'
)
parser
.
add_argument
(
'--inputFile'
,
dest
=
'inputFile'
,
help
=
'Input file'
)
parser
.
add_argument
(
'--inputPath'
,
dest
=
'inputPath'
,
help
=
'Input path'
)
parser
.
add_argument
(
'--outputTraining'
,
dest
=
'outputTraining'
,
help
=
'Output training file'
)
parser
.
add_argument
(
'--outputValidation'
,
dest
=
'outputValidation'
,
help
=
'Output training file'
)
parser
.
add_argument
(
'--outputTest'
,
dest
=
'outputTest'
,
help
=
'Output test file'
)
parser
.
add_argument
(
'--outputPath'
,
dest
=
'outputPath'
,
help
=
'Output path for training, validation, and testing'
)
args
=
parser
.
parse_args
()
# To one-hot encoding taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
# The LabelEncoder encodes a sequence of bases as a sequence of integers.
integer_encoder
=
LabelEncoder
()
# The OneHotEncoder converts an array of integers to a sparse matrix where
# each row corresponds to one possible value of each feature.
one_hot_encoder
=
OneHotEncoder
(
categories
=
'auto'
)
input_features
=
[]
# Read file with sequences
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputFile
),
mode
=
"r"
)
as
tabfile
:
df
=
pd
.
read_csv
(
tabfile
,
delimiter
=
'
\t
'
)
print
(
"All rows in df: {}"
.
format
(
len
(
df
.
index
)))
df_filtered
=
df
.
loc
[(
df
[
'label'
]
==
"exon"
)
|
(
df
[
'label'
]
==
"utr"
)]
print
(
"Only exon and utr rows in df: {}"
.
format
(
len
(
df_filtered
.
index
)))
# print("df: {}".format(df))
sequences
=
df_filtered
[
'sequence'
]
labels
=
df_filtered
[
'label'
]
'''
max_exon_length = 0
max_utr_length = 0
# Getting the max length of sequences
for sequence, label in zip(sequences, labels):
if label == "exon":
if len(sequence) > max_exon_length:
max_exon_length = len(sequence)
elif label == "utr":
if len(sequence) > max_utr_length:
max_utr_length = len(sequence)
print("Max exon length: {}".format(max_exon_length))
print("Max utr length: {}".format(max_utr_length))
if max_exon_length > max_utr_length:
max_length = max_exon_length
else:
max_length = max_utr_length
print("Max length: {}".format(max_length))
# Fill sequence with X char to get max length
# One-hot-encoding of sequences
for sequence, label in zip(sequences, labels):
if len(sequence) < max_length:
# print("sequence: {}".format(sequence))
# print("Le falta: {}".format(max_length - len(sequence)))
sequence_adjust = sequence.ljust(max_length, 'X') + 'ACGTX'
# print("sequence_adjust: {}".format(sequence_adjust))
else:
sequence_adjust = sequence + 'ACGTX'
'''
# One-hot-encoding of sequences
for
sequence
,
label
in
zip
(
sequences
,
labels
):
sequence_adjust
=
sequence
+
'ACGTX'
print
(
"Length sequence_adjust: {}"
.
format
(
len
(
sequence_adjust
)))
integer_encoded
=
integer_encoder
.
fit_transform
(
list
(
sequence_adjust
))
print
(
"integer_encoded.classes_: {}"
.
format
(
integer_encoder
.
classes_
))
integer_encoded
=
np
.
array
(
integer_encoded
)
.
reshape
(
-
1
,
1
)
# print("integer_encoded: {}".format(integer_encoded))
one_hot_encoded
=
one_hot_encoder
.
fit_transform
(
integer_encoded
)
print
(
" shape: {}"
.
format
(
one_hot_encoded
.
toarray
()
.
shape
))
input_features
.
append
(
one_hot_encoded
.
toarray
())
# Print first sequence and one-hot-encoding
np
.
set_printoptions
(
threshold
=
40
)
input_features
=
np
.
stack
(
input_features
)
print
(
"Example sequence
\n
-----------------------"
)
print
(
'DNA Sequence #1:
\n
'
,
sequences
[
0
][:
10
],
'...'
,
sequences
[
0
][
-
10
:])
print
(
'One hot encoding of Sequence #1:
\n
'
,
input_features
[
0
]
.
T
)
# One-hot-encoding of labels
one_hot_encoder
=
OneHotEncoder
(
categories
=
'auto'
)
labels
=
np
.
array
(
labels
)
.
reshape
(
-
1
,
1
)
input_labels
=
one_hot_encoder
.
fit_transform
(
labels
)
.
toarray
()
# Print labels and one-hot-encoding
print
(
'Labels:
\n
'
,
labels
.
T
)
print
(
'One-hot encoded labels:
\n
'
,
input_labels
.
T
)
# Split one-hot-encoding data into training, and test data sets
train_features
,
test_features
,
train_labels
,
test_labels
=
train_test_split
(
input_features
,
input_labels
,
test_size
=
0.25
,
random_state
=
42
)
# Model definition
model
=
Sequential
()
model
.
add
(
Conv1D
(
filters
=
32
,
kernel_size
=
12
,
input_shape
=
(
train_features
.
shape
[
1
],
4
)))
model
.
add
(
MaxPooling1D
(
pool_size
=
4
))
model
.
add
(
Flatten
())
model
.
add
(
Dense
(
16
,
activation
=
'relu'
))
model
.
add
(
Dense
(
2
,
activation
=
'softmax'
))
model
.
compile
(
loss
=
'binary_crossentropy'
,
optimizer
=
'adam'
,
metrics
=
[
'binary_accuracy'
])
model
.
summary
()
# Model training and validation
history
=
model
.
fit
(
train_features
,
train_labels
,
epochs
=
50
,
verbose
=
0
,
validation_split
=
0.25
)
# Plot training-validation loss
plt
.
figure
()
plt
.
plot
(
history
.
history
[
'loss'
])
plt
.
plot
(
history
.
history
[
'val_loss'
])
plt
.
title
(
'model loss'
)
plt
.
ylabel
(
'loss'
)
plt
.
xlabel
(
'epoch'
)
plt
.
legend
([
'train'
,
'validation'
])
# plt.show()
plt
.
savefig
(
'training-validation-loss.png'
)
# Plot training-validation accuracy
plt
.
figure
()
plt
.
plot
(
history
.
history
[
'binary_accuracy'
])
plt
.
plot
(
history
.
history
[
'val_binary_accuracy'
])
plt
.
title
(
'model accuracy'
)
plt
.
ylabel
(
'accuracy'
)
plt
.
xlabel
(
'epoch'
)
plt
.
legend
([
'train'
,
'validation'
])
# plt.show()
plt
.
savefig
(
'training-validation-binary-accuracy.png'
)
# Predict with rest data set
predicted_labels
=
model
.
predict
(
np
.
stack
(
test_features
))
# Print confusion matrix
cm
=
confusion_matrix
(
np
.
argmax
(
test_labels
,
axis
=
1
),
np
.
argmax
(
predicted_labels
,
axis
=
1
))
print
(
'Confusion matrix:
\n
'
,
cm
)
cm
=
cm
.
astype
(
'float'
)
/
cm
.
sum
(
axis
=
1
)[:,
np
.
newaxis
]
# Plot confusion matrix
plt
.
imshow
(
cm
,
cmap
=
plt
.
cm
.
Blues
)
plt
.
title
(
'Normalized confusion matrix'
)
plt
.
colorbar
()
plt
.
xlabel
(
'True label'
)
plt
.
ylabel
(
'Predicted label'
)
plt
.
xticks
([
0
,
1
]);
plt
.
yticks
([
0
,
1
])
plt
.
grid
(
'off'
)
for
i
,
j
in
itertools
.
product
(
range
(
cm
.
shape
[
0
]),
range
(
cm
.
shape
[
1
])):
plt
.
text
(
j
,
i
,
format
(
cm
[
i
,
j
],
'.2f'
),
horizontalalignment
=
'center'
,
color
=
'white'
if
cm
[
i
,
j
]
>
0.5
else
'black'
)
plt
.
savefig
(
'training-validation-confusion-matrix.png'
)
Please
register
or
login
to post a comment