Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
from keras.utils import np_utils
from keras import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPool1D, MaxPool2D, Flatten, BatchNormalization
from keras.regularizers import l1_l2
# Load MNIST data set
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train.astype('float64') / 255.0
x_test = x_test.astype('float64') / 255.0
x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], x_train.shape[2], 1))
x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], x_test.shape[2], 1))
y_train = np_utils.to_categorical(y_train)
y_test = np_utils.to_categorical(y_test)
print("Training set:")
print(x_train.shape)
print(y_train.shape)
print("Test set:")
print(x_test.shape)
print(y_test.shape)
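# Quick sanity check on the loaded data (a minimal sketch, assuming an
# interactive matplotlib backend): show a few digits with labels decoded
# from the one-hot vectors via argmax.
fig, axes = plt.subplots(1, 5, figsize=(10, 2))
for i, ax in enumerate(axes):
    ax.imshow(x_train[i, :, :, 0], cmap='gray')
    ax.set_title(int(np.argmax(y_train[i])))
    ax.axis('off')
plt.show()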
# Split train into train and validation
validation_rate = 0.2
n_val_samples = round(validation_rate * len(x_train))
print("Taking {} validation samples".format(n_val_samples))
x_val = x_train[:n_val_samples]
y_val = y_train[:n_val_samples]
x_train = x_train[n_val_samples:]
y_train = y_train[n_val_samples:]
print("Training set:")
print(x_train.shape)
print(y_train.shape)
print("Validation set:")
print(x_val.shape)
print(y_val.shape)
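# Note: this split takes the first 20% of samples as validation, which assumes
# the data is not ordered by class (MNIST as shipped is effectively shuffled).
# A safer sketch would permute with a fixed seed before slicing, e.g.:
#   perm = np.random.RandomState(42).permutation(len(x_train))
#   x_train, y_train = x_train[perm], y_train[perm]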
# Build model architecture (layers with activations)
_, n_rows, n_cols, n_chans = x_train.shape # Shape of input data
model = Sequential()
# Feature learning
## Conv Layer 1
model.add(Conv2D(input_shape=(n_rows, n_cols, n_chans), filters=16, kernel_size=(3, 3), activation='relu', padding='same'))
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
## Conv Layer 2
model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPool2D(pool_size=(2, 2)))
## Conv Layer 3
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(MaxPool2D(pool_size=(2, 2)))
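# Shape trace for this feature-learning stack (computed from the layer
# definitions above, input 28x28x1): 28x28x16 -> 26x26x32 -> pool 13x13x32
# -> 11x11x32 -> pool 5x5x32 -> 5x5x64 -> pool 2x2x64, so Flatten yields 256 units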
# Classification
model.add(Flatten())
model.add(BatchNormalization())
model.add(Dense(64, activation='relu', kernel_regularizer=l1_l2(9e-4)))
model.add(Dense(units=y_train.shape[1], activation='softmax'))
model.summary()
# Compile model (define optimizer and loss function)
model.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])
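# Note: with older standalone Keras the 'adadelta' string uses a learning rate
# of 1.0, while recent tf.keras defaults it to 0.001, which can make this model
# look like it is not learning. If training stalls, an explicit optimizer is a
# reasonable alternative (a sketch, assuming a recent Keras API):
#   from keras.optimizers import Adadelta
#   model.compile(optimizer=Adadelta(learning_rate=1.0),
#                 loss='categorical_crossentropy', metrics=['accuracy'])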
# Train your model
num_epochs = 15
losses = np.zeros((num_epochs, 2))
accura = np.zeros((num_epochs, 2))
print("Training on {x_train.shape[0]} samples - validating on {x_val.shape[0]} samples.")
for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1:3d} -- ", end="")
    model.fit(x_train, y_train, epochs=1, batch_size=256, verbose=False)
    losses[epoch, 0], accura[epoch, 0] = model.evaluate(x_train, y_train, verbose=False)
    losses[epoch, 1], accura[epoch, 1] = model.evaluate(x_val, y_val, verbose=False)
    print(f"Train loss: {losses[epoch, 0]:6.4f}, acc: {accura[epoch, 0]:6.4f} -- Val loss: {losses[epoch, 1]:6.4f}, acc: {accura[epoch, 1]:6.4f}")
# Plot training history
plt.figure(figsize=(15, 10))
plt.plot(losses[:, 0], label='Loss: Training', linewidth=2)
plt.plot(losses[:, 1], label='Loss: Validation', linewidth=2)
plt.plot(accura[:, 0], label='Accu: Training', linewidth=2)
plt.plot(accura[:, 1], label='Accu: Validation', linewidth=2)
plt.legend(fontsize=18)
plt.xlabel("Epoch", fontsize=18)
plt.ylabel("Loss", fontsize=18)
plt.xticks(np.arange(1, len(losses)))
plt.tick_params(labelsize=18)
plt.grid()
y_hat = model.predict(x_test)
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test loss: {:6.4f}, acc: {:6.4f}".format(test_loss, test_acc))
# Load Human Genome Annotation (HGA) data set
# GRCh38.92
# From https://www.kaggle.com/alfrandom/human-genome-annotation
hga_csv = "Homo_sapiens.GRCh38.92.csv"  # hypothetical file name for the downloaded Kaggle CSV
# FASTA data from ftp://ftp.ensembl.org/pub/release-96/fasta/homo_sapiens/dna/
# Human Genome Annotation data from https://www.kaggle.com/alfrandom/human-genome-annotation
# Get source data set by combining the Human Genome Annotation data set (csv)
# with FASTA files to obtain the sequences corresponding to objects in the
# human genome, using the "start" and "end" columns from human-genome-annotation
# Input files:
# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# Output tab-separated format:
# Start End Sequence Feature
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    parser.add_argument('--fastaPath', dest='fastaPath',
                        default='/home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna',
                        help='path to the directory with FASTA files for all chromosomes')
    args = parser.parse_args()
    print("Reading FASTA files from: {}".format(args.fastaPath))
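# A hedged sketch of the extraction step described in the header comments.
# The annotation column names ('start', 'end', 'feature') follow the Kaggle
# CSV, and the one-FASTA-file-per-chromosome layout is an assumption.
import pandas as pd

def read_fasta(path):
    # Concatenate the sequence lines of a FASTA file, skipping '>' headers
    with open(path) as fh:
        return ''.join(line.strip() for line in fh if not line.startswith('>'))

def extract_sequences(annotation_csv, chromosome_fasta, output_tsv):
    # Slice the chromosome sequence for each annotated object and write TSV rows
    ann = pd.read_csv(annotation_csv)
    chrom = read_fasta(chromosome_fasta)
    with open(output_tsv, 'w') as out:
        out.write("Start\tEnd\tSequence\tFeature\n")
        for _, row in ann.iterrows():
            # Annotation coordinates are 1-based and end-inclusive (GTF convention)
            seq = chrom[int(row['start']) - 1:int(row['end'])]
            out.write("{}\t{}\t{}\t{}\n".format(
                row['start'], row['end'], seq, row['feature']))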