Showing 4 changed files with 141 additions and 1 deletion
cnn/cnn-Larisa.py
0 → 100644
+import numpy as np
+import matplotlib.pyplot as plt
+from keras.datasets import mnist
+from keras.utils import np_utils
+from keras import Sequential
+from keras.layers import Dense, Dropout, Conv2D, MaxPool1D, MaxPool2D, Flatten, BatchNormalization
+from keras.regularizers import l1_l2
+
+# Load MNIST data set
+
+(x_train, y_train), (x_test, y_test) = mnist.load_data()
+
+# Scale pixel values to [0, 1]
+x_train = x_train.astype('float64') / 255.0
+x_test = x_test.astype('float64') / 255.0
+
+# Add a trailing channel dimension: (samples, rows, cols, 1)
+x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], x_train.shape[2], 1))
+x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], x_test.shape[2], 1))
+
+# One-hot encode the labels
+y_train = np_utils.to_categorical(y_train)
+y_test = np_utils.to_categorical(y_test)
+
+print("Training set:")
+print(x_train.shape)
+print(y_train.shape)
+
+print("Test set:")
+print(x_test.shape)
+print(y_test.shape)
+
+# Split train into train and validation
+
+validation_rate = 0.2
+n_val_samples = round(validation_rate * len(x_train))
+print("Taking {} validation samples".format(n_val_samples))
+
+x_val = x_train[:n_val_samples]
+y_val = y_train[:n_val_samples]
+
+x_train = x_train[n_val_samples:]
+y_train = y_train[n_val_samples:]
+
+print("Training set:")
+print(x_train.shape)
+print(y_train.shape)
+
+print("Validation set:")
+print(x_val.shape)
+print(y_val.shape)
+
+# Build model architecture (layers with activations)
+
+_, n_rows, n_cols, n_chans = x_train.shape  # Shape of input data
+model = Sequential()
+
+# Feature learning
+
+## Conv Layer 1
+model.add(Conv2D(input_shape=(n_rows, n_cols, n_chans), filters=16, kernel_size=(3, 3), activation='relu', padding='same'))
+model.add(Conv2D(32, (3, 3), activation='relu'))
+model.add(MaxPool2D(pool_size=(2, 2)))
+## Conv Layer 2
+model.add(Conv2D(32, (3, 3), activation='relu'))
+model.add(MaxPool2D(pool_size=(2, 2)))
+## Conv Layer 3
+model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
+model.add(MaxPool2D(pool_size=(2, 2)))
+
+# Classification
+
+model.add(Flatten())
+model.add(BatchNormalization())
+model.add(Dense(64, activation='relu', kernel_regularizer=l1_l2(9e-4)))
+model.add(Dense(units=y_train.shape[1], activation='softmax'))
+model.summary()
+
+# Compile model (define optimizer and loss function)
+model.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])
+
+# Train the model, recording loss and accuracy after every epoch
+num_epochs = 15
+losses = np.zeros((num_epochs, 2))
+accura = np.zeros((num_epochs, 2))
+print(f"Training on {x_train.shape[0]} samples - validating on {x_val.shape[0]} samples.")
+for epoch in range(num_epochs):
+    print(f"Epoch: {epoch+1:3d} -- ", end="")
+    model.fit(x_train, y_train, epochs=1, batch_size=256, verbose=False)
+    losses[epoch, 0], accura[epoch, 0] = model.evaluate(x_train, y_train, verbose=False)
+    losses[epoch, 1], accura[epoch, 1] = model.evaluate(x_val, y_val, verbose=False)
+    print(f"Train loss: {losses[epoch, 0]:6.4f}, acc: {accura[epoch, 0]:6.4f} -- Val loss: {losses[epoch, 1]:6.4f}, acc: {accura[epoch, 1]:6.4f}")
+
+# Plot training history
+plt.figure(figsize=(15, 10))
+
+plt.plot(losses[:, 0], label='Loss: Training', linewidth=2)
+plt.plot(losses[:, 1], label='Loss: Validation', linewidth=2)
+plt.plot(accura[:, 0], label='Accu: Training', linewidth=2)
+plt.plot(accura[:, 1], label='Accu: Validation', linewidth=2)
+
+plt.legend(fontsize=18)
+plt.xlabel("Epoch", fontsize=18)
+plt.ylabel("Loss / Accuracy", fontsize=18)
+
+plt.xticks(np.arange(num_epochs))
+plt.tick_params(labelsize=18)
+plt.grid()
+plt.show()
+
+# Evaluate on the held-out test set
+y_hat = model.predict(x_test)
+test_loss, test_acc = model.evaluate(x_test, y_test)
+print("Test loss: {:6.4f}, acc: {:6.4f}".format(test_loss, test_acc))
\ No newline at end of file
@@ -6,7 +6,13 @@ from keras import Sequential
 from keras.layers import Dense, Dropout, Conv2D, MaxPool1D, MaxPool2D, Flatten, BatchNormalization
 from keras.regularizers import l1_l2
 
-# Load MNIST data set
+# Load Human Genome Annotation (HGA) data set
+# GRCh38.92
+# From https://www.kaggle.com/alfrandom/human-genome-annotation
+
+hga_csv =
+
+
 
 (x_train, y_train), (x_test, y_test) = mnist.load_data()
 
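The hunk above leaves `hga_csv =` without a right-hand side. As a minimal sketch only, not part of the commit, the load might look like the following, assuming pandas and a locally downloaded copy of the Kaggle CSV; the file name and columns are assumptions based on GTF conventions:

import pandas as pd

# Hypothetical file name -- the commit does not say where the CSV lives.
hga_csv = pd.read_csv("data-sets/human-genome-annotation/Homo_sapiens.GRCh38.92.csv")

# Columns assumed from GTF conventions: seqname, source, feature, start, end, ...
print(hga_csv.shape)
print(hga_csv.head())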
data-sets/human-genome-annotation/README.txt
0 → 100644
+# Get source data set by combining the Human Genome Annotation data set (csv)
+# with FASTA files to obtain the sequences corresponding to objects in the
+# human genome, using the "start" and "end" columns from human-genome-annotation.
+
+# Input files:
+# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+
+# Output tab-separated format:
+# Start End Sequence Feature
+
+import argparse
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
+    parser.add_argument('--fastaPath', dest='fastaPath',
+                        help='path to the FASTA files with chromosome sequences')
+
+    args = parser.parse_args()
+    print(args.fastaPath)
+
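The README above describes the combination step, but the script so far only parses its arguments. A minimal sketch of that step, assuming pandas and Biopython, GTF-style 1-based end-inclusive start/end coordinates, and placeholder file and column names (none of this is in the commit):

import pandas as pd
from Bio import SeqIO

annotations = pd.read_csv("human-genome-annotation.csv")  # hypothetical name

# Index chromosome records by id (e.g. "1", "2", ..., "X", "Y", "MT")
chromosomes = SeqIO.to_dict(SeqIO.parse("Homo_sapiens.GRCh38.dna.fa", "fasta"))

with open("sequences.tsv", "w") as out:
    out.write("Start\tEnd\tSequence\tFeature\n")
    for row in annotations.itertuples():
        chrom = chromosomes[str(row.seqname)].seq
        # GTF start/end are 1-based and end-inclusive; Python slices are
        # 0-based and end-exclusive, hence start - 1.
        seq = str(chrom[row.start - 1:row.end])
        out.write(f"{row.start}\t{row.end}\t{seq}\t{row.feature}\n")

For the full genome, an indexed reader such as Bio.SeqIO.index or pyfaidx would avoid holding every chromosome in memory at once.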