Deep Learning Workshop

Carlos-Francisco Méndez-Cruz
Commit 1abd40e73f7051e4fec747dae27559cc7f2e8354 1abd40e7 1 parent c056ae1d
Showing 2 changed files with 208 additions and 3 deletions
data-sets/get-hga-training-test-py27.py → data-sets/get-hga-training-test-py27-v1.py
data-sets/get-hga-training-test-py27-v2.py
--- a/data-sets/get-hga-training-test-py27.py → data-sets/get-hga-training-test-py27-v1.py
View file @1abd40e
+++ b/data-sets/get-hga-training-test-py27.py → data-sets/get-hga-training-test-py27-v1.py
View file @1abd40e
@@ -8,7 +8,7 @@
 # Each sequence as a one-hot encoding WHAT array or matrix
 
 # Run:
- # python3 get-hga-training-test-py27.py
+ # python3 get-hga-training-test-py27-v1.py
 # --inputFile hga-sequences-toy.txt
 # --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 # --outputTraining hga-sequences-training.txt
@@ -18,13 +18,13 @@
 
 # LAVIS
 # qlogin
- # python get-hga-training-test-py27.py
+ # python get-hga-training-test-py27-v1.py
 # --inputFile hga-sequences-1000.txt
 # --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 # --outputTraining hga-sequences-training.txt
 # --outputTest hga-sequences-test.txt
 # --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
- # python get-hga-training-test-py27.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+ # python get-hga-training-test-py27-v1.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 
 import argparse
 import pandas as pd
--- a/data-sets/get-hga-training-test-py27-v2.py 0 → 100644
View file @1abd40e
+++ b/data-sets/get-hga-training-test-py27-v2.py 0 → 100644
View file @1abd40e
+ # Get training and test data set for deep learning from sequence data set
+ # obtained from FASTA and HGA data sets (see script get-hga-sequences-py3.py)
+ 
+ # Input tab-separated format:
+ # Sequences: hga-sequences-toy.txt
+ 
+ # Output one-hot encoding format:
+ # Each sequence as a one-hot encoding WHAT array or matrix
+ 
+ # Run:
+ # python3 get-hga-training-test-py27-v1.py
+ # --inputFile hga-sequences-toy.txt
+ # --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+ # --outputTraining hga-sequences-training.txt
+ # --outputTest hga-sequences-test.txt
+ # --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+ # python get-hga-training-test-py3.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+ 
+ # LAVIS
+ # qlogin
+ # python get-hga-training-test-py27-v1.py
+ # --inputFile hga-sequences-1000.txt
+ # --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+ # --outputTraining hga-sequences-training.txt
+ # --outputTest hga-sequences-test.txt
+ # --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+ # python get-hga-training-test-py27-v1.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+ 
+ import argparse
+ import pandas as pd
+ import os
+ from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+ import numpy as np
+ from sklearn.model_selection import train_test_split
+ from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
+ from tensorflow.keras.models import Sequential
+ import matplotlib.pyplot as plt
+ from sklearn.metrics import confusion_matrix
+ import itertools
+ 
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
+     parser.add_argument('--inputFile', dest='inputFile',
+                         help='Input file')
+     parser.add_argument('--inputPath', dest='inputPath',
+                         help='Input path')
+     parser.add_argument('--outputTraining', dest='outputTraining',
+                         help='Output training file')
+     parser.add_argument('--outputValidation', dest='outputValidation',
+                         help='Output training file')
+     parser.add_argument('--outputTest', dest='outputTest',
+                         help='Output test file')
+     parser.add_argument('--outputPath', dest='outputPath',
+                         help='Output path for training, validation, and testing')
+ 
+     args = parser.parse_args()
+ 
+     # To one-hot encoding taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
+     # The LabelEncoder encodes a sequence of bases as a sequence of integers.
+     integer_encoder = LabelEncoder()
+     # The OneHotEncoder converts an array of integers to a sparse matrix where
+     # each row corresponds to one possible value of each feature.
+     one_hot_encoder = OneHotEncoder(categories='auto')
+     input_features = []
+ 
+     # Read file with sequences
+     with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile:
+         df = pd.read_csv(tabfile, delimiter='\t')
+         print("All rows in df: {}".format(len(df.index)))
+         df_filtered = df.loc[(df['label'] == "exon") | (df['label'] == "utr")]
+         print("Only exon and utr rows in df: {}".format(len(df_filtered.index)))
+         # print("df: {}".format(df))
+         sequences = df_filtered['sequence']
+         labels = df_filtered['label']
+ 
+     '''
+     max_exon_length = 0
+     max_utr_length = 0
+     # Getting the max length of sequences
+     for sequence, label in zip(sequences, labels):
+         if label == "exon":
+             if len(sequence) > max_exon_length:
+                 max_exon_length = len(sequence)
+         elif label == "utr":
+             if len(sequence) > max_utr_length:
+                 max_utr_length = len(sequence)
+     print("Max exon length: {}".format(max_exon_length))
+     print("Max utr length: {}".format(max_utr_length))
+ 
+     if max_exon_length > max_utr_length:
+         max_length = max_exon_length
+     else:
+         max_length = max_utr_length
+     print("Max length: {}".format(max_length))
+ 
+     # Fill sequence with X char to get max length
+     # One-hot-encoding of sequences
+     for sequence, label in zip(sequences, labels):
+         if len(sequence) < max_length:
+             # print("sequence: {}".format(sequence))
+             # print("Le falta: {}".format(max_length - len(sequence)))
+             sequence_adjust = sequence.ljust(max_length, 'X') + 'ACGTX'
+             # print("sequence_adjust: {}".format(sequence_adjust))
+         else:
+             sequence_adjust = sequence + 'ACGTX'
+             
+     '''
+     # One-hot-encoding of sequences
+     for sequence, label in zip(sequences, labels):
+         sequence_adjust = sequence + 'ACGTX'
+         print("Length sequence_adjust: {}".format(len(sequence_adjust)))
+         integer_encoded = integer_encoder.fit_transform(list(sequence_adjust))
+         print("integer_encoded.classes_: {}".format(integer_encoder.classes_))
+         integer_encoded = np.array(integer_encoded).reshape(-1, 1)
+         # print("integer_encoded: {}".format(integer_encoded))
+         one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
+         print(" shape: {}".format(one_hot_encoded.toarray().shape))
+         input_features.append(one_hot_encoded.toarray())
+ 
+     # Print first sequence and one-hot-encoding
+     np.set_printoptions(threshold=40)
+     input_features = np.stack(input_features)
+     print("Example sequence\n-----------------------")
+     print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
+     print('One hot encoding of Sequence #1:\n', input_features[0].T)
+ 
+     # One-hot-encoding of labels
+     one_hot_encoder = OneHotEncoder(categories='auto')
+     labels = np.array(labels).reshape(-1, 1)
+     input_labels = one_hot_encoder.fit_transform(labels).toarray()
+ 
+     # Print labels and one-hot-encoding
+     print('Labels:\n', labels.T)
+     print('One-hot encoded labels:\n', input_labels.T)
+ 
+     # Split one-hot-encoding data into training, and test data sets
+     train_features, test_features, train_labels, test_labels = train_test_split(
+         input_features, input_labels, test_size=0.25, random_state=42)
+ 
+     # Model definition
+     model = Sequential()
+     model.add(Conv1D(filters=32, kernel_size=12,
+                      input_shape=(train_features.shape[1], 4)))
+     model.add(MaxPooling1D(pool_size=4))
+     model.add(Flatten())
+     model.add(Dense(16, activation='relu'))
+     model.add(Dense(2, activation='softmax'))
+ 
+     model.compile(loss='binary_crossentropy', optimizer='adam',
+                   metrics=['binary_accuracy'])
+     model.summary()
+ 
+     # Model training and validation
+     history = model.fit(train_features, train_labels,
+                         epochs=50, verbose=0, validation_split=0.25)
+ 
+     # Plot training-validation loss
+     plt.figure()
+     plt.plot(history.history['loss'])
+     plt.plot(history.history['val_loss'])
+     plt.title('model loss')
+     plt.ylabel('loss')
+     plt.xlabel('epoch')
+     plt.legend(['train', 'validation'])
+     # plt.show()
+     plt.savefig('training-validation-loss.png')
+ 
+     # Plot training-validation accuracy
+     plt.figure()
+     plt.plot(history.history['binary_accuracy'])
+     plt.plot(history.history['val_binary_accuracy'])
+     plt.title('model accuracy')
+     plt.ylabel('accuracy')
+     plt.xlabel('epoch')
+     plt.legend(['train', 'validation'])
+     # plt.show()
+     plt.savefig('training-validation-binary-accuracy.png')
+ 
+     # Predict with rest data set
+     predicted_labels = model.predict(np.stack(test_features))
+     # Print confusion matrix
+     cm = confusion_matrix(np.argmax(test_labels, axis=1),
+                           np.argmax(predicted_labels, axis=1))
+     print('Confusion matrix:\n', cm)
+     cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+ 
+     # Plot confusion matrix
+     plt.imshow(cm, cmap=plt.cm.Blues)
+     plt.title('Normalized confusion matrix')
+     plt.colorbar()
+     plt.xlabel('True label')
+     plt.ylabel('Predicted label')
+     plt.xticks([0, 1]);
+     plt.yticks([0, 1])
+     plt.grid('off')
+     for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+         plt.text(j, i, format(cm[i, j], '.2f'),
+                  horizontalalignment='center',
+                  color='white' if cm[i, j] > 0.5 else 'black')
+     plt.savefig('training-validation-confusion-matrix.png')
+ 
+ 
+ 
+ 
+