Showing 3 changed files with 294 additions and 1 deletion
data-sets/get-hga-sequences-py27.py
0 → 100644
+# Get sequences by combining the Human Genome Annotation data set (csv)
+# with FASTA files to obtain the sequence corresponding to each annotated object
+# in the human genome, using the "start" and "end" columns from human-genome-annotation
+
+# Install BioPython: conda install -c conda-forge biopython
+
+# Input files:
+# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+
+# Output tab-separated format (matching the header written below):
+# seqname	start	end	length	sequence	label
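+# For example (illustrative row, sequence elided; coordinates are those of the
+# first annotated gene on chromosome 1 in the Ensembl annotation):
+# 1	11869	14409	2541	<2541-base sequence>	other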
+
+# Run:
+# c:\Anaconda3\python get-hga-data-set.py
+#   --feature gene
+#   --outputFile hga-sequences-toy.txt
+#   --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
+#   --hgaFile some-rows-example-human-genome-annotation.csv
+#   --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
+#   --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
+# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences-toy.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
+
+# python3 get-hga-sequences.py
+#   --feature gene
+#   --outputFile hga-sequences-toy.txt
+#   --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+#   --hgaFile Homo_sapiens.GRCh38.92.csv
+#   --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+#   --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+
+import argparse
+# from Bio import SeqIO
+import csv
+import os
+from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+
+# Diagnostic helper: count the records in a FASTA file and report their total length.
+def get_total_len(filename):
+    count = 0
+    total_len = 0
+    with open(filename) as in_handle:
+        for title, seq in SimpleFastaParser(in_handle):
+            count += 1
+            total_len += len(seq)
+    retval = "{} records with total sequence length {}".format(count, total_len)
+    return retval
+
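+# For instance, with a standard Ensembl per-chromosome FASTA (one record per
+# file), get_total_len("Homo_sapiens.GRCh38.dna.chromosome.21.fa") returns
+# something like "1 records with total sequence length 46709983".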
+
+def get_sequence(filename, start, end):
+    # Return the subsequence [start, end] from the single record in a
+    # chromosome FASTA file. The annotation coordinates follow the Ensembl
+    # convention (1-based, inclusive), so they are shifted by one for
+    # Python's 0-based, end-exclusive slicing.
+    ret_sequence = ""
+    with open(filename) as in_handle:
+        for title, seq in SimpleFastaParser(in_handle):
+            ret_sequence = seq[start - 1:end]
+    return ret_sequence
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
+    parser.add_argument('--fastaPath', dest='fastaPath',
+                        help='Path for FASTA files')
+    parser.add_argument('--hgaPath', dest='hgaPath',
+                        help='Path for Human Genome Annotation file')
+    parser.add_argument('--hgaFile', dest='hgaFile',
+                        help='Human Genome Annotation file')
+    parser.add_argument('--outputPath', dest='outputPath',
+                        help='Output path')
+    parser.add_argument('--outputFile', dest='outputFile',
+                        help='Output file')
+    parser.add_argument('--feature', dest='feature',
+                        help='Feature (gene, exon)')
+
+    args = parser.parse_args()
+
+    list_rows = []
+    i = 0
+    length = 0
+    length_total_exon = 0
+    length_total_utr = 0
+    total_exon = 0
+    total_utr = 0
+    # Read HGA csv file
+    with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            # print(row)
+            filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
+            # Use only the "start" and "end" columns to extract the subsequence
+            sequence = get_sequence(filename, int(row['start']), int(row['end']))
+            # Features in HGA:
+            # exon
+            # feature
+            # five_prime_utr
+            # gene
+            # Selenocysteine
+            # start_codon
+            # stop_codon
+            # three_prime_utr
+            length = int(row['end']) - int(row['start']) + 1
+            if row['feature'] == "exon":
+                label = row['feature']
+                length_total_exon += length
+                total_exon += 1
+            elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
+                label = "utr"
+                length_total_utr += length
+                total_utr += 1
+            else:
+                label = "other"
+            new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
+            list_rows.append(new_row)
+            i += 1
+            if (i % 100) == 0:
+                print("{} rows processed.".format(i))
+            # Process at most 10,000 rows
+            if i == 10000:
+                break
117 | + print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr)) | ||
+    with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
+        oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
+        for elem in list_rows:
+            oFile.write(elem)
+
data-sets/get-hga-training-test-py27.py
0 → 100644
+# Get training and test data sets for deep learning from the sequence data set
+# obtained from the FASTA and HGA data sets (see script get-hga-sequences.py)
+
+# Input tab-separated format:
+# Sequences: hga-sequences-toy.txt
+
+# Output one-hot encoding format:
+# Each sequence encoded as a one-hot matrix, with one row per base and one
+# column per nucleotide
+
+# Run:
+# python3 get-hga-training-test.py
+#   --inputFile hga-sequences-toy.txt
+#   --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+#   --outputTraining hga-sequences-training.txt
+#   --outputTest hga-sequences-test.txt
+#   --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# python get-hga-training-test.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+
+import argparse
+import pandas as pd
+import os
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+import numpy as np
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
+from tensorflow.keras.models import Sequential
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+import itertools
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
+    parser.add_argument('--inputFile', dest='inputFile',
+                        help='Input file')
+    parser.add_argument('--inputPath', dest='inputPath',
+                        help='Input path')
+    parser.add_argument('--outputTraining', dest='outputTraining',
+                        help='Output training file')
+    parser.add_argument('--outputValidation', dest='outputValidation',
+                        help='Output validation file')
+    parser.add_argument('--outputTest', dest='outputTest',
+                        help='Output test file')
+    parser.add_argument('--outputPath', dest='outputPath',
+                        help='Output path for training, validation, and testing')
+
+    args = parser.parse_args()
+
+    # One-hot encoding approach taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
+    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
+    integer_encoder = LabelEncoder()
+    # The OneHotEncoder converts an array of integers to a sparse matrix where
+    # each row corresponds to one possible value of each feature.
+    one_hot_encoder = OneHotEncoder(categories='auto')
+    input_features = []
+    sequences = []
+
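+    # Illustrative sketch (not executed by this script): for the toy sequence
+    # "ACGT", LabelEncoder maps the bases alphabetically to integers, and
+    # OneHotEncoder expands each integer into a 4-dimensional indicator vector:
+    #   integer_encoder.fit_transform(list("ACGT"))  ->  [0, 1, 2, 3]
+    #   one_hot_encoder.fit_transform([[0], [1], [2], [3]]).toarray()  ->
+    #   [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
+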
+    # Read file with sequences
+    with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
+        df = pd.read_csv(tabfile, delimiter='\t')
+        print("df: {}".format(df))
+        sequences = df['sequence']
+        labels = df['label']
+
+    max_exon_length = 0
+    max_utr_length = 0
+    # Scan sequences for the maximum exon and UTR lengths
+    # (the one-hot encoding below is temporarily disabled)
+    for sequence, label in zip(sequences, labels):
+        if label == "exon":
+            if len(sequence) > max_exon_length:
+                max_exon_length = len(sequence)
+        elif label == "utr":
+            if len(sequence) > max_utr_length:
+                max_utr_length = len(sequence)
+        '''
+        integer_encoded = integer_encoder.fit_transform(list(sequence))
+        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
+        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
+        input_features.append(one_hot_encoded.toarray())
+        '''
+
+    print("Max exon length: {}".format(max_exon_length))
+    print("Max utr length: {}".format(max_utr_length))
+
+    # Early exit while exploring sequence lengths; the code below is not reached yet.
+    exit()
+
+    # Print first sequence and one-hot encoding
+    np.set_printoptions(threshold=40)
+    input_features = np.stack(input_features)
+    print("Example sequence\n-----------------------")
+    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
+    print('One hot encoding of Sequence #1:\n', input_features[0].T)
+
+    # One-hot encoding of labels
+    one_hot_encoder = OneHotEncoder(categories='auto')
+    labels = np.array(labels).reshape(-1, 1)
+    input_labels = one_hot_encoder.fit_transform(labels).toarray()
+
+    # Print labels and one-hot encoding
+    print('Labels:\n', labels.T)
+    print('One-hot encoded labels:\n', input_labels.T)
+
+    # Split the one-hot encoded data into training and test sets
+    train_features, test_features, train_labels, test_labels = train_test_split(
+        input_features, input_labels, test_size=0.25, random_state=42)
+
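+    # Note: model.fit below holds out a further 25% of the training portion for
+    # validation via validation_split, so the overall split is roughly
+    # 56% train / 19% validation / 25% test.
+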
+    # Model definition
+    model = Sequential()
+    model.add(Conv1D(filters=32, kernel_size=12,
+                     input_shape=(train_features.shape[1], 4)))
+    model.add(MaxPooling1D(pool_size=4))
+    model.add(Flatten())
+    model.add(Dense(16, activation='relu'))
+    model.add(Dense(2, activation='softmax'))
+
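+    # Shape sketch (assuming one-hot inputs of length L and depth 4):
+    #   Conv1D(32, kernel_size=12), no padding: (L, 4) -> (L - 11, 32)
+    #   MaxPooling1D(4):                        -> ((L - 11) // 4, 32)
+    #   Flatten -> Dense(16, relu) -> Dense(2, softmax) class probabilities
+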
+    model.compile(loss='binary_crossentropy', optimizer='adam',
+                  metrics=['binary_accuracy'])
+    model.summary()
+
+    # Model training and validation
+    history = model.fit(train_features, train_labels,
+                        epochs=50, verbose=0, validation_split=0.25)
+
+    # Plot training-validation loss
+    plt.figure()
+    plt.plot(history.history['loss'])
+    plt.plot(history.history['val_loss'])
+    plt.title('model loss')
+    plt.ylabel('loss')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'validation'])
+    # plt.show()
+    plt.savefig('training-validation-loss.png')
+
+    # Plot training-validation accuracy
+    plt.figure()
+    plt.plot(history.history['binary_accuracy'])
+    plt.plot(history.history['val_binary_accuracy'])
+    plt.title('model accuracy')
+    plt.ylabel('accuracy')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'validation'])
+    # plt.show()
+    plt.savefig('training-validation-binary-accuracy.png')
+
+    # Predict on the test data set
+    predicted_labels = model.predict(np.stack(test_features))
+    # Print confusion matrix
+    cm = confusion_matrix(np.argmax(test_labels, axis=1),
+                          np.argmax(predicted_labels, axis=1))
+    print('Confusion matrix:\n', cm)
+    # Normalize each row (true class) to proportions
+    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
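+    # For example, hypothetical counts cm = [[50, 10], [5, 35]] row-normalize to
+    # [[0.83, 0.17], [0.125, 0.875]]: each entry divided by its row total.
+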
+    # Plot confusion matrix
+    plt.imshow(cm, cmap=plt.cm.Blues)
+    plt.title('Normalized confusion matrix')
+    plt.colorbar()
+    # Rows of cm are true labels, columns are predicted labels
+    plt.xlabel('Predicted label')
+    plt.ylabel('True label')
+    plt.xticks([0, 1])
+    plt.yticks([0, 1])
+    plt.grid(False)
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], '.2f'),
+                 horizontalalignment='center',
+                 color='white' if cm[i, j] > 0.5 else 'black')
+    plt.savefig('training-validation-confusion-matrix.png')
+
@@ -14,7 +14,7 @@
 # --outputTraining hga-sequences-training.txt
 # --outputTest hga-sequences-test.txt
 # --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
-# python3 get-hga-training-test.py --inputFile hga-sequences-toy.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# python get-hga-training-test.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 
 import argparse
 import pandas as pd