get-hga-training-test-py27.py 8.14 KB

Raw Blame History Permalink

# Get training and test data set for deep learning from sequence data set
# obtained from FASTA and HGA data sets (see script get-hga-sequences-py3.py)

# Input tab-separated format:
# Sequences: hga-sequences-toy.txt

# Output one-hot encoding format:
# Each sequence as a one-hot encoding WHAT array or matrix

# Run:
# python3 get-hga-training-test-py27.py
# --inputFile hga-sequences-toy.txt
# --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py3.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation

# LAVIS
# qlogin
# python get-hga-training-test-py27.py
# --inputFile hga-sequences-1000.txt
# --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py27.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation

import argparse
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile',
                        help='Input file')
    parser.add_argument('--inputPath', dest='inputPath',
                        help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining',
                        help='Output training file')
    parser.add_argument('--outputValidation', dest='outputValidation',
                        help='Output training file')
    parser.add_argument('--outputTest', dest='outputTest',
                        help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath',
                        help='Output path for training, validation, and testing')

    args = parser.parse_args()

    # To one-hot encoding taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
    integer_encoder = LabelEncoder()
    # The OneHotEncoder converts an array of integers to a sparse matrix where
    # each row corresponds to one possible value of each feature.
    one_hot_encoder = OneHotEncoder(categories='auto')
    input_features = []

    # Read file with sequences
    with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
        print("All rows in df: {}".format(len(df.index)))
        df_filtered = df.loc[(df['label'] == "exon") | (df['label'] == "utr")]
        print("Only exon and utr rows in df: {}".format(len(df_filtered.index)))
        # print("df: {}".format(df))
        sequences = df_filtered['sequence']
        labels = df_filtered['label']

    max_exon_length = 0
    max_utr_length = 0
    # Getting the max length of sequences
    for sequence, label in zip(sequences, labels):
        if label == "exon":
            if len(sequence) > max_exon_length:
                max_exon_length = len(sequence)
        elif label == "utr":
            if len(sequence) > max_utr_length:
                max_utr_length = len(sequence)
    print("Max exon length: {}".format(max_exon_length))
    print("Max utr length: {}".format(max_utr_length))

    if max_exon_length > max_utr_length:
        max_length = max_exon_length
    else:
        max_length = max_utr_length
    print("Max length: {}".format(max_length))

    # Fill sequence with X char to get max length
    # One-hot-encoding of sequences
    for sequence, label in zip(sequences, labels):
        if len(sequence) < max_length:
            sequence_adjust = sequence.ljust(max_length - len(sequence), 'X')
        print("Length sequence_adjust: {}".format(len(sequence_adjust)))
        integer_encoded = integer_encoder.fit_transform(list(sequence_adjust))
        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
        input_features.append(one_hot_encoded.toarray())

    # Print first sequence and one-hot-encoding
    np.set_printoptions(threshold=40)
    input_features = np.stack(input_features)
    print("Example sequence\n-----------------------")
    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
    print('One hot encoding of Sequence #1:\n', input_features[0].T)

    # One-hot-encoding of labels
    one_hot_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = one_hot_encoder.fit_transform(labels).toarray()

    # Print labels and one-hot-encoding
    print('Labels:\n', labels.T)
    print('One-hot encoded labels:\n', input_labels.T)

    # Split one-hot-encoding data into training, and test data sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)

    # Model definition
    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=12,
                     input_shape=(train_features.shape[1], 4)))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Flatten())
    model.add(Dense(16, activation='relu'))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['binary_accuracy'])
    model.summary()

    # Model training and validation
    history = model.fit(train_features, train_labels,
                        epochs=50, verbose=0, validation_split=0.25)

    # Plot training-validation loss
    plt.figure()
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'])
    # plt.show()
    plt.savefig('training-validation-loss.png')

    # Plot training-validation accuracy
    plt.figure()
    plt.plot(history.history['binary_accuracy'])
    plt.plot(history.history['val_binary_accuracy'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'])
    # plt.show()
    plt.savefig('training-validation-binary-accuracy.png')

    # Predict with rest data set
    predicted_labels = model.predict(np.stack(test_features))
    # Print confusion matrix
    cm = confusion_matrix(np.argmax(test_labels, axis=1),
                          np.argmax(predicted_labels, axis=1))
    print('Confusion matrix:\n', cm)
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    # Plot confusion matrix
    plt.imshow(cm, cmap=plt.cm.Blues)
    plt.title('Normalized confusion matrix')
    plt.colorbar()
    plt.xlabel('True label')
    plt.ylabel('Predicted label')
    plt.xticks([0, 1]);
    plt.yticks([0, 1])
    plt.grid('off')
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > 0.5 else 'black')
    plt.savefig('training-validation-confusion-matrix.png')