# get-hga-training-test.py
# Get training and test data set for deep learning from sequence data set
# obtained from FASTA and HGA data sets (see script get-hga-sequences.py)

# Input tab-separated format:
# Sequences: hga-sequences.txt

# Output one-hot encoding format:
# Each sequence is one-hot encoded as a matrix with one row per sequence
# position and one column per distinct base

# Run:
# c:\Anaconda3\python get-hga-training-test.py
# --inputFile hga-sequences.txt
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation

import argparse
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split

def write_dataset(path, features, labels):
    """Write one-hot encoded samples and their labels to a text file.

    One line is written per sample: the flattened one-hot feature matrix as
    space-separated integers, a tab, then the one-hot label vector as
    space-separated integers.

    Args:
        path: Destination file path; an existing file is overwritten.
        features: Sequence of per-sample one-hot arrays.
        labels: Sequence of per-sample one-hot label vectors, aligned with
            ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """
    if len(features) != len(labels):
        raise ValueError("features and labels must have the same length")

    rows = []
    for sample, label in zip(features, labels):
        # The encoders emit 0.0/1.0 floats; store them as compact 0/1 digits.
        feature_text = " ".join(str(int(v)) for v in np.asarray(sample).ravel())
        label_text = " ".join(str(int(v)) for v in np.asarray(label).ravel())
        rows.append(feature_text + "\t" + label_text + "\n")

    with open(path, mode="w", encoding="utf-8") as out_file:
        out_file.writelines(rows)


def main():
    """Read labelled sequences, one-hot encode them, and write train/test sets.

    The input is a tab-separated file with ``sequence`` and ``label`` columns.
    Sequences and labels are one-hot encoded, split 75/25 into training and
    test sets (fixed random seed), and each set is written with
    ``write_dataset``.

    Raises:
        ValueError: If the input file contains no sequences, or if the
            sequences do not all have the same length (``np.stack``).
    """
    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile', required=True,
                        help='Input file')
    parser.add_argument('--inputPath', dest='inputPath', required=True,
                        help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining', required=True,
                        help='Output training file')
    # NOTE(review): accepted for compatibility but not used yet; only a
    # training/test split is produced below.
    parser.add_argument('--outputValidation', dest='outputValidation',
                        help='Output validation file')
    parser.add_argument('--outputTest', dest='outputTest', required=True,
                        help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath', required=True,
                        help='Output path for training, validation, and testing')

    args = parser.parse_args()

    # Read file with sequences (directory first, then file name).
    input_path = os.path.join(args.inputPath, args.inputFile)
    with open(input_path, mode="r", encoding="utf-8") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
    sequences = df['sequence']
    labels = df['label']
    if len(sequences) == 0:
        raise ValueError("No sequences found in " + input_path)

    # To one-hot encoding taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
    # It is fitted once on the alphabet of the whole data set so that every
    # sequence maps each base to the same integer, even when a sequence does
    # not contain all bases.
    alphabet = sorted(set("".join(sequences)))
    integer_encoder = LabelEncoder()
    integer_encoder.fit(alphabet)
    # The OneHotEncoder converts an array of integers to a sparse matrix with
    # one column per category; fixing the categories keeps the number and
    # order of columns identical across sequences.
    one_hot_encoder = OneHotEncoder(categories=[np.arange(len(alphabet))])

    # One-hot-encoding of sequences
    input_features = []
    for sequence in sequences:
        integer_encoded = integer_encoder.transform(list(sequence))
        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
        input_features.append(one_hot_encoded.toarray())

    # Print first sequence and one-hot-encoding
    np.set_printoptions(threshold=40)
    # Stacking requires all sequences to have the same length.
    input_features = np.stack(input_features)
    first_sequence = sequences.iloc[0]
    print("Example sequence\n-----------------------")
    print('DNA Sequence #1:\n', first_sequence[:10], '...', first_sequence[-10:])
    print('One hot encoding of Sequence #1:\n', input_features[0].T)

    # One-hot-encoding of labels
    label_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = label_encoder.fit_transform(labels).toarray()

    # Print labels and one-hot-encoding
    print('Labels:\n', labels.T)
    print('One-hot encoded labels:\n', input_labels.T)

    # Split one-hot-encoding data into training, and test data sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)

    # Write each split to its own file under the output path.
    write_dataset(os.path.join(args.outputPath, args.outputTraining),
                  train_features, train_labels)
    write_dataset(os.path.join(args.outputPath, args.outputTest),
                  test_features, test_labels)


if __name__ == "__main__":
    main()