get-hga-training-test.py
3.75 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# Get training and test data set for deep learning from sequence data set
# obtained from FASTA and HGA data sets (see script get-hga-sequences.py)
# Input tab-separated format:
# Sequences: hga-sequences.txt
# Output one-hot encoding format:
# Each sequence is one-hot encoded as a matrix (one row per base, one column per symbol)
# Run:
# c:\Anaconda3\python get-hga-training-test.py
# --inputFile hga-sequences.txt
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
import argparse
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
def _write_examples(path, features, labels):
    """Write one example per line to *path*.

    Each line holds the flattened one-hot matrix of a sequence (row-major,
    space-separated 0/1 values), a tab, and the one-hot label vector
    (space-separated 0/1 values).

    NOTE(review): the original script never defined the on-disk format
    (it iterated over an undefined ``list_rows``); this layout is a
    simple, loss-free choice -- confirm it suits the downstream reader.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        for feature_matrix, label_vector in zip(features, labels):
            feature_text = " ".join(str(int(v)) for v in np.ravel(feature_matrix))
            label_text = " ".join(str(int(v)) for v in np.ravel(label_vector))
            out_file.write(feature_text + "\t" + label_text + "\n")


if __name__ == "__main__":
    # Command-line interface. Path-building options are required so that a
    # missing one is reported by argparse instead of failing later inside
    # os.path.join(None, ...).
    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile', required=True,
                        help='Input file')
    parser.add_argument('--inputPath', dest='inputPath', required=True,
                        help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining', required=True,
                        help='Output training file')
    parser.add_argument('--outputValidation', dest='outputValidation',
                        help='Output validation file')
    parser.add_argument('--outputTest', dest='outputTest', required=True,
                        help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath', required=True,
                        help='Output path for training, validation, and testing')
    args = parser.parse_args()

    # To one-hot encoding taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
    integer_encoder = LabelEncoder()
    # The OneHotEncoder converts an array of integers to a sparse matrix where
    # each row corresponds to one possible value of each feature.
    one_hot_encoder = OneHotEncoder(categories='auto')
    input_features = []

    # Read file with sequences (directory first, then file name).
    with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
    # Plain lists so that positional access (sequences[0]) does not depend
    # on the DataFrame index.
    sequences = df['sequence'].tolist()
    labels = df['label'].tolist()

    if not sequences:
        raise ValueError("Input file contains no sequences")
    # np.stack below needs every per-sequence matrix to have the same shape.
    if len({len(sequence) for sequence in sequences}) != 1:
        raise ValueError("All sequences must have the same length to be stacked")

    # One-hot-encoding of sequences.
    # Both encoders are fitted ONCE on the alphabet of the whole data set, so
    # every sequence gets the same columns in the same order even when a
    # particular sequence lacks one of the symbols.
    alphabet = sorted(set("".join(sequences)))
    integer_encoder.fit(alphabet)
    one_hot_encoder.fit(np.arange(len(alphabet)).reshape(-1, 1))
    for sequence in sequences:
        integer_encoded = integer_encoder.transform(list(sequence))
        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        one_hot_encoded = one_hot_encoder.transform(integer_encoded)
        input_features.append(one_hot_encoded.toarray())

    # Print first sequence and one-hot-encoding
    np.set_printoptions(threshold=40)
    input_features = np.stack(input_features)
    print("Example sequence\n-----------------------")
    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
    print('One hot encoding of Sequence #1:\n', input_features[0].T)

    # One-hot-encoding of labels (separate encoder: its categories are the
    # label values, not the sequence alphabet).
    label_one_hot_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = label_one_hot_encoder.fit_transform(labels).toarray()

    # Print labels and one-hot-encoding
    print('Labels:\n', labels.T)
    print('One-hot encoded labels:\n', input_labels.T)

    # Split one-hot-encoding data into training, and test data sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)

    # Only a training/test split is produced; the option is still accepted
    # so existing command lines keep working.
    if args.outputValidation:
        print('Note: --outputValidation was given, but no validation file is written by this script.')

    _write_examples(os.path.join(args.outputPath, args.outputTraining),
                    train_features, train_labels)
    _write_examples(os.path.join(args.outputPath, args.outputTest),
                    test_features, test_labels)