Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
8 # Each sequence as a one-hot encoding WHAT array or matrix 8 # Each sequence as a one-hot encoding WHAT array or matrix
9 9
10 # Run: 10 # Run:
11 -# python3 get-hga-training-test-py27.py 11 +# python3 get-hga-training-test-py27-v1.py
12 # --inputFile hga-sequences-toy.txt 12 # --inputFile hga-sequences-toy.txt
13 # --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 13 # --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
14 # --outputTraining hga-sequences-training.txt 14 # --outputTraining hga-sequences-training.txt
...@@ -18,13 +18,13 @@ ...@@ -18,13 +18,13 @@
18 18
19 # LAVIS 19 # LAVIS
20 # qlogin 20 # qlogin
21 -# python get-hga-training-test-py27.py 21 +# python get-hga-training-test-py27-v1.py
22 # --inputFile hga-sequences-1000.txt 22 # --inputFile hga-sequences-1000.txt
23 # --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 23 # --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
24 # --outputTraining hga-sequences-training.txt 24 # --outputTraining hga-sequences-training.txt
25 # --outputTest hga-sequences-test.txt 25 # --outputTest hga-sequences-test.txt
26 # --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 26 # --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
27 -# python get-hga-training-test-py27.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 27 +# python get-hga-training-test-py27-v1.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
28 28
29 import argparse 29 import argparse
30 import pandas as pd 30 import pandas as pd
......
1 +# Get training and test data set for deep learning from sequence data set
2 +# obtained from FASTA and HGA data sets (see script get-hga-sequences-py3.py)
3 +
4 +# Input tab-separated format:
5 +# Sequences: hga-sequences-toy.txt
6 +
7 +# Output one-hot encoding format:
8 +# Each sequence as a one-hot encoding WHAT array or matrix
9 +
10 +# Run:
11 +# python3 get-hga-training-test-py27-v1.py
12 +# --inputFile hga-sequences-toy.txt
13 +# --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
14 +# --outputTraining hga-sequences-training.txt
15 +# --outputTest hga-sequences-test.txt
16 +# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
17 +# python get-hga-training-test-py3.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
18 +
19 +# LAVIS
20 +# qlogin
21 +# python get-hga-training-test-py27-v1.py
22 +# --inputFile hga-sequences-1000.txt
23 +# --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
24 +# --outputTraining hga-sequences-training.txt
25 +# --outputTest hga-sequences-test.txt
26 +# --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
27 +# python get-hga-training-test-py27-v1.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
28 +
29 +import argparse
30 +import pandas as pd
31 +import os
32 +from sklearn.preprocessing import LabelEncoder, OneHotEncoder
33 +import numpy as np
34 +from sklearn.model_selection import train_test_split
35 +from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
36 +from tensorflow.keras.models import Sequential
37 +import matplotlib.pyplot as plt
38 +from sklearn.metrics import confusion_matrix
39 +import itertools
40 +
41 +if __name__ == "__main__":
42 + parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
43 + parser.add_argument('--inputFile', dest='inputFile',
44 + help='Input file')
45 + parser.add_argument('--inputPath', dest='inputPath',
46 + help='Input path')
47 + parser.add_argument('--outputTraining', dest='outputTraining',
48 + help='Output training file')
49 + parser.add_argument('--outputValidation', dest='outputValidation',
50 + help='Output training file')
51 + parser.add_argument('--outputTest', dest='outputTest',
52 + help='Output test file')
53 + parser.add_argument('--outputPath', dest='outputPath',
54 + help='Output path for training, validation, and testing')
55 +
56 + args = parser.parse_args()
57 +
58 + # To one-hot encoding taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
59 + # The LabelEncoder encodes a sequence of bases as a sequence of integers.
60 + integer_encoder = LabelEncoder()
61 + # The OneHotEncoder converts an array of integers to a sparse matrix where
62 + # each row corresponds to one possible value of each feature.
63 + one_hot_encoder = OneHotEncoder(categories='auto')
64 + input_features = []
65 +
66 + # Read file with sequences
67 + with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile:
68 + df = pd.read_csv(tabfile, delimiter='\t')
69 + print("All rows in df: {}".format(len(df.index)))
70 + df_filtered = df.loc[(df['label'] == "exon") | (df['label'] == "utr")]
71 + print("Only exon and utr rows in df: {}".format(len(df_filtered.index)))
72 + # print("df: {}".format(df))
73 + sequences = df_filtered['sequence']
74 + labels = df_filtered['label']
75 +
76 + '''
77 + max_exon_length = 0
78 + max_utr_length = 0
79 + # Getting the max length of sequences
80 + for sequence, label in zip(sequences, labels):
81 + if label == "exon":
82 + if len(sequence) > max_exon_length:
83 + max_exon_length = len(sequence)
84 + elif label == "utr":
85 + if len(sequence) > max_utr_length:
86 + max_utr_length = len(sequence)
87 + print("Max exon length: {}".format(max_exon_length))
88 + print("Max utr length: {}".format(max_utr_length))
89 +
90 + if max_exon_length > max_utr_length:
91 + max_length = max_exon_length
92 + else:
93 + max_length = max_utr_length
94 + print("Max length: {}".format(max_length))
95 +
96 + # Fill sequence with X char to get max length
97 + # One-hot-encoding of sequences
98 + for sequence, label in zip(sequences, labels):
99 + if len(sequence) < max_length:
100 + # print("sequence: {}".format(sequence))
101 + # print("Le falta: {}".format(max_length - len(sequence)))
102 + sequence_adjust = sequence.ljust(max_length, 'X') + 'ACGTX'
103 + # print("sequence_adjust: {}".format(sequence_adjust))
104 + else:
105 + sequence_adjust = sequence + 'ACGTX'
106 +
107 + '''
108 + # One-hot-encoding of sequences
109 + for sequence, label in zip(sequences, labels):
110 + sequence_adjust = sequence + 'ACGTX'
111 + print("Length sequence_adjust: {}".format(len(sequence_adjust)))
112 + integer_encoded = integer_encoder.fit_transform(list(sequence_adjust))
113 + print("integer_encoded.classes_: {}".format(integer_encoder.classes_))
114 + integer_encoded = np.array(integer_encoded).reshape(-1, 1)
115 + # print("integer_encoded: {}".format(integer_encoded))
116 + one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
117 + print(" shape: {}".format(one_hot_encoded.toarray().shape))
118 + input_features.append(one_hot_encoded.toarray())
119 +
120 + # Print first sequence and one-hot-encoding
121 + np.set_printoptions(threshold=40)
122 + input_features = np.stack(input_features)
123 + print("Example sequence\n-----------------------")
124 + print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
125 + print('One hot encoding of Sequence #1:\n', input_features[0].T)
126 +
127 + # One-hot-encoding of labels
128 + one_hot_encoder = OneHotEncoder(categories='auto')
129 + labels = np.array(labels).reshape(-1, 1)
130 + input_labels = one_hot_encoder.fit_transform(labels).toarray()
131 +
132 + # Print labels and one-hot-encoding
133 + print('Labels:\n', labels.T)
134 + print('One-hot encoded labels:\n', input_labels.T)
135 +
136 + # Split one-hot-encoding data into training, and test data sets
137 + train_features, test_features, train_labels, test_labels = train_test_split(
138 + input_features, input_labels, test_size=0.25, random_state=42)
139 +
140 + # Model definition
141 + model = Sequential()
142 + model.add(Conv1D(filters=32, kernel_size=12,
143 + input_shape=(train_features.shape[1], 4)))
144 + model.add(MaxPooling1D(pool_size=4))
145 + model.add(Flatten())
146 + model.add(Dense(16, activation='relu'))
147 + model.add(Dense(2, activation='softmax'))
148 +
149 + model.compile(loss='binary_crossentropy', optimizer='adam',
150 + metrics=['binary_accuracy'])
151 + model.summary()
152 +
153 + # Model training and validation
154 + history = model.fit(train_features, train_labels,
155 + epochs=50, verbose=0, validation_split=0.25)
156 +
157 + # Plot training-validation loss
158 + plt.figure()
159 + plt.plot(history.history['loss'])
160 + plt.plot(history.history['val_loss'])
161 + plt.title('model loss')
162 + plt.ylabel('loss')
163 + plt.xlabel('epoch')
164 + plt.legend(['train', 'validation'])
165 + # plt.show()
166 + plt.savefig('training-validation-loss.png')
167 +
168 + # Plot training-validation accuracy
169 + plt.figure()
170 + plt.plot(history.history['binary_accuracy'])
171 + plt.plot(history.history['val_binary_accuracy'])
172 + plt.title('model accuracy')
173 + plt.ylabel('accuracy')
174 + plt.xlabel('epoch')
175 + plt.legend(['train', 'validation'])
176 + # plt.show()
177 + plt.savefig('training-validation-binary-accuracy.png')
178 +
179 + # Predict with rest data set
180 + predicted_labels = model.predict(np.stack(test_features))
181 + # Print confusion matrix
182 + cm = confusion_matrix(np.argmax(test_labels, axis=1),
183 + np.argmax(predicted_labels, axis=1))
184 + print('Confusion matrix:\n', cm)
185 + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
186 +
187 + # Plot confusion matrix
188 + plt.imshow(cm, cmap=plt.cm.Blues)
189 + plt.title('Normalized confusion matrix')
190 + plt.colorbar()
191 + plt.xlabel('True label')
192 + plt.ylabel('Predicted label')
193 + plt.xticks([0, 1]);
194 + plt.yticks([0, 1])
195 + plt.grid('off')
196 + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
197 + plt.text(j, i, format(cm[i, j], '.2f'),
198 + horizontalalignment='center',
199 + color='white' if cm[i, j] > 0.5 else 'black')
200 + plt.savefig('training-validation-confusion-matrix.png')
201 +
202 +
203 +
204 +
205 +