Showing 2 changed files with 208 additions and 3 deletions
@@ -8,7 +8,7 @@
 # Each sequence as a one-hot encoded array or matrix
 
 # Run:
-# python3 get-hga-training-test-py27.py
+# python3 get-hga-training-test-py27-v1.py
 # --inputFile hga-sequences-toy.txt
 # --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 # --outputTraining hga-sequences-training.txt
@@ -18,13 +18,13 @@
 
 # LAVIS
 # qlogin
-# python get-hga-training-test-py27.py
+# python get-hga-training-test-py27-v1.py
 # --inputFile hga-sequences-1000.txt
 # --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 # --outputTraining hga-sequences-training.txt
 # --outputTest hga-sequences-test.txt
 # --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
-# python get-hga-training-test-py27.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# python get-hga-training-test-py27-v1.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 
 import argparse
 import pandas as pd
data-sets/get-hga-training-test-py27-v2.py (new file, mode 100644)
+# Get training and test data sets for deep learning from the sequence data set
+# obtained from FASTA and HGA data sets (see script get-hga-sequences-py3.py)
+
+# Input tab-separated format:
+# Sequences: hga-sequences-toy.txt
+
+# Output one-hot encoding format:
+# Each sequence as a one-hot encoded array or matrix
+
+# Run:
+# python3 get-hga-training-test-py27-v2.py
+# --inputFile hga-sequences-toy.txt
+# --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# --outputTraining hga-sequences-training.txt
+# --outputTest hga-sequences-test.txt
+# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# python get-hga-training-test-py27-v2.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+
+# LAVIS
+# qlogin
+# python get-hga-training-test-py27-v2.py
+# --inputFile hga-sequences-1000.txt
+# --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# --outputTraining hga-sequences-training.txt
+# --outputTest hga-sequences-test.txt
+# --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# python get-hga-training-test-py27-v2.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+
+import argparse
+import pandas as pd
+import os
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+import numpy as np
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
+from tensorflow.keras.models import Sequential
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+import itertools
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
+    parser.add_argument('--inputFile', dest='inputFile',
+                        help='Input file')
+    parser.add_argument('--inputPath', dest='inputPath',
+                        help='Input path')
+    parser.add_argument('--outputTraining', dest='outputTraining',
+                        help='Output training file')
+    parser.add_argument('--outputValidation', dest='outputValidation',
+                        help='Output validation file')
+    parser.add_argument('--outputTest', dest='outputTest',
+                        help='Output test file')
+    parser.add_argument('--outputPath', dest='outputPath',
+                        help='Output path for training, validation, and testing')
+
+    args = parser.parse_args()
+
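+    # Note: the output file arguments are parsed but not yet used below; the
+    # script currently trains and evaluates in place instead of writing files.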
+    # One-hot encoding adapted from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
+    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
+    integer_encoder = LabelEncoder()
+    # The OneHotEncoder converts an array of integers to a sparse matrix where
+    # each row corresponds to one possible value of each feature.
+    one_hot_encoder = OneHotEncoder(categories='auto')
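+    # For instance, LabelEncoder sorts its classes alphabetically, so
+    # list('ACGT') maps to [0, 1, 2, 3], and one-hot encoding those integers
+    # yields the 4x4 identity matrix (one row per base: A, C, G, T).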
+    input_features = []
+
+    # Read file with sequences
+    with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile:
+        df = pd.read_csv(tabfile, delimiter='\t')
+    print("All rows in df: {}".format(len(df.index)))
+    df_filtered = df.loc[(df['label'] == "exon") | (df['label'] == "utr")]
+    print("Only exon and utr rows in df: {}".format(len(df_filtered.index)))
+    # print("df: {}".format(df))
+    sequences = df_filtered['sequence']
+    labels = df_filtered['label']
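+    # Assumes the tab-separated input provides at least the columns
+    # 'sequence' and 'label'; rows other than exon/utr are discarded above.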
+
+    # Disabled alternative: pad every sequence with X to a common max length
+    '''
+    max_exon_length = 0
+    max_utr_length = 0
+    # Getting the max length of sequences
+    for sequence, label in zip(sequences, labels):
+        if label == "exon":
+            if len(sequence) > max_exon_length:
+                max_exon_length = len(sequence)
+        elif label == "utr":
+            if len(sequence) > max_utr_length:
+                max_utr_length = len(sequence)
+    print("Max exon length: {}".format(max_exon_length))
+    print("Max utr length: {}".format(max_utr_length))
+
+    if max_exon_length > max_utr_length:
+        max_length = max_exon_length
+    else:
+        max_length = max_utr_length
+    print("Max length: {}".format(max_length))
+
+    # Fill sequence with the X char up to max length
+    # One-hot encoding of sequences
+    for sequence, label in zip(sequences, labels):
+        if len(sequence) < max_length:
+            # print("sequence: {}".format(sequence))
+            # print("Chars missing: {}".format(max_length - len(sequence)))
+            sequence_adjust = sequence.ljust(max_length, 'X') + 'ACGTX'
+            # print("sequence_adjust: {}".format(sequence_adjust))
+        else:
+            sequence_adjust = sequence + 'ACGTX'
+
+    '''
+    # One-hot encoding of sequences
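+    # The literal 'ACGTX' is appended to every sequence so that all five
+    # symbols (A, C, G, T and the X pad) occur in each one, giving the
+    # encoders an identical alphabet for every sequence.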
+    for sequence, label in zip(sequences, labels):
+        sequence_adjust = sequence + 'ACGTX'
+        print("Length sequence_adjust: {}".format(len(sequence_adjust)))
+        integer_encoded = integer_encoder.fit_transform(list(sequence_adjust))
+        print("integer_encoder.classes_: {}".format(integer_encoder.classes_))
+        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
+        # print("integer_encoded: {}".format(integer_encoded))
+        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
+        print("shape: {}".format(one_hot_encoded.toarray().shape))
+        input_features.append(one_hot_encoded.toarray())
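+    # Each appended array has shape (len(sequence) + 5, depth), where depth is
+    # 5 if the sequences contain only A/C/G/T; np.stack below also requires
+    # all sequences to share one length.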
+
+    # Print first sequence and its one-hot encoding
+    np.set_printoptions(threshold=40)
+    input_features = np.stack(input_features)
+    print("Example sequence\n-----------------------")
+    # .iloc is needed here: after filtering, label 0 may be absent from the index
+    print('DNA Sequence #1:\n', sequences.iloc[0][:10], '...', sequences.iloc[0][-10:])
+    print('One hot encoding of Sequence #1:\n', input_features[0].T)
+
+    # One-hot encoding of labels
+    one_hot_encoder = OneHotEncoder(categories='auto')
+    labels = np.array(labels).reshape(-1, 1)
+    input_labels = one_hot_encoder.fit_transform(labels).toarray()
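+    # Categories are sorted alphabetically, so column 0 is 'exon' and
+    # column 1 is 'utr' in the encoded labels.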
+
+    # Print labels and one-hot encoding
+    print('Labels:\n', labels.T)
+    print('One-hot encoded labels:\n', input_labels.T)
+
+    # Split one-hot encoded data into training and test data sets
+    train_features, test_features, train_labels, test_labels = train_test_split(
+        input_features, input_labels, test_size=0.25, random_state=42)
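+    # 75% training / 25% test; the fixed random_state makes the split reproducible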
+
+    # Model definition
+    model = Sequential()
+    # The channel count must match the one-hot depth (5 when the sequences
+    # contain only A, C, G, T plus the appended X), so it is read from the
+    # data instead of being hard-coded to 4.
+    model.add(Conv1D(filters=32, kernel_size=12,
+                     input_shape=(train_features.shape[1], train_features.shape[2])))
+    model.add(MaxPooling1D(pool_size=4))
+    model.add(Flatten())
+    model.add(Dense(16, activation='relu'))
+    model.add(Dense(2, activation='softmax'))
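+    # Shape sketch, assuming input length L and 5 channels: Conv1D(32, kernel
+    # size 12) -> (L - 11, 32); MaxPooling1D(4) -> ((L - 11) // 4, 32);
+    # Flatten -> Dense(16, relu) -> Dense(2, softmax) over (exon, utr).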
+
+    model.compile(loss='binary_crossentropy', optimizer='adam',
+                  metrics=['binary_accuracy'])
+    model.summary()
+
+    # Model training and validation
+    history = model.fit(train_features, train_labels,
+                        epochs=50, verbose=0, validation_split=0.25)
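+    # Keras takes the last 25% of the training arrays (selected before
+    # shuffling) as the per-epoch validation data reported as val_* in history.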
+
+    # Plot training-validation loss
+    plt.figure()
+    plt.plot(history.history['loss'])
+    plt.plot(history.history['val_loss'])
+    plt.title('model loss')
+    plt.ylabel('loss')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'validation'])
+    # plt.show()
+    plt.savefig('training-validation-loss.png')
+
+    # Plot training-validation accuracy
+    plt.figure()
+    plt.plot(history.history['binary_accuracy'])
+    plt.plot(history.history['val_binary_accuracy'])
+    plt.title('model accuracy')
+    plt.ylabel('accuracy')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'validation'])
+    # plt.show()
+    plt.savefig('training-validation-binary-accuracy.png')
+
+    # Predict on the test data set
+    predicted_labels = model.predict(np.stack(test_features))
+    # Confusion matrix: rows are true classes, columns are predicted classes
+    cm = confusion_matrix(np.argmax(test_labels, axis=1),
+                          np.argmax(predicted_labels, axis=1))
+    print('Confusion matrix:\n', cm)
+    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
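+    # Row-normalized: each row is divided by its sum, so the diagonal holds
+    # per-class recall.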
+
+    # Plot normalized confusion matrix (new figure so it does not draw over
+    # the accuracy plot)
+    plt.figure()
+    plt.imshow(cm, cmap=plt.cm.Blues)
+    plt.title('Normalized confusion matrix')
+    plt.colorbar()
+    plt.xlabel('Predicted label')
+    plt.ylabel('True label')
+    plt.xticks([0, 1])
+    plt.yticks([0, 1])
+    plt.grid(False)
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], '.2f'),
+                 horizontalalignment='center',
+                 color='white' if cm[i, j] > 0.5 else 'black')
+    plt.savefig('training-validation-confusion-matrix.png')