Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

...@@ -94,6 +94,7 @@ if __name__ == "__main__": ...@@ -94,6 +94,7 @@ if __name__ == "__main__":
94 94
95 # Fill sequence with X char to get max length 95 # Fill sequence with X char to get max length
96 # One-hot-encoding of sequences 96 # One-hot-encoding of sequences
97 + sequences_adjust = []
97 for sequence, label in zip(sequences, labels): 98 for sequence, label in zip(sequences, labels):
98 if len(sequence) < max_length: 99 if len(sequence) < max_length:
99 # print("sequence: {}".format(sequence)) 100 # print("sequence: {}".format(sequence))
...@@ -102,6 +103,7 @@ if __name__ == "__main__": ...@@ -102,6 +103,7 @@ if __name__ == "__main__":
102 # print("sequence_adjust: {}".format(sequence_adjust)) 103 # print("sequence_adjust: {}".format(sequence_adjust))
103 else: 104 else:
104 sequence_adjust = sequence + 'ACGTX' 105 sequence_adjust = sequence + 'ACGTX'
106 + sequences_adjust.append(sequence_adjust)
105 print("Length sequence_adjust: {}".format(len(sequence_adjust))) 107 print("Length sequence_adjust: {}".format(len(sequence_adjust)))
106 integer_encoded = integer_encoder.fit_transform(list(sequence_adjust)) 108 integer_encoded = integer_encoder.fit_transform(list(sequence_adjust))
107 print("integer_encoded.classes_: {}".format(integer_encoder.classes_)) 109 print("integer_encoded.classes_: {}".format(integer_encoder.classes_))
...@@ -115,7 +117,8 @@ if __name__ == "__main__": ...@@ -115,7 +117,8 @@ if __name__ == "__main__":
115 np.set_printoptions(threshold=40) 117 np.set_printoptions(threshold=40)
116 input_features = np.stack(input_features) 118 input_features = np.stack(input_features)
117 print("Example sequence\n-----------------------") 119 print("Example sequence\n-----------------------")
118 - print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:]) 120 + # print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
121 + print('DNA Sequence #1:\n', sequences_adjust[0][:10], '...', sequences_adjust[0][-10:])
119 print('One hot encoding of Sequence #1:\n', input_features[0].T) 122 print('One hot encoding of Sequence #1:\n', input_features[0].T)
120 123
121 # One-hot-encoding of labels 124 # One-hot-encoding of labels
......
...@@ -24,7 +24,7 @@ ...@@ -24,7 +24,7 @@
24 # --outputTraining hga-sequences-training.txt 24 # --outputTraining hga-sequences-training.txt
25 # --outputTest hga-sequences-test.txt 25 # --outputTest hga-sequences-test.txt
26 # --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 26 # --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
27 -# python get-hga-training-test-py27-v2.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 27 +# python get-hga-training-test-py27-v2.py --inputFile hga-sequences-1000.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
28 28
29 import argparse 29 import argparse
30 import pandas as pd 30 import pandas as pd
......