Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

......@@ -94,6 +94,7 @@ if __name__ == "__main__":
# Pad shorter sequences with the 'X' character so they reach max_length
# One-hot-encoding of sequences
sequences_adjust = []
for sequence, label in zip(sequences, labels):
if len(sequence) < max_length:
# print("sequence: {}".format(sequence))
......@@ -102,6 +103,7 @@ if __name__ == "__main__":
# print("sequence_adjust: {}".format(sequence_adjust))
else:
sequence_adjust = sequence + 'ACGTX'
sequences_adjust.append(sequence_adjust)
print("Length sequence_adjust: {}".format(len(sequence_adjust)))
integer_encoded = integer_encoder.fit_transform(list(sequence_adjust))
print("integer_encoded.classes_: {}".format(integer_encoder.classes_))
......@@ -115,7 +117,8 @@ if __name__ == "__main__":
np.set_printoptions(threshold=40)
input_features = np.stack(input_features)
print("Example sequence\n-----------------------")
print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
# print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
print('DNA Sequence #1:\n', sequences_adjust[0][:10], '...', sequences_adjust[0][-10:])
print('One hot encoding of Sequence #1:\n', input_features[0].T)
# One-hot-encoding of labels
......
......@@ -24,7 +24,7 @@
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py27-v2.py --inputFile hga-sequences-toy.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# python get-hga-training-test-py27-v2.py --inputFile hga-sequences-1000.txt --inputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /mnt/Genoma/amedina/cmendez/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
import argparse
import pandas as pd
......