Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
27 # --hgaFile Homo_sapiens.GRCh38.92.csv 27 # --hgaFile Homo_sapiens.GRCh38.92.csv
28 # --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 28 # --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
29 # --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna 29 # --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
30 -# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences-toy.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna 30 +# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
31 31
32 import argparse 32 import argparse
33 # from Bio import SeqIO 33 # from Bio import SeqIO
......
...@@ -60,12 +60,28 @@ if __name__ == "__main__": ...@@ -60,12 +60,28 @@ if __name__ == "__main__":
60 print("df: {}".format(df)) 60 print("df: {}".format(df))
61 sequences = df['sequence'] 61 sequences = df['sequence']
62 labels = df['label'] 62 labels = df['label']
63 +
64 + max_exon_length = 0
65 + max_utr_length = 0
63 # One-hot-encoding of sequences 66 # One-hot-encoding of sequences
64 - for sequence in sequences: 67 + for sequence, label in zip(sequences, labels):
68 + if label == "exon":
69 + if len(sequence) > max_exon_length:
70 + max_exon_length = len(sequence)
71 + elif label == "utr":
72 + if len(sequence) > max_utr_length:
73 + max_utr_length = len(sequence)
74 + '''
65 integer_encoded = integer_encoder.fit_transform(list(sequence)) 75 integer_encoded = integer_encoder.fit_transform(list(sequence))
66 integer_encoded = np.array(integer_encoded).reshape(-1, 1) 76 integer_encoded = np.array(integer_encoded).reshape(-1, 1)
67 one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded) 77 one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
68 input_features.append(one_hot_encoded.toarray()) 78 input_features.append(one_hot_encoded.toarray())
79 + '''
80 +
81 + print("Max exon length: {}".format(max_exon_length))
82 + print("Max utr length: {}".format(max_utr_length))
83 +
84 + exit()
69 85
70 # Print first sequence and one-hot-encoding 86 # Print first sequence and one-hot-encoding
71 np.set_printoptions(threshold=40) 87 np.set_printoptions(threshold=40)
......