Showing 2 changed files with 18 additions and 2 deletions
... | @@ -27,7 +27,7 @@ | ... | @@ -27,7 +27,7 @@ |
27 | # --hgaFile Homo_sapiens.GRCh38.92.csv | 27 | # --hgaFile Homo_sapiens.GRCh38.92.csv |
28 | # --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation | 28 | # --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation |
29 | # --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna | 29 | # --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna |
30 | -# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences-toy.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna | 30 | +# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna |
31 | 31 | ||
32 | import argparse | 32 | import argparse |
33 | # from Bio import SeqIO | 33 | # from Bio import SeqIO | ... | ... |
... | @@ -60,12 +60,28 @@ if __name__ == "__main__": | ... | @@ -60,12 +60,28 @@ if __name__ == "__main__": |
60 | print("df: {}".format(df)) | 60 | print("df: {}".format(df)) |
61 | sequences = df['sequence'] | 61 | sequences = df['sequence'] |
62 | labels = df['label'] | 62 | labels = df['label'] |
63 | + | ||
64 | + max_exon_length = 0 | ||
65 | + max_utr_length = 0 | ||
63 | # One-hot-encoding of sequences | 66 | # One-hot-encoding of sequences |
64 | - for sequence in sequences: | 67 | + for sequence, label in zip(sequences, labels): |
68 | + if label == "exon": | ||
69 | + if len(sequence) > max_exon_length: | ||
70 | + max_exon_length = len(sequence) | ||
71 | + elif label == "utr": | ||
72 | + if len(sequence) > max_utr_length: | ||
73 | + max_utr_length = len(sequence) | ||
74 | + ''' | ||
65 | integer_encoded = integer_encoder.fit_transform(list(sequence)) | 75 | integer_encoded = integer_encoder.fit_transform(list(sequence)) |
66 | integer_encoded = np.array(integer_encoded).reshape(-1, 1) | 76 | integer_encoded = np.array(integer_encoded).reshape(-1, 1) |
67 | one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded) | 77 | one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded) |
68 | input_features.append(one_hot_encoded.toarray()) | 78 | input_features.append(one_hot_encoded.toarray()) |
79 | + ''' | ||
80 | + | ||
81 | + print("Max exon length: {}".format(max_exon_length)) | ||
82 | + print("Max utr length: {}".format(max_utr_length)) | ||
83 | + | ||
84 | + exit() | ||
69 | 85 | ||
70 | # Print first sequence and one-hot-encoding | 86 | # Print first sequence and one-hot-encoding |
71 | np.set_printoptions(threshold=40) | 87 | np.set_printoptions(threshold=40) | ... | ... |
-
Please register or log in to post a comment