Deep Learning Workshop

Carlos-Francisco Méndez-Cruz
Commit 7ae410f355624f81cf7cb5c5ad02ef34963030a3 (short: 7ae410f3) · 1 parent: 45c7e69e
Showing 2 changed files with 18 additions and 2 deletions
data-sets/get-hga-sequences.py
data-sets/get-hga-training-test.py
--- a/data-sets/get-hga-sequences.py
View file @7ae410f
+++ b/data-sets/get-hga-sequences.py
View file @7ae410f
@@ -27,7 +27,7 @@
 # --hgaFile Homo_sapiens.GRCh38.92.csv
 # --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 # --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
-# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences-toy.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
 import argparse
 # from Bio import SeqIO
--- a/data-sets/get-hga-training-test.py
View file @7ae410f
+++ b/data-sets/get-hga-training-test.py
View file @7ae410f
@@ -60,12 +60,28 @@ if __name__ == "__main__":
         print("df: {}".format(df))
         sequences = df['sequence']
         labels = df['label']
+
+    max_exon_length = 0
+    max_utr_length = 0
     # One-hot-encoding of sequences
-    for sequence in sequences:
+    for sequence, label in zip(sequences, labels):
+        if label == "exon":
+            if len(sequence) > max_exon_length:
+                max_exon_length = len(sequence)
+        elif label == "utr":
+            if len(sequence) > max_utr_length:
+                max_utr_length = len(sequence)
+        '''
         integer_encoded = integer_encoder.fit_transform(list(sequence))
         integer_encoded = np.array(integer_encoded).reshape(-1, 1)
         one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
         input_features.append(one_hot_encoded.toarray())
+        '''
+
+    print("Max exon length: {}".format(max_exon_length))
+    print("Max utr length: {}".format(max_utr_length))
+
+    exit()
     # Print first sequence and one-hot-encoding
     np.set_printoptions(threshold=40)