Deep Learning Workshop

Carlos-Francisco Méndez-Cruz
Commit 4d536d405a43ac0f5ddefc59712ab3dd17846e2a (short SHA: 4d536d40) · 1 parent: dc0e4cbb
Showing 1 changed file with 26 additions and 24 deletions
data-sets/get-hga-sequences-py3.py
--- a/data-sets/get-hga-sequences-py3.py
View file @4d536d4
+++ b/data-sets/get-hga-sequences-py3.py
View file @4d536d4
@@ -86,35 +86,37 @@ if __name__ == "__main__":
         for row in reader:
             # print(row)
             filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
-             # We use only
-             sequence = get_sequence(filename, int(row['start']), int(row['end']))
-             # Features in HGA:
-             # exon
-             # feature
-             # five_prime_utr
-             # gene
-             # Selenocysteine
-             # start_codon
-             # stop_codon
-             # three_prime_utr
-             length = int(row['end']) - int(row['start']) + 1
-             if row['feature'] == "exon":
-                 label = row['feature']
-                 length_total_exon += length
-                 total_exon += 1
-             elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
-                 label = "utr"
-                 length_total_utr += length
-                 total_utr += 1
-             else:
-                 label = "other"
-             new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
-             list_rows.append(new_row)
+             if row['feature'] in ["exon", "five_prime_utr", "three_prime_utr"]:
+                 # We use only exon, five_prime_utr, and three_prime_utr
+                 sequence = get_sequence(filename, int(row['start']), int(row['end']))
+                 # Features in HGA:
+                 # exon
+                 # feature
+                 # five_prime_utr
+                 # gene
+                 # Selenocysteine
+                 # start_codon
+                 # stop_codon
+                 # three_prime_utr
+                 length = int(row['end']) - int(row['start']) + 1
+                 if row['feature'] == "exon":
+                     label = row['feature']
+                     length_total_exon += length
+                     total_exon += 1
+                 elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
+                     label = "utr"
+                     length_total_utr += length
+                     total_utr += 1
+                 else:
+                     label = "other"
+                 new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
+                 list_rows.append(new_row)
             i += 1
             if (i % 100) == 0:
                 print("{} rows processed.".format(i))
             if i == 10000:
                 break
+     print("Count exons {} and utr {}".format(total_exon, total_utr))
     print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr))
     with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
         oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")