Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

...@@ -80,9 +80,22 @@ if __name__ == "__main__": ...@@ -80,9 +80,22 @@ if __name__ == "__main__":
80 for row in reader: 80 for row in reader:
81 # print(row) 81 # print(row)
82 filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname'])) 82 filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
83 + # We use only exon and UTR features as labels; every other feature is labeled "other".
83 sequence = get_sequence(filename, int(row['start']), int(row['end'])) 84 sequence = get_sequence(filename, int(row['start']), int(row['end']))
84 - if row['feature'] == args.feature: 85 + # Features in HGA:
86 + # exon
87 + # feature
88 + # five_prime_utr
89 + # gene
90 + # Selenocysteine
91 + # start_codon
92 + # stop_codon
93 + # three_prime_utr
94 +
95 + if row['feature'] == "exon":
85 label = row['feature'] 96 label = row['feature']
97 + elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
98 + label = "utr"
86 else: 99 else:
87 label = "other" 100 label = "other"
88 new_row = "{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label) 101 new_row = "{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label)
...@@ -90,6 +103,8 @@ if __name__ == "__main__": ...@@ -90,6 +103,8 @@ if __name__ == "__main__":
90 i += 1 103 i += 1
91 if (i % 100) == 0: 104 if (i % 100) == 0:
92 print("{} rows processed.".format(i)) 105 print("{} rows processed.".format(i))
106 + if i == 10000:
107 + break
93 108
94 with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile: 109 with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
95 oFile.write("seqname\tstart\tend\tsequence\tlabel\n") 110 oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
......