Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

...@@ -86,35 +86,37 @@ if __name__ == "__main__": ...@@ -86,35 +86,37 @@ if __name__ == "__main__":
86 for row in reader: 86 for row in reader:
87 # print(row) 87 # print(row)
88 filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname'])) 88 filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
89 - # We use only 89 + if row['feature'] in ["exon", "five_prime_utr", "three_prime_utr"]:
90 - sequence = get_sequence(filename, int(row['start']), int(row['end'])) 90 + # We use only exon, five_prime_utr, and three_prime_utr
91 - # Features in HGA: 91 + sequence = get_sequence(filename, int(row['start']), int(row['end']))
92 - # exon 92 + # Features in HGA:
93 - # feature 93 + # exon
94 - # five_prime_utr 94 + # feature
95 - # gene 95 + # five_prime_utr
96 - # Selenocysteine 96 + # gene
97 - # start_codon 97 + # Selenocysteine
98 - # stop_codon 98 + # start_codon
99 - # three_prime_utr 99 + # stop_codon
100 - length = int(row['end']) - int(row['start']) + 1 100 + # three_prime_utr
101 - if row['feature'] == "exon": 101 + length = int(row['end']) - int(row['start']) + 1
102 - label = row['feature'] 102 + if row['feature'] == "exon":
103 - length_total_exon += length 103 + label = row['feature']
104 - total_exon += 1 104 + length_total_exon += length
105 - elif row['feature'] in ["five_prime_utr", "three_prime_utr"]: 105 + total_exon += 1
106 - label = "utr" 106 + elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
107 - length_total_utr += length 107 + label = "utr"
108 - total_utr += 1 108 + length_total_utr += length
109 - else: 109 + total_utr += 1
110 - label = "other" 110 + else:
111 - new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label) 111 + label = "other"
112 - list_rows.append(new_row) 112 + new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
113 + list_rows.append(new_row)
113 i += 1 114 i += 1
114 if (i % 100) == 0: 115 if (i % 100) == 0:
115 print("{} rows processed.".format(i)) 116 print("{} rows processed.".format(i))
116 if i == 10000: 117 if i == 10000:
117 break 118 break
119 + print("Count exons {} and utr {}".format(total_exon, total_utr))
118 print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr)) 120 print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr))
119 with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile: 121 with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
120 oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n") 122 oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
......