Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

......@@ -74,6 +74,11 @@ if __name__ == "__main__":
list_rows = []
i = 0
length = 0
length_total_exon = 0
length_total_utr = 0
total_exon = 0
total_utr = 0
# Read the HGA CSV file
with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)
......@@ -91,22 +96,27 @@ if __name__ == "__main__":
# start_codon
# stop_codon
# three_prime_utr
length = int(row['end']) - int(row['start']) + 1
if row['feature'] == "exon":
label = row['feature']
length_total_exon += length
total_exon += 1
elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
label = "utr"
length_total_utr += length
total_utr += 1
else:
label = "other"
new_row = "{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label)
new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
list_rows.append(new_row)
i += 1
if (i % 100) == 0:
print("{} rows processed.".format(i))
if i == 10000:
break
print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr))
with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
for elem in list_rows:
oFile.write(elem)
......