Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

......@@ -80,9 +80,22 @@ if __name__ == "__main__":
for row in reader:
# print(row)
filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
# We use only
sequence = get_sequence(filename, int(row['start']), int(row['end']))
if row['feature'] == args.feature:
# Features in HGA:
# exon
# feature
# five_prime_utr
# gene
# Selenocysteine
# start_codon
# stop_codon
# three_prime_utr
if row['feature'] == "exon":
label = row['feature']
elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
label = "utr"
else:
label = "other"
new_row = "{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label)
......@@ -90,6 +103,8 @@ if __name__ == "__main__":
i += 1
if (i % 100) == 0:
print("{} rows processed.".format(i))
if i == 10000:
break
with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
......