Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

......@@ -86,35 +86,37 @@ if __name__ == "__main__":
# NOTE(review): this span appears to be a rendered diff hunk with indentation lost.
# The statements from "# We use only" down to the first list_rows.append(...) look
# like the pre-change loop body, and the section starting at the
# `if row['feature'] in [...]` filter looks like the post-change body.
# Confirm the real nesting against the repository before editing.
#
# For each row read from `reader`, the code reads the 'seqname', 'start', 'end'
# and 'feature' fields, fetches a sequence, assigns a label, and appends one
# tab-separated line to list_rows.
for row in reader:
# print(row)
# Path of the per-chromosome FASTA file: the file name is formatted from the
# row's seqname and joined onto args.fastaPath.
filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
# We use only
# get_sequence is defined outside this view; presumably it returns the bases
# between start and end of the given file — verify against its definition.
sequence = get_sequence(filename, int(row['start']), int(row['end']))
# Features in HGA:
# exon
# feature
# five_prime_utr
# gene
# Selenocysteine
# start_codon
# stop_codon
# three_prime_utr
# The +1 makes the length inclusive of both the start and the end position.
length = int(row['end']) - int(row['start']) + 1
# Map the feature to a label and accumulate the per-label count and total length.
if row['feature'] == "exon":
label = row['feature']
length_total_exon += length
total_exon += 1
elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
# Both UTR kinds share the single label "utr".
label = "utr"
length_total_utr += length
total_utr += 1
else:
# Any other feature is labelled "other" and is not counted in the totals.
label = "other"
# Output line columns: seqname, start, end, length, sequence, label.
new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
list_rows.append(new_row)
# Only rows whose feature is exon, five_prime_utr or three_prime_utr pass this filter.
if row['feature'] in ["exon", "five_prime_utr", "three_prime_utr"]:
# We use only exon, five_prime_utr, and three_prime_utr
sequence = get_sequence(filename, int(row['start']), int(row['end']))
# Features in HGA:
# exon
# feature
# five_prime_utr
# gene
# Selenocysteine
# start_codon
# stop_codon
# three_prime_utr
# The +1 makes the length inclusive of both the start and the end position.
length = int(row['end']) - int(row['start']) + 1
if row['feature'] == "exon":
label = row['feature']
length_total_exon += length
total_exon += 1
elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
label = "utr"
length_total_utr += length
total_utr += 1
else:
# NOTE(review): if this chain is nested under the feature filter above, the
# two branches before it already cover every allowed value and this branch
# cannot be reached — confirm nesting.
label = "other"
new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
list_rows.append(new_row)
# NOTE(review): whether the counter below advances for every row or only for
# rows that pass the feature filter depends on indentation not visible here.
i += 1
# Print a progress message each time i reaches a multiple of 100.
if (i % 100) == 0:
print("{} rows processed.".format(i))
# Stop reading once i reaches 10000.
if i == 10000:
break
# Summary of how many exon and UTR rows were counted.
print("Count exons {} and utr {}".format(total_exon, total_utr))
# Mean length per label. A label with no counted rows would make the division
# raise ZeroDivisionError before the output file below is written, discarding
# all processed rows; report 0.0 for that label instead.
mean_exon_length = length_total_exon / total_exon if total_exon else 0.0
mean_utr_length = length_total_utr / total_utr if total_utr else 0.0
print("Length media exons {} and utr {}".format(mean_exon_length, mean_utr_length))
with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
......