Deep Learning Workshop

Carlos-Francisco Méndez-Cruz
Commit fd6da844a387cbbfb806b52c61ddb47fa8c976dc (fd6da844) · 1 parent: aa056f22
Showing 1 changed file with 14 additions and 4 deletions
data-sets/get-hga-sequences.py
--- a/data-sets/get-hga-sequences.py
View file @fd6da84
+++ b/data-sets/get-hga-sequences.py
View file @fd6da84
@@ -74,6 +74,11 @@ if __name__ == "__main__":
 
     list_rows = []
     i = 0
+     length = 0
+     length_total_exon = 0
+     length_total_utr = 0
+     total_exon = 0
+     total_utr = 0
     # Read HGA csv file
     with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
         reader = csv.DictReader(csvfile)
@@ -91,22 +96,27 @@ if __name__ == "__main__":
             # start_codon
             # stop_codon
             # three_prime_utr
- 
+             length = int(row['end']) - int(row['start']) + 1
             if row['feature'] == "exon":
                 label = row['feature']
+                 length_total_exon += length
+                 total_exon += 1
             elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
                 label = "utr"
+                 length_total_utr += length
+                 total_utr += 1
             else:
                 label = "other"
-             new_row = "{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label)
+             new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
             list_rows.append(new_row)
             i += 1
             if (i % 100) == 0:
                 print("{} rows processed.".format(i))
             if i == 10000:
                 break
- 
+     print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr))
     with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
-         oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
+         oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
         for elem in list_rows:
             oFile.write(elem)
+