Showing
1 changed file
with
26 additions
and
24 deletions
... | @@ -86,35 +86,37 @@ if __name__ == "__main__": | ... | @@ -86,35 +86,37 @@ if __name__ == "__main__": |
86 | for row in reader: | 86 | for row in reader: |
87 | # print(row) | 87 | # print(row) |
88 | filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname'])) | 88 | filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname'])) |
89 | - # We use only | 89 | + if row['feature'] in ["exon", "five_prime_utr", "three_prime_utr"]: |
90 | - sequence = get_sequence(filename, int(row['start']), int(row['end'])) | 90 | + # We use only exon, five_prime_utr, and three_prime_utr |
91 | - # Features in HGA: | 91 | + sequence = get_sequence(filename, int(row['start']), int(row['end'])) |
92 | - # exon | 92 | + # Features in HGA: |
93 | - # feature | 93 | + # exon |
94 | - # five_prime_utr | 94 | + # feature |
95 | - # gene | 95 | + # five_prime_utr |
96 | - # Selenocysteine | 96 | + # gene |
97 | - # start_codon | 97 | + # Selenocysteine |
98 | - # stop_codon | 98 | + # start_codon |
99 | - # three_prime_utr | 99 | + # stop_codon |
100 | - length = int(row['end']) - int(row['start']) + 1 | 100 | + # three_prime_utr |
101 | - if row['feature'] == "exon": | 101 | + length = int(row['end']) - int(row['start']) + 1 |
102 | - label = row['feature'] | 102 | + if row['feature'] == "exon": |
103 | - length_total_exon += length | 103 | + label = row['feature'] |
104 | - total_exon += 1 | 104 | + length_total_exon += length |
105 | - elif row['feature'] in ["five_prime_utr", "three_prime_utr"]: | 105 | + total_exon += 1 |
106 | - label = "utr" | 106 | + elif row['feature'] in ["five_prime_utr", "three_prime_utr"]: |
107 | - length_total_utr += length | 107 | + label = "utr" |
108 | - total_utr += 1 | 108 | + length_total_utr += length |
109 | - else: | 109 | + total_utr += 1 |
110 | - label = "other" | 110 | + else: |
111 | - new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label) | 111 | + label = "other" |
112 | - list_rows.append(new_row) | 112 | + new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label) |
113 | + list_rows.append(new_row) | ||
113 | i += 1 | 114 | i += 1 |
114 | if (i % 100) == 0: | 115 | if (i % 100) == 0: |
115 | print("{} rows processed.".format(i)) | 116 | print("{} rows processed.".format(i)) |
116 | if i == 10000: | 117 | if i == 10000: |
117 | break | 118 | break |
119 | + print("Count exons {} and utr {}".format(total_exon, total_utr)) | ||
118 | print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr)) | 120 | print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr)) |
119 | with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile: | 121 | with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile: |
120 | oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n") | 122 | oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n") | ... | ... |
-
Please register or login to post a comment