get-hga-sequences-py27.py
5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# Get sequences by combining Human Genome Annotation data set (csv)
# with FASTA files to obtain sequences corresponding to object in human genome
# using "start" and "end" columns from human-genome-annotation
# Install BioPython: conda install -c conda-forge biopython
# Input files:
# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# Output tab-separated format:
# Start End Sequence Feature
# Run:
# c:\Anaconda3\python get-hga-data-set.py
# --feature gene
# --outputFile hga-sequences-toy.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --hgaFile some-rows-example-human-genome-annotation.csv
# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences-toy.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
# python3 get-hga-sequences.py
# --feature gene
# --outputFile hga-sequences-toy.txt
# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --hgaFile Homo_sapiens.GRCh38.92.csv
# --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
import argparse
# from Bio import SeqIO
import csv
import os
from Bio.SeqIO.FastaIO import SimpleFastaParser
def get_total_len(filename):
count = 0
total_len = 0
with open(filename) as in_handle:
for title, seq in SimpleFastaParser(in_handle):
count += 1
total_len += len(seq)
retval = "{} records with total sequence length {}".format(count, total_len)
return retval
def get_sequence(filename, start, end):
ret_sequence = ""
with open(filename) as in_handle:
for title, seq in SimpleFastaParser(in_handle):
ret_sequence = seq[start:end + 1]
return ret_sequence
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
parser.add_argument('--fastaPath', dest='fastaPath',
help='Path for FASTA files')
parser.add_argument('--hgaPath', dest='hgaPath',
help='Path for Human Genome Annotation file')
parser.add_argument('--hgaFile', dest='hgaFile',
help='Human Genome Annotation file')
parser.add_argument('--outputPath', dest='outputPath',
help='Output path')
parser.add_argument('--outputFile', dest='outputFile',
help='Output file')
parser.add_argument('--feature', dest='feature',
help='Feature (gene, exon)')
args = parser.parse_args()
list_rows = []
i = 0
length = 0
length_total_exon = 0
length_total_utr = 0
total_exon = 0
total_utr = 0
# Read HGA csv file
with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
reader = csv.DictReader(csvfile)
for row in reader:
# print(row)
filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
# We use only
sequence = get_sequence(filename, int(row['start']), int(row['end']))
# Features in HGA:
# exon
# feature
# five_prime_utr
# gene
# Selenocysteine
# start_codon
# stop_codon
# three_prime_utr
length = int(row['end']) - int(row['start']) + 1
if row['feature'] == "exon":
label = row['feature']
length_total_exon += length
total_exon += 1
elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
label = "utr"
length_total_utr += length
total_utr += 1
else:
label = "other"
new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
list_rows.append(new_row)
i += 1
if (i % 100) == 0:
print("{} rows processed.".format(i))
if i == 10000:
break
print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr))
with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
for elem in list_rows:
oFile.write(elem)