Showing
1 changed file
with
79 additions
and
0 deletions
data-sets/get-hga-data-set.py
0 → 100644
1 | +# Get training and test data sets for deep learning. | ||
2 | + | ||
3 | +# Install BioPython: conda install -c conda-forge biopython | ||
4 | + | ||
5 | +# Input files: | ||
6 | +# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna | ||
7 | + | ||
8 | +# Output tab-separated format: | ||
9 | +# Seqname Start End Sequence Label (the requested feature, or "other") | ||
10 | + | ||
11 | +# Run: | ||
12 | +# c:\Anaconda3\python get-hga-data-set.py | ||
13 | +# --feature gene | ||
14 | +# --outputFile hga-sequences.txt | ||
15 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation | ||
16 | +# --hgaFile some-rows-example-human-genome-annotation.csv | ||
17 | +# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation | ||
18 | +# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files | ||
19 | +# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files | ||
20 | + | ||
21 | +import argparse | ||
22 | +# from Bio import SeqIO | ||
23 | +import csv | ||
24 | +import os | ||
25 | +from Bio.SeqIO.FastaIO import SimpleFastaParser | ||
26 | + | ||
27 | + | ||
def get_total_len(filename):
    """Summarize a FASTA file as a record count and a combined sequence length.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A string of the form
        ``"<count> records with total sequence length <total>"``.
    """
    with open(filename) as fasta:
        # SimpleFastaParser yields (title, sequence) pairs; only the sequence
        # lengths are needed for the summary.
        lengths = [len(sequence) for _title, sequence in SimpleFastaParser(fasta)]
    return "{} records with total sequence length {}".format(len(lengths), sum(lengths))
37 | + | ||
def get_sequence(filename, start, end):
    """Extract the sub-sequence ``[start, end]`` from a FASTA file.

    Both bounds are used as Python (0-based) indices and ``end`` is inclusive.
    When the file contains several records, the slice is taken from the last
    one; when it contains none, an empty string is returned.

    Args:
        filename: Path of the FASTA file to read.
        start: Index of the first character to include.
        end: Index of the last character to include.

    Returns:
        The selected part of the sequence as a string.
    """
    last_sequence = ""
    with open(filename) as fasta:
        # Walk through every record; the loop variable keeps the final one.
        for _title, last_sequence in SimpleFastaParser(fasta):
            pass
    # NOTE(review): if the annotation coordinates are 1-based inclusive (as in
    # GTF/GFF files), this slice is shifted by one base and should instead be
    # [start - 1:end] -- confirm against the annotation source.
    return last_sequence[start:end + 1]
44 | + | ||
if __name__ == "__main__":
    # Command-line entry point: read the annotation CSV, fetch the sequence of
    # every annotated region from the matching chromosome FASTA file, and write
    # one tab-separated line per region:
    #     seqname <TAB> start <TAB> end <TAB> sequence <TAB> label
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    # The path/file options are all fed to os.path.join below, so leaving one
    # out used to end in an obscure TypeError; argparse now reports it instead.
    parser.add_argument('--fastaPath', dest='fastaPath', required=True,
                        help='Path for FASTA files')
    parser.add_argument('--hgaPath', dest='hgaPath', required=True,
                        help='Path for Human Genome Annotation file')
    parser.add_argument('--hgaFile', dest='hgaFile', required=True,
                        help='Human Genome Annotation file')
    parser.add_argument('--outputPath', dest='outputPath', required=True,
                        help='Output path')
    parser.add_argument('--outputFile', dest='outputFile', required=True,
                        help='Output file')
    # Left optional: without it no row matches and every label is "other".
    parser.add_argument('--feature', dest='feature',
                        help='Feature (gene, exon)')

    args = parser.parse_args()

    list_rows = []
    # Read HGA csv file. newline="" lets the csv module handle line endings
    # itself, as its documentation requires for file objects.
    hga_file = os.path.join(args.hgaPath, args.hgaFile)
    with open(hga_file, mode="r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            print(row)
            # One FASTA file per chromosome, selected by the row's seqname.
            filename = os.path.join(
                args.fastaPath,
                "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
            # NOTE: get_sequence opens and parses the FASTA file on every call,
            # so each annotation row costs one full read of its chromosome.
            sequence = get_sequence(filename, int(row['start']), int(row['end']))
            # Rows of the requested feature keep their name; all others are
            # grouped under the "other" label.
            label = row['feature'] if row['feature'] == args.feature else "other"
            list_rows.append("{}\t{}\t{}\t{}\t{}\n".format(
                row['seqname'], row['start'], row['end'], sequence, label))

    # Explicit encoding keeps the output independent of the platform default
    # and consistent with how the input file is read.
    output_file = os.path.join(args.outputPath, args.outputFile)
    with open(output_file, mode="w", encoding="utf-8") as oFile:
        oFile.writelines(list_rows)
-
Please register or login to post a comment