Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

1 +# Build the training and test data set for deep learning.
2 +
3 +# Install BioPython: conda install -c conda-forge biopython
4 +
5 +# Input files:
6 +# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
7 +
8 +# Output tab-separated format:
9 +# Seqname Start End Sequence Label (Label is the requested feature or "other")
10 +
11 +# Run:
12 +# c:\Anaconda3\python get-hga-data-set.py
13 +# --feature gene
14 +# --outputFile hga-sequences.txt
15 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
16 +# --hgaFile some-rows-example-human-genome-annotation.csv
17 +# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
18 +# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
19 +# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
20 +
21 +import argparse
22 +# from Bio import SeqIO
23 +import csv
24 +import os
25 +from Bio.SeqIO.FastaIO import SimpleFastaParser
26 +
27 +
def get_total_len(filename):
    """Summarize a FASTA file as a record count and a combined length.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A string of the form
        "<count> records with total sequence length <total>".
    """
    with open(filename) as fasta_handle:
        # One length per record; record titles are not needed for the summary.
        lengths = [
            len(residues)
            for _title, residues in SimpleFastaParser(fasta_handle)
        ]
    summary = "{} records with total sequence length {}"
    return summary.format(len(lengths), sum(lengths))
37 +
def get_sequence(filename, start, end):
    """Return the characters at indices start..end (end included) of a FASTA file.

    Every record in the file is visited and the slice is taken again from
    each one, so for a multi-record file the result comes from the last
    record. A file without records yields the empty string.

    NOTE(review): ``start`` is used directly as a 0-based Python index. If
    the annotation coordinates passed in are 1-based (as in GTF/GFF files),
    the returned window is shifted by one base -- confirm against the
    annotation source before relying on exact positions.

    Args:
        filename: Path of the FASTA file to read.
        start: Index of the first character to return.
        end: Index of the last character to return (inclusive).

    Returns:
        The extracted subsequence as a string.
    """
    window = slice(start, end + 1)
    fragment = ""
    with open(filename) as fasta_handle:
        for _title, residues in SimpleFastaParser(fasta_handle):
            fragment = residues[window]
    return fragment
44 +
def _build_arg_parser():
    """Create the command-line parser for the data-set extraction script."""
    parser = argparse.ArgumentParser(
        description='Get source data set for Human Genome Annotation.')
    # The five path/file options are passed to os.path.join(), which raises
    # TypeError on None, so a missing one is rejected by argparse up front.
    parser.add_argument('--fastaPath', dest='fastaPath', required=True,
                        help='Path for FASTA files')
    parser.add_argument('--hgaPath', dest='hgaPath', required=True,
                        help='Path for Human Genome Annotation file')
    parser.add_argument('--hgaFile', dest='hgaFile', required=True,
                        help='Human Genome Annotation file')
    parser.add_argument('--outputPath', dest='outputPath', required=True,
                        help='Output path')
    parser.add_argument('--outputFile', dest='outputFile', required=True,
                        help='Output file')
    # Left optional: without it no row matches and every label is "other".
    parser.add_argument('--feature', dest='feature',
                        help='Feature (gene, exon)')
    return parser


def _read_chromosome(fasta_file):
    """Return the sequence of the last record in *fasta_file* ("" if none).

    Taking the last record mirrors get_sequence(), which re-slices on every
    record and therefore ends up with the final one.
    """
    sequence = ""
    with open(fasta_file) as in_handle:
        for _title, seq in SimpleFastaParser(in_handle):
            sequence = seq
    return sequence


def main(argv=None):
    """Write one labelled sequence row per annotation row.

    Reads the annotation CSV (columns used: seqname, start, end, feature),
    cuts the matching window out of the chromosome FASTA file named
    "Homo_sapiens.GRCh38.dna.chromosome.<seqname>.fa", and writes
    tab-separated rows: seqname, start, end, sequence, label. The label is
    the row's feature when it equals --feature, otherwise "other".

    The output file is written only after every annotation row has been
    processed, so a failure while reading leaves no partial output.

    Args:
        argv: Argument list to parse; None means sys.argv[1:].

    Raises:
        SystemExit: If a required command-line option is missing.
    """
    args = _build_arg_parser().parse_args(argv)

    list_rows = []
    # Only the most recently used chromosome is kept in memory. Parsing a
    # chromosome FASTA is expensive, and consecutive annotation rows usually
    # share a chromosome, so this avoids re-reading the file for every row
    # while never holding more than one chromosome at a time.
    cached_file = None
    cached_sequence = ""

    # newline="" lets the csv module handle line endings itself.
    hga_file = os.path.join(args.hgaPath, args.hgaFile)
    with open(hga_file, mode="r", encoding="utf-8", newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            # Echo each annotation row as a progress trace.
            print(row)
            filename = os.path.join(
                args.fastaPath,
                "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
            start = int(row['start'])
            end = int(row['end'])
            if filename != cached_file:
                cached_sequence = _read_chromosome(filename)
                cached_file = filename
            # Same window as get_sequence(): indices start..end, end included.
            # NOTE(review): if start/end are 1-based (GTF-style) this window
            # is shifted by one base -- confirm the coordinate convention.
            sequence = cached_sequence[start:end + 1]
            label = row['feature'] if row['feature'] == args.feature else "other"
            list_rows.append("{}\t{}\t{}\t{}\t{}\n".format(
                row['seqname'], row['start'], row['end'], sequence, label))

    output_file = os.path.join(args.outputPath, args.outputFile)
    with open(output_file, mode="w", encoding="utf-8") as oFile:
        oFile.writelines(list_rows)


if __name__ == "__main__":
    main()