Deep Learning Workshop

Carlos-Francisco Méndez-Cruz
Commit 45c7e69e9bdc1aabec1914bc596f066d9f70bd84 45c7e69e 1 parent bb88e929
Showing 1 changed file with 79 additions and 0 deletions
data-sets/get-hga-data-set.py
--- a/data-sets/get-hga-data-set.py 0 → 100644
View file @45c7e69
+++ b/data-sets/get-hga-data-set.py 0 → 100644
View file @45c7e69
+# Get training and test data set to deep learning.
+
+# Install BioPython: conda install -c conda-forge biopython
+
+# Input files:
+# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+
+# Output tab-separated format:
+# Start End Sequence Feature
+
+# Run:
+# c:\Anaconda3\python get-hga-data-set.py
+# --feature gene
+# --outputFile hga-sequences.txt
+# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
+# --hgaFile some-rows-example-human-genome-annotation.csv
+# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
+# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
+# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
+
+import argparse
+# from Bio import SeqIO
+import csv
+import os
+from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+
+def get_total_len(filename):
+    count = 0
+    total_len = 0
+    with open(filename) as in_handle:
+        for title, seq in SimpleFastaParser(in_handle):
+            count += 1
+            total_len += len(seq)
+    retval = "{} records with total sequence length {}".format(count, total_len)
+    return retval
+
+def get_sequence(filename, start, end):
+    ret_sequence = ""
+    with open(filename) as in_handle:
+        for title, seq in SimpleFastaParser(in_handle):
+            ret_sequence = seq[start:end+1]
+    return ret_sequence
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
+    parser.add_argument('--fastaPath', dest='fastaPath',
+                        help='Path for FASTA files')
+    parser.add_argument('--hgaPath', dest='hgaPath',
+                        help='Path for Human Genome Annotation file')
+    parser.add_argument('--hgaFile', dest='hgaFile',
+                        help='Human Genome Annotation file')
+    parser.add_argument('--outputPath', dest='outputPath',
+                        help='Output path')
+    parser.add_argument('--outputFile', dest='outputFile',
+                        help='Output file')
+    parser.add_argument('--feature', dest='feature',
+                        help='Feature (gene, exon)')
+
+    args = parser.parse_args()
+
+    list_rows = []
+    # Read HGA csv file
+    with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            print(row)
+            filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
+            sequence = get_sequence(filename, int(row['start']), int(row['end']))
+            if row['feature'] == args.feature:
+                label = row['feature']
+            else:
+                label = "other"
+            new_row = "{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label)
+            list_rows.append(new_row)
+
+    with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
+        for elem in list_rows:
+            oFile.write(elem)