Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

1 +# Get sequences by combining Human Genome Annotation data set (csv)
2 +# with FASTA files to obtain sequences corresponding to object in human genome
3 +# using "start" and "end" columns from human-genome-annotation
4 +
5 +# Install BioPython: conda install -c conda-forge biopython
6 +
7 +# Input files:
8 +# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
9 +
10 +# Output tab-separated format:
11 +# seqname start end sequence label
12 +
13 +# Run:
14 +# c:\Anaconda3\python get-hga-data-set.py
15 +# --feature gene
16 +# --outputFile hga-sequences.txt
17 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
18 +# --hgaFile some-rows-example-human-genome-annotation.csv
19 +# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
20 +# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
21 +# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
22 +
23 +import argparse
24 +# from Bio import SeqIO
25 +import csv
26 +import os
27 +from Bio.SeqIO.FastaIO import SimpleFastaParser
28 +
29 +
def get_total_len(filename):
    """Summarize a FASTA file as a record count and combined sequence length.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        A string of the form
        "<count> records with total sequence length <total>".
    """
    record_lengths = []
    with open(filename) as fasta_handle:
        # Each parsed record is unpacked as a (title, sequence) pair; only
        # the sequence length is needed here.
        for _title, record_seq in SimpleFastaParser(fasta_handle):
            record_lengths.append(len(record_seq))
    return "{} records with total sequence length {}".format(
        len(record_lengths), sum(record_lengths))
39 +
from functools import lru_cache


@lru_cache(maxsize=2)
def _read_last_fasta_sequence(filename):
    """Return the sequence of the last record in *filename* ("" if none).

    The result is cached per file name so that repeated look-ups against the
    same FASTA file parse it only once. The cache is kept deliberately small
    because a single sequence can be very large.
    """
    last_sequence = ""
    with open(filename) as in_handle:
        for _title, seq in SimpleFastaParser(in_handle):
            last_sequence = seq
    return last_sequence


def get_sequence(filename, start, end):
    """Return the slice ``[start, end]`` of the sequence stored in a FASTA file.

    Args:
        filename: Path of the FASTA file to read. When the file holds several
            records, the last one is used.
        start: Index of the first character to return (used directly as a
            Python slice start).
        end: Index of the last character to return (inclusive).

    Returns:
        The requested subsequence as a string, or "" when the file contains
        no records.
    """
    # NOTE(review): start is used as a 0-based index while end is inclusive.
    # If the annotation coordinates are 1-based inclusive, this slice is
    # shifted by one base -- confirm against the annotation format.
    return _read_last_fasta_sequence(filename)[start:end + 1]
46 +
if __name__ == "__main__":
    # Command-line entry point: read the annotation CSV, look up the sequence
    # of every row in the matching chromosome FASTA file, and write a
    # tab-separated file with columns seqname, start, end, sequence, label.
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    # Path/file options are required: they are passed straight to
    # os.path.join, which raises TypeError on None.
    parser.add_argument('--fastaPath', dest='fastaPath', required=True,
                        help='Path for FASTA files')
    parser.add_argument('--hgaPath', dest='hgaPath', required=True,
                        help='Path for Human Genome Annotation file')
    parser.add_argument('--hgaFile', dest='hgaFile', required=True,
                        help='Human Genome Annotation file')
    parser.add_argument('--outputPath', dest='outputPath', required=True,
                        help='Output path')
    parser.add_argument('--outputFile', dest='outputFile', required=True,
                        help='Output file')
    # Optional: when omitted, no row matches and every label is "other".
    parser.add_argument('--feature', dest='feature',
                        help='Feature (gene, exon)')

    args = parser.parse_args()

    list_rows = []
    # Read HGA csv file. newline="" lets the csv module handle line endings
    # (including newlines embedded in quoted fields) itself.
    with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8", newline="") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # One FASTA file per chromosome, selected by the row's seqname.
            filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
            sequence = get_sequence(filename, int(row['start']), int(row['end']))
            # Rows whose feature differs from the requested one are kept but
            # labelled "other".
            label = row['feature'] if row['feature'] == args.feature else "other"
            list_rows.append("{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label))

    # The output is only created after every row was processed successfully.
    with open(os.path.join(args.outputPath, args.outputFile), mode="w", encoding="utf-8") as oFile:
        oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
        oFile.writelines(list_rows)
1 +# Get training and test data set for deep learning from sequence data set
2 +# obtained from FASTA and HGA data sets (see script get-hga-sequences.py)
3 +
4 +# Input tab-separated format:
5 +# Sequences: hga-sequences.txt
6 +
7 +# Output one-hot encoding format:
8 +# Each sequence as a one-hot encoded matrix (one row per base, one column per symbol)
9 +
10 +# Run:
11 +# c:\Anaconda3\python get-hga-training-test.py
12 +# --inputFile hga-sequences.txt
13 +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
14 +# --outputTraining hga-sequences-training.txt
15 +# --outputTest hga-sequences-test.txt
16 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
17 +
18 +import argparse
19 +import pandas as pd
20 +import os
21 +from sklearn.preprocessing import LabelEncoder, OneHotEncoder
22 +import numpy as np
23 +from sklearn.model_selection import train_test_split
24 +
if __name__ == "__main__":
    # Command-line entry point: read the tab-separated sequence file, one-hot
    # encode sequences and labels, split them into training and test sets and
    # store each set in its own output file.
    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile', required=True,
                        help='Input file')
    parser.add_argument('--inputPath', dest='inputPath', required=True,
                        help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining', required=True,
                        help='Output training file')
    parser.add_argument('--outputValidation', dest='outputValidation',
                        help='Output validation file')
    parser.add_argument('--outputTest', dest='outputTest', required=True,
                        help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath', required=True,
                        help='Output path for training, validation, and testing')

    args = parser.parse_args()

    if args.outputValidation:
        # The option is accepted, but only a training/test split is produced.
        print("Note: --outputValidation is not used; no validation set is written.")

    # Read file with sequences (directory first, then file name).
    with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
    sequences = df['sequence']
    labels = df['label']

    # To one-hot encoding adapted from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder maps each symbol to an integer. It is fitted once on
    # the symbols of ALL sequences so that every sequence uses the same
    # symbol-to-column mapping, even if it lacks some symbol.
    integer_encoder = LabelEncoder()
    integer_encoder.fit(sorted(set("".join(sequences))))
    # The OneHotEncoder turns the integer codes into one column per symbol;
    # the categories are fixed for the same consistency reason.
    one_hot_encoder = OneHotEncoder(categories=[np.arange(len(integer_encoder.classes_))])

    # One-hot-encoding of sequences: one (length x symbols) matrix each.
    input_features = []
    for sequence in sequences:
        integer_encoded = integer_encoder.transform(list(sequence))
        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
        input_features.append(one_hot_encoded.toarray())

    # Print first sequence and one-hot-encoding
    np.set_printoptions(threshold=40)
    # np.stack raises ValueError unless all sequences have the same length.
    input_features = np.stack(input_features)
    print("Example sequence\n-----------------------")
    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
    print('One hot encoding of Sequence #1:\n', input_features[0].T)

    # One-hot-encoding of labels
    label_one_hot_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = label_one_hot_encoder.fit_transform(labels).toarray()

    # Print labels and one-hot-encoding
    print('Labels:\n', labels.T)
    print('One-hot encoded labels:\n', input_labels.T)

    # Split one-hot-encoding data into training, and test data sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)

    # Store each split as a compressed NumPy archive with the arrays
    # "features" and "labels". Writing through an open handle keeps the file
    # name exactly as given on the command line.
    splits = (
        (args.outputTraining, train_features, train_labels),
        (args.outputTest, test_features, test_labels),
    )
    for output_name, split_features, split_labels in splits:
        with open(os.path.join(args.outputPath, output_name), mode="wb") as oFile:
            np.savez_compressed(oFile, features=split_features, labels=split_labels)
86 +
87 +
88 +
1 -# Get source data set by combining Human Genome Annotation data set (csv)
2 -# with FASTA files to obtain sequences corresponding to object in human genome
3 -# using "start" and "end" columns from human-genome-annotation
4 -
5 -# Input files:
6 -# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
7 -
8 -# Output tab-separated format:
9 -# Start End Sequence Feature
10 -
11 -import argparse
12 -
if __name__ == "__main__":
    # Command-line entry point of the initial stub: parse the FASTA directory
    # option and echo it.
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    # --fastaPath takes a path value; the earlier store_const/sum/max setup
    # and the accumulate/integers attributes were never defined here.
    parser.add_argument('--fastaPath', dest='fastaPath',
                        help='Path for FASTA files')

    args = parser.parse_args()
    print(args.fastaPath)
21 -