Showing
3 changed files
with
170 additions
and
21 deletions
data-sets/get-hga-sequences.py
0 → 100644
1 | +# Get sequences by combining Human Genome Annotation data set (csv) | ||
2 | +# with FASTA files to obtain sequences corresponding to object in human genome | ||
3 | +# using "start" and "end" columns from human-genome-annotation | ||
4 | + | ||
5 | +# Install BioPython: conda install -c conda-forge biopython | ||
6 | + | ||
7 | +# Input files: | ||
8 | +# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna | ||
9 | + | ||
10 | +# Output tab-separated format: | ||
11 | +# Start End Sequence Feature | ||
12 | + | ||
13 | +# Run: | ||
# c:\Anaconda3\python get-hga-sequences.py
15 | +# --feature gene | ||
16 | +# --outputFile hga-sequences.txt | ||
17 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation | ||
18 | +# --hgaFile some-rows-example-human-genome-annotation.csv | ||
19 | +# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation | ||
20 | +# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files | ||
# c:\Anaconda3\python get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
22 | + | ||
23 | +import argparse | ||
24 | +# from Bio import SeqIO | ||
25 | +import csv | ||
26 | +import os | ||
27 | +from Bio.SeqIO.FastaIO import SimpleFastaParser | ||
28 | + | ||
29 | + | ||
def get_total_len(filename):
    """Summarize a FASTA file.

    :param filename: path to a FASTA file readable by SimpleFastaParser
    :return: human-readable string with the record count and the combined
        length of all sequences in the file
    """
    num_records = 0
    length_sum = 0
    with open(filename) as handle:
        for _title, sequence in SimpleFastaParser(handle):
            num_records += 1
            length_sum += len(sequence)
    return "{} records with total sequence length {}".format(num_records, length_sum)
39 | + | ||
def get_sequence(filename, start, end):
    """Return the subsequence [start, end] of the first record in a FASTA file.

    Coordinates are treated as 1-based and inclusive, the GTF/GFF convention
    used by genome-annotation tables — NOTE(review): confirm the HGA CSV uses
    1-based coordinates; the original slice ``seq[start:end+1]`` returned a
    window shifted one base to the right under that convention.

    :param filename: path to a FASTA file (expected to hold one chromosome)
    :param start: 1-based start coordinate, inclusive
    :param end: 1-based end coordinate, inclusive
    :return: the requested subsequence, or "" when the file has no records
    """
    with open(filename) as in_handle:
        for _title, seq in SimpleFastaParser(in_handle):
            # Chromosome FASTA files contain a single record; return at once.
            # The original kept looping, so a multi-record file would have
            # silently returned the slice of the *last* record.
            return seq[start - 1:end]
    return ""
46 | + | ||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    parser.add_argument('--fastaPath', dest='fastaPath',
                        help='Path for FASTA files')
    parser.add_argument('--hgaPath', dest='hgaPath',
                        help='Path for Human Genome Annotation file')
    parser.add_argument('--hgaFile', dest='hgaFile',
                        help='Human Genome Annotation file')
    parser.add_argument('--outputPath', dest='outputPath',
                        help='Output path')
    parser.add_argument('--outputFile', dest='outputFile',
                        help='Output file')
    parser.add_argument('--feature', dest='feature',
                        help='Feature (gene, exon)')

    args = parser.parse_args()

    list_rows = []
    # Read the Human Genome Annotation CSV and fetch the genomic sequence
    # for each annotated region from the matching chromosome FASTA file.
    with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # NOTE(review): the whole chromosome FASTA is re-parsed for every
            # row; caching the sequence per chromosome would make this much
            # faster for real annotation files.
            filename = os.path.join(
                args.fastaPath,
                "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
            sequence = get_sequence(filename, int(row['start']), int(row['end']))
            # Rows whose feature differs from the requested one are kept but
            # labeled "other", yielding a binary classification data set.
            label = row['feature'] if row['feature'] == args.feature else "other"
            list_rows.append("{}\t{}\t{}\t{}\t{}\n".format(
                row['seqname'], row['start'], row['end'], sequence, label))

    # Write the tab-separated output. utf-8 is set explicitly for consistency
    # with the input file (the original relied on the platform default here).
    with open(os.path.join(args.outputPath, args.outputFile), mode="w", encoding="utf-8") as oFile:
        oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
        oFile.writelines(list_rows)
data-sets/get-hga-training-test.py
0 → 100644
1 | +# Get training and test data set for deep learning from sequence data set | ||
2 | +# obtained from FASTA and HGA data sets (see script get-hga-sequences.py) | ||
3 | + | ||
4 | +# Input tab-separated format: | ||
5 | +# Sequences: hga-sequences.txt | ||
6 | + | ||
7 | +# Output one-hot encoding format: | ||
# Each sequence is represented as a one-hot-encoded matrix (one row per base)
9 | + | ||
10 | +# Run: | ||
11 | +# c:\Anaconda3\python get-hga-training-test.py | ||
12 | +# --inputFile hga-sequences.txt | ||
13 | +# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation | ||
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
16 | +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation | ||
17 | + | ||
18 | +import argparse | ||
19 | +import pandas as pd | ||
20 | +import os | ||
21 | +from sklearn.preprocessing import LabelEncoder, OneHotEncoder | ||
22 | +import numpy as np | ||
23 | +from sklearn.model_selection import train_test_split | ||
24 | + | ||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile',
                        help='Input file')
    parser.add_argument('--inputPath', dest='inputPath',
                        help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining',
                        help='Output training file')
    parser.add_argument('--outputValidation', dest='outputValidation',
                        # Original help text said "Output training file" (copy-paste).
                        help='Output validation file')
    parser.add_argument('--outputTest', dest='outputTest',
                        help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath',
                        help='Output path for training, validation, and testing')

    args = parser.parse_args()

    # One-hot encoding taken from:
    # https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
    integer_encoder = LabelEncoder()
    # The OneHotEncoder converts an array of integers to a sparse matrix where
    # each row corresponds to one possible value of each feature.
    one_hot_encoder = OneHotEncoder(categories='auto')
    input_features = []

    # Read the tab-separated sequence file. Fixed: the original joined the
    # path components in the wrong order (inputFile first, then inputPath).
    with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
    sequences = df['sequence']
    labels = df['label']

    # One-hot encode each sequence.
    # NOTE(review): np.stack below requires all sequences to share the same
    # length — confirm the upstream extraction guarantees this.
    for sequence in sequences:
        integer_encoded = integer_encoder.fit_transform(list(sequence))
        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
        # Fixed: the original computed the encoding but never appended it,
        # leaving input_features empty and making np.stack() fail below.
        input_features.append(one_hot_encoded.toarray())

    # Print the first sequence and its one-hot encoding.
    np.set_printoptions(threshold=40)
    input_features = np.stack(input_features)
    print("Example sequence\n-----------------------")
    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
    print('One hot encoding of Sequence #1:\n', input_features[0].T)

    # One-hot encode the labels.
    one_hot_encoder = OneHotEncoder(categories='auto')
    labels = np.array(labels).reshape(-1, 1)
    input_labels = one_hot_encoder.fit_transform(labels).toarray()

    # Print labels and their one-hot encoding.
    print('Labels:\n', labels.T)
    print('One-hot encoded labels:\n', input_labels.T)

    # Split the one-hot-encoded data into training and test sets.
    # NOTE(review): --outputValidation is accepted but no validation split is
    # produced yet.
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)

    # Persist the splits. Fixed: the original trailing block referenced
    # undefined names (args.outputFile, list_rows) and would have raised
    # AttributeError/NameError. Each split is saved as an .npz archive under
    # the user-supplied file name (file handles are passed so NumPy does not
    # append an extra .npz extension).
    with open(os.path.join(args.outputPath, args.outputTraining), mode="wb") as oFile:
        np.savez(oFile, features=train_features, labels=train_labels)
    with open(os.path.join(args.outputPath, args.outputTest), mode="wb") as oFile:
        np.savez(oFile, features=test_features, labels=test_labels)
1 | -# Get source data set by combining Human Genome Annotation data set (csv) | ||
2 | -# with FASTA files to obtain sequences corresponding to object in human genome | ||
3 | -# using "start" and "end" columns from human-genome-annotation | ||
4 | - | ||
5 | -# Input files: | ||
6 | -# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna | ||
7 | - | ||
8 | -# Output tab-separated format: | ||
9 | -# Start End Sequence Feature | ||
10 | - | ||
11 | -import argparse | ||
12 | - | ||
13 | -if __name__ == "__main__": | ||
14 | - parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.') | ||
15 | - parser.add_argument('--fastaPath', dest='fastaPath', action='store_const', | ||
16 | - const=sum, default=max, | ||
17 | - help='sum the integers (default: find the max)') | ||
18 | - | ||
19 | - args = parser.parse_args() | ||
20 | - print(args.accumulate(args.integers)) | ||
21 | - |
-
Please register or login to post a comment