Showing 3 changed files with 294 additions and 1 deletion
data-sets/get-hga-sequences-py27.py
0 → 100644
+# Get sequences by combining the Human Genome Annotation data set (csv)
+# with FASTA files to obtain the sequence corresponding to each annotated object
+# in the human genome, using the "start" and "end" columns from human-genome-annotation
+
+# Install BioPython: conda install -c conda-forge biopython
+
+# Input files:
+# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+
+# Output tab-separated format (matching the header written below):
+# seqname	start	end	length	sequence	label
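+# For example (illustrative row, sequence elided; coordinates are those of the
+# first annotated gene on chromosome 1 in the Ensembl annotation):
+# 1	11869	14409	2541	<2541-base sequence>	other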
+
+# Run:
+# c:\Anaconda3\python get-hga-data-set.py
+#   --feature gene
+#   --outputFile hga-sequences-toy.txt
+#   --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
+#   --hgaFile some-rows-example-human-genome-annotation.csv
+#   --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
+#   --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
+# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences-toy.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
+
+# python3 get-hga-sequences.py
+#   --feature gene
+#   --outputFile hga-sequences-toy.txt
+#   --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+#   --hgaFile Homo_sapiens.GRCh38.92.csv
+#   --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+#   --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
+
+import argparse
+# from Bio import SeqIO
+import csv
+import os
+from Bio.SeqIO.FastaIO import SimpleFastaParser
+
+
+# Diagnostic helper: count the records in a FASTA file and report their total length.
+def get_total_len(filename):
+    count = 0
+    total_len = 0
+    with open(filename) as in_handle:
+        for title, seq in SimpleFastaParser(in_handle):
+            count += 1
+            total_len += len(seq)
+    retval = "{} records with total sequence length {}".format(count, total_len)
+    return retval
+
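+# For instance, with a standard Ensembl per-chromosome FASTA (one record per
+# file), get_total_len("Homo_sapiens.GRCh38.dna.chromosome.21.fa") returns
+# something like "1 records with total sequence length 46709983".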
+
+def get_sequence(filename, start, end):
+    # Return the subsequence [start, end] from the single record in a
+    # chromosome FASTA file. The annotation coordinates follow the Ensembl
+    # convention (1-based, inclusive), so they are shifted by one for
+    # Python's 0-based, end-exclusive slicing.
+    ret_sequence = ""
+    with open(filename) as in_handle:
+        for title, seq in SimpleFastaParser(in_handle):
+            ret_sequence = seq[start - 1:end]
+    return ret_sequence
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
+    parser.add_argument('--fastaPath', dest='fastaPath',
+                        help='Path for FASTA files')
+    parser.add_argument('--hgaPath', dest='hgaPath',
+                        help='Path for Human Genome Annotation file')
+    parser.add_argument('--hgaFile', dest='hgaFile',
+                        help='Human Genome Annotation file')
+    parser.add_argument('--outputPath', dest='outputPath',
+                        help='Output path')
+    parser.add_argument('--outputFile', dest='outputFile',
+                        help='Output file')
+    parser.add_argument('--feature', dest='feature',
+                        help='Feature (gene, exon)')
+
+    args = parser.parse_args()
+
+    list_rows = []
+    i = 0
+    length = 0
+    length_total_exon = 0
+    length_total_utr = 0
+    total_exon = 0
+    total_utr = 0
+    # Read HGA csv file
+    with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
+        reader = csv.DictReader(csvfile)
+        for row in reader:
+            # print(row)
+            filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
+            # Use only the "start" and "end" columns to extract the subsequence
+            sequence = get_sequence(filename, int(row['start']), int(row['end']))
+            # Features in HGA:
+            # exon
+            # feature
+            # five_prime_utr
+            # gene
+            # Selenocysteine
+            # start_codon
+            # stop_codon
+            # three_prime_utr
+            length = int(row['end']) - int(row['start']) + 1
+            if row['feature'] == "exon":
+                label = row['feature']
+                length_total_exon += length
+                total_exon += 1
+            elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
+                label = "utr"
+                length_total_utr += length
+                total_utr += 1
+            else:
+                label = "other"
+            new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
+            list_rows.append(new_row)
+            i += 1
+            if (i % 100) == 0:
+                print("{} rows processed.".format(i))
+            # Process at most 10,000 rows
+            if i == 10000:
+                break
117 | + print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr)) | ||
+    with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
+        oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
+        for elem in list_rows:
+            oFile.write(elem)
+
data-sets/get-hga-training-test-py27.py
0 → 100644
+# Get training and test data sets for deep learning from the sequence data set
+# obtained from the FASTA and HGA data sets (see script get-hga-sequences.py)
+
+# Input tab-separated format:
+# Sequences: hga-sequences-toy.txt
+
+# Output one-hot encoding format:
+# Each sequence encoded as a one-hot matrix, with one row per base and one
+# column per nucleotide
+
+# Run:
+# python3 get-hga-training-test.py
+#   --inputFile hga-sequences-toy.txt
+#   --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+#   --outputTraining hga-sequences-training.txt
+#   --outputTest hga-sequences-test.txt
+#   --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# python get-hga-training-test.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+
+import argparse
+import pandas as pd
+import os
+from sklearn.preprocessing import LabelEncoder, OneHotEncoder
+import numpy as np
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
+from tensorflow.keras.models import Sequential
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+import itertools
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
+    parser.add_argument('--inputFile', dest='inputFile',
+                        help='Input file')
+    parser.add_argument('--inputPath', dest='inputPath',
+                        help='Input path')
+    parser.add_argument('--outputTraining', dest='outputTraining',
+                        help='Output training file')
+    parser.add_argument('--outputValidation', dest='outputValidation',
+                        help='Output validation file')
+    parser.add_argument('--outputTest', dest='outputTest',
+                        help='Output test file')
+    parser.add_argument('--outputPath', dest='outputPath',
+                        help='Output path for training, validation, and testing')
+
+    args = parser.parse_args()
+
+    # One-hot encoding approach taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
+    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
+    integer_encoder = LabelEncoder()
+    # The OneHotEncoder converts an array of integers to a sparse matrix where
+    # each row corresponds to one possible value of each feature.
+    one_hot_encoder = OneHotEncoder(categories='auto')
+    input_features = []
+    sequences = []
+
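+    # Illustrative sketch (not executed by this script): for the toy sequence
+    # "ACGT", LabelEncoder maps the bases alphabetically to integers, and
+    # OneHotEncoder expands each integer into a 4-dimensional indicator vector:
+    #   integer_encoder.fit_transform(list("ACGT"))  ->  [0, 1, 2, 3]
+    #   one_hot_encoder.fit_transform([[0], [1], [2], [3]]).toarray()  ->
+    #   [[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]
+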
+    # Read file with sequences
+    with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
+        df = pd.read_csv(tabfile, delimiter='\t')
+        print("df: {}".format(df))
+        sequences = df['sequence']
+        labels = df['label']
+
+    max_exon_length = 0
+    max_utr_length = 0
+    # Scan sequences for the maximum exon and UTR lengths
+    # (the one-hot encoding below is temporarily disabled)
+    for sequence, label in zip(sequences, labels):
+        if label == "exon":
+            if len(sequence) > max_exon_length:
+                max_exon_length = len(sequence)
+        elif label == "utr":
+            if len(sequence) > max_utr_length:
+                max_utr_length = len(sequence)
+        '''
+        integer_encoded = integer_encoder.fit_transform(list(sequence))
+        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
+        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
+        input_features.append(one_hot_encoded.toarray())
+        '''
+
+    print("Max exon length: {}".format(max_exon_length))
+    print("Max utr length: {}".format(max_utr_length))
+
+    # Early exit while exploring sequence lengths; the code below is not reached yet.
+    exit()
+
+    # Print first sequence and one-hot encoding
+    np.set_printoptions(threshold=40)
+    input_features = np.stack(input_features)
+    print("Example sequence\n-----------------------")
+    print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
+    print('One hot encoding of Sequence #1:\n', input_features[0].T)
+
+    # One-hot encoding of labels
+    one_hot_encoder = OneHotEncoder(categories='auto')
+    labels = np.array(labels).reshape(-1, 1)
+    input_labels = one_hot_encoder.fit_transform(labels).toarray()
+
+    # Print labels and one-hot encoding
+    print('Labels:\n', labels.T)
+    print('One-hot encoded labels:\n', input_labels.T)
+
+    # Split the one-hot encoded data into training and test sets
+    train_features, test_features, train_labels, test_labels = train_test_split(
+        input_features, input_labels, test_size=0.25, random_state=42)
+
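+    # Note: model.fit below holds out a further 25% of the training portion for
+    # validation via validation_split, so the overall split is roughly
+    # 56% train / 19% validation / 25% test.
+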
+    # Model definition
+    model = Sequential()
+    model.add(Conv1D(filters=32, kernel_size=12,
+                     input_shape=(train_features.shape[1], 4)))
+    model.add(MaxPooling1D(pool_size=4))
+    model.add(Flatten())
+    model.add(Dense(16, activation='relu'))
+    model.add(Dense(2, activation='softmax'))
+
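+    # Shape sketch (assuming one-hot inputs of length L and depth 4):
+    #   Conv1D(32, kernel_size=12), no padding: (L, 4) -> (L - 11, 32)
+    #   MaxPooling1D(4):                        -> ((L - 11) // 4, 32)
+    #   Flatten -> Dense(16, relu) -> Dense(2, softmax) class probabilities
+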
+    model.compile(loss='binary_crossentropy', optimizer='adam',
+                  metrics=['binary_accuracy'])
+    model.summary()
+
+    # Model training and validation
+    history = model.fit(train_features, train_labels,
+                        epochs=50, verbose=0, validation_split=0.25)
+
+    # Plot training-validation loss
+    plt.figure()
+    plt.plot(history.history['loss'])
+    plt.plot(history.history['val_loss'])
+    plt.title('model loss')
+    plt.ylabel('loss')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'validation'])
+    # plt.show()
+    plt.savefig('training-validation-loss.png')
+
+    # Plot training-validation accuracy
+    plt.figure()
+    plt.plot(history.history['binary_accuracy'])
+    plt.plot(history.history['val_binary_accuracy'])
+    plt.title('model accuracy')
+    plt.ylabel('accuracy')
+    plt.xlabel('epoch')
+    plt.legend(['train', 'validation'])
+    # plt.show()
+    plt.savefig('training-validation-binary-accuracy.png')
+
+    # Predict on the test data set
+    predicted_labels = model.predict(np.stack(test_features))
+    # Print confusion matrix
+    cm = confusion_matrix(np.argmax(test_labels, axis=1),
+                          np.argmax(predicted_labels, axis=1))
+    print('Confusion matrix:\n', cm)
+    # Normalize each row (true class) to proportions
+    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
+
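+    # For example, hypothetical counts cm = [[50, 10], [5, 35]] row-normalize to
+    # [[0.83, 0.17], [0.125, 0.875]]: each entry divided by its row total.
+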
+    # Plot confusion matrix
+    plt.imshow(cm, cmap=plt.cm.Blues)
+    plt.title('Normalized confusion matrix')
+    plt.colorbar()
+    # Rows of cm are true labels, columns are predicted labels
+    plt.xlabel('Predicted label')
+    plt.ylabel('True label')
+    plt.xticks([0, 1])
+    plt.yticks([0, 1])
+    plt.grid(False)
+    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
+        plt.text(j, i, format(cm[i, j], '.2f'),
+                 horizontalalignment='center',
+                 color='white' if cm[i, j] > 0.5 else 'black')
+    plt.savefig('training-validation-confusion-matrix.png')
+
@@ -14,7 +14,7 @@
 # --outputTraining hga-sequences-training.txt
 # --outputTest hga-sequences-test.txt
 # --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
-# python3 get-hga-training-test.py --inputFile hga-sequences-toy.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
+# python get-hga-training-test.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
 
 import argparse
 import pandas as pd