Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

1 +# Get sequences by combining Human Genome Annotation data set (csv)
2 +# with FASTA files to obtain sequences corresponding to object in human genome
3 +# using "start" and "end" columns from human-genome-annotation
4 +
5 +# Install BioPython: conda install -c conda-forge biopython
6 +
7 +# Input files:
8 +# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
9 +
10 +# Output tab-separated format:
11 +# Start End Sequence Feature
12 +
13 +# Run:
14 +# c:\Anaconda3\python get-hga-data-set.py
15 +# --feature gene
16 +# --outputFile hga-sequences-toy.txt
17 +# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
18 +# --hgaFile some-rows-example-human-genome-annotation.csv
19 +# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
20 +# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
21 +# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences-toy.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
22 +
23 +# python3 get-hga-sequences.py
24 +# --feature gene
25 +# --outputFile hga-sequences-toy.txt
26 +# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
27 +# --hgaFile Homo_sapiens.GRCh38.92.csv
28 +# --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
29 +# --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
30 +# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
31 +
32 +import argparse
33 +# from Bio import SeqIO
34 +import csv
35 +import os
36 +from Bio.SeqIO.FastaIO import SimpleFastaParser
37 +
38 +
39 +def get_total_len(filename):
40 + count = 0
41 + total_len = 0
42 + with open(filename) as in_handle:
43 + for title, seq in SimpleFastaParser(in_handle):
44 + count += 1
45 + total_len += len(seq)
46 + retval = "{} records with total sequence length {}".format(count, total_len)
47 + return retval
48 +
49 +
50 +def get_sequence(filename, start, end):
51 + ret_sequence = ""
52 + with open(filename) as in_handle:
53 + for title, seq in SimpleFastaParser(in_handle):
54 + ret_sequence = seq[start:end + 1]
55 + return ret_sequence
56 +
57 +
58 +if __name__ == "__main__":
59 + parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
60 + parser.add_argument('--fastaPath', dest='fastaPath',
61 + help='Path for FASTA files')
62 + parser.add_argument('--hgaPath', dest='hgaPath',
63 + help='Path for Human Genome Annotation file')
64 + parser.add_argument('--hgaFile', dest='hgaFile',
65 + help='Human Genome Annotation file')
66 + parser.add_argument('--outputPath', dest='outputPath',
67 + help='Output path')
68 + parser.add_argument('--outputFile', dest='outputFile',
69 + help='Output file')
70 + parser.add_argument('--feature', dest='feature',
71 + help='Feature (gene, exon)')
72 +
73 + args = parser.parse_args()
74 +
75 + list_rows = []
76 + i = 0
77 + length = 0
78 + length_total_exon = 0
79 + length_total_utr = 0
80 + total_exon = 0
81 + total_utr = 0
82 + # Read HGA csv file
83 + with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
84 + reader = csv.DictReader(csvfile)
85 + for row in reader:
86 + # print(row)
87 + filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
88 + # We use only
89 + sequence = get_sequence(filename, int(row['start']), int(row['end']))
90 + # Features in HGA:
91 + # exon
92 + # feature
93 + # five_prime_utr
94 + # gene
95 + # Selenocysteine
96 + # start_codon
97 + # stop_codon
98 + # three_prime_utr
99 + length = int(row['end']) - int(row['start']) + 1
100 + if row['feature'] == "exon":
101 + label = row['feature']
102 + length_total_exon += length
103 + total_exon += 1
104 + elif row['feature'] in ["five_prime_utr", "three_prime_utr"]:
105 + label = "utr"
106 + length_total_utr += length
107 + total_utr += 1
108 + else:
109 + label = "other"
110 + new_row = "{}\t{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], length, sequence, label)
111 + list_rows.append(new_row)
112 + i += 1
113 + if (i % 100) == 0:
114 + print("{} rows processed.".format(i))
115 + if i == 10000:
116 + break
117 + print("Length media exons {} and utr {}".format(length_total_exon/total_exon, length_total_utr/total_utr))
118 + with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
119 + oFile.write("seqname\tstart\tend\tlength\tsequence\tlabel\n")
120 + for elem in list_rows:
121 + oFile.write(elem)
122 +
1 +# Get training and test data set for deep learning from sequence data set
2 +# obtained from FASTA and HGA data sets (see script get-hga-sequences.py)
3 +
4 +# Input tab-separated format:
5 +# Sequences: hga-sequences-toy.txt
6 +
7 +# Output one-hot encoding format:
8 +# Each sequence as a one-hot encoding WHAT array or matrix
9 +
10 +# Run:
11 +# python3 get-hga-training-test.py
12 +# --inputFile hga-sequences-toy.txt
13 +# --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
14 +# --outputTraining hga-sequences-training.txt
15 +# --outputTest hga-sequences-test.txt
16 +# --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
17 +# python get-hga-training-test.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
18 +
19 +import argparse
20 +import pandas as pd
21 +import os
22 +from sklearn.preprocessing import LabelEncoder, OneHotEncoder
23 +import numpy as np
24 +from sklearn.model_selection import train_test_split
25 +from tensorflow.keras.layers import Conv1D, Dense, MaxPooling1D, Flatten
26 +from tensorflow.keras.models import Sequential
27 +import matplotlib.pyplot as plt
28 +from sklearn.metrics import confusion_matrix
29 +import itertools
30 +
31 +if __name__ == "__main__":
32 + parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
33 + parser.add_argument('--inputFile', dest='inputFile',
34 + help='Input file')
35 + parser.add_argument('--inputPath', dest='inputPath',
36 + help='Input path')
37 + parser.add_argument('--outputTraining', dest='outputTraining',
38 + help='Output training file')
39 + parser.add_argument('--outputValidation', dest='outputValidation',
40 + help='Output training file')
41 + parser.add_argument('--outputTest', dest='outputTest',
42 + help='Output test file')
43 + parser.add_argument('--outputPath', dest='outputPath',
44 + help='Output path for training, validation, and testing')
45 +
46 + args = parser.parse_args()
47 +
48 + # To one-hot encoding taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
49 + # The LabelEncoder encodes a sequence of bases as a sequence of integers.
50 + integer_encoder = LabelEncoder()
51 + # The OneHotEncoder converts an array of integers to a sparse matrix where
52 + # each row corresponds to one possible value of each feature.
53 + one_hot_encoder = OneHotEncoder(categories='auto')
54 + input_features = []
55 + sequences = []
56 +
57 + # Read file with sequences
58 + with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
59 + df = pd.read_csv(tabfile, delimiter='\t')
60 + print("df: {}".format(df))
61 + sequences = df['sequence']
62 + labels = df['label']
63 +
64 + max_exon_length = 0
65 + max_utr_length = 0
66 + # One-hot-encoding of sequences
67 + for sequence, label in zip(sequences, labels):
68 + if label == "exon":
69 + if len(sequence) > max_exon_length:
70 + max_exon_length = len(sequence)
71 + elif label == "utr":
72 + if len(sequence) > max_utr_length:
73 + max_utr_length = len(sequence)
74 + '''
75 + integer_encoded = integer_encoder.fit_transform(list(sequence))
76 + integer_encoded = np.array(integer_encoded).reshape(-1, 1)
77 + one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
78 + input_features.append(one_hot_encoded.toarray())
79 + '''
80 +
81 + print("Max exon length: {}".format(max_exon_length))
82 + print("Max utr length: {}".format(max_utr_length))
83 +
84 + exit()
85 +
86 + # Print first sequence and one-hot-encoding
87 + np.set_printoptions(threshold=40)
88 + input_features = np.stack(input_features)
89 + print("Example sequence\n-----------------------")
90 + print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
91 + print('One hot encoding of Sequence #1:\n', input_features[0].T)
92 +
93 + # One-hot-encoding of labels
94 + one_hot_encoder = OneHotEncoder(categories='auto')
95 + labels = np.array(labels).reshape(-1, 1)
96 + input_labels = one_hot_encoder.fit_transform(labels).toarray()
97 +
98 + # Print labels and one-hot-encoding
99 + print('Labels:\n', labels.T)
100 + print('One-hot encoded labels:\n', input_labels.T)
101 +
102 + # Split one-hot-encoding data into training, and test data sets
103 + train_features, test_features, train_labels, test_labels = train_test_split(
104 + input_features, input_labels, test_size=0.25, random_state=42)
105 +
106 + # Model definition
107 + model = Sequential()
108 + model.add(Conv1D(filters=32, kernel_size=12,
109 + input_shape=(train_features.shape[1], 4)))
110 + model.add(MaxPooling1D(pool_size=4))
111 + model.add(Flatten())
112 + model.add(Dense(16, activation='relu'))
113 + model.add(Dense(2, activation='softmax'))
114 +
115 + model.compile(loss='binary_crossentropy', optimizer='adam',
116 + metrics=['binary_accuracy'])
117 + model.summary()
118 +
119 + # Model training and validation
120 + history = model.fit(train_features, train_labels,
121 + epochs=50, verbose=0, validation_split=0.25)
122 +
123 + # Plot training-validation loss
124 + plt.figure()
125 + plt.plot(history.history['loss'])
126 + plt.plot(history.history['val_loss'])
127 + plt.title('model loss')
128 + plt.ylabel('loss')
129 + plt.xlabel('epoch')
130 + plt.legend(['train', 'validation'])
131 + # plt.show()
132 + plt.savefig('training-validation-loss.png')
133 +
134 + # Plot training-validation accuracy
135 + plt.figure()
136 + plt.plot(history.history['binary_accuracy'])
137 + plt.plot(history.history['val_binary_accuracy'])
138 + plt.title('model accuracy')
139 + plt.ylabel('accuracy')
140 + plt.xlabel('epoch')
141 + plt.legend(['train', 'validation'])
142 + # plt.show()
143 + plt.savefig('training-validation-binary-accuracy.png')
144 +
145 + # Predict with rest data set
146 + predicted_labels = model.predict(np.stack(test_features))
147 + # Print confusion matrix
148 + cm = confusion_matrix(np.argmax(test_labels, axis=1),
149 + np.argmax(predicted_labels, axis=1))
150 + print('Confusion matrix:\n', cm)
151 + cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
152 +
153 + # Plot confusion matrix
154 + plt.imshow(cm, cmap=plt.cm.Blues)
155 + plt.title('Normalized confusion matrix')
156 + plt.colorbar()
157 + plt.xlabel('True label')
158 + plt.ylabel('Predicted label')
159 + plt.xticks([0, 1]);
160 + plt.yticks([0, 1])
161 + plt.grid('off')
162 + for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
163 + plt.text(j, i, format(cm[i, j], '.2f'),
164 + horizontalalignment='center',
165 + color='white' if cm[i, j] > 0.5 else 'black')
166 + plt.savefig('training-validation-confusion-matrix.png')
167 +
168 +
169 +
170 +
171 +
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
14 # --outputTraining hga-sequences-training.txt 14 # --outputTraining hga-sequences-training.txt
15 # --outputTest hga-sequences-test.txt 15 # --outputTest hga-sequences-test.txt
16 # --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 16 # --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
17 -# python3 get-hga-training-test.py --inputFile hga-sequences-toy.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation 17 +# python get-hga-training-test.py --inputFile hga-sequences-1000.txt --inputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --outputTraining hga-sequences-training.txt --outputTest hga-sequences-test.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
18 18
19 import argparse 19 import argparse
20 import pandas as pd 20 import pandas as pd
......