Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

# Get sequences by combining Human Genome Annotation data set (csv)
# with FASTA files to obtain sequences corresponding to objects in the human genome
# using the "start" and "end" columns from human-genome-annotation
# Install BioPython: conda install -c conda-forge biopython
# Input files:
# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# Output tab-separated format:
# seqname	start	end	sequence	label
# Run:
# c:\Anaconda3\python get-hga-data-set.py
# --feature gene
# --outputFile hga-sequences.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --hgaFile some-rows-example-human-genome-annotation.csv
# --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
# c:\Anaconda3\python get-hga-data-set.py --feature gene --outputFile hga-sequences.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --hgaFile some-rows-example-human-genome-annotation.csv --hgaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation --fastaPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\fasta-files
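# Example of the expected output rows (a sketch only; the coordinates and sequence
# fragments below are illustrative, not real annotation data):
# seqname	start	end	sequence	label
# 1	1000	2200	ACGTTAGC...GATTACA	gene
# 1	2500	2600	TTGCATGC...CCGGTAA	other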
import argparse
# from Bio import SeqIO
import csv
import os
from Bio.SeqIO.FastaIO import SimpleFastaParser

def get_total_len(filename):
    # Count the records in a FASTA file and their total sequence length
    count = 0
    total_len = 0
    with open(filename) as in_handle:
        for title, seq in SimpleFastaParser(in_handle):
            count += 1
            total_len += len(seq)
    retval = "{} records with total sequence length {}".format(count, total_len)
    return retval
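
# A quick usage sketch for get_total_len (not executed here; the filename is
# hypothetical): it can be used to sanity-check a chromosome FASTA before slicing.
#   print(get_total_len("Homo_sapiens.GRCh38.dna.chromosome.21.fa"))
#   # e.g. "1 records with total sequence length ..."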

def get_sequence(filename, start, end):
    # Return the slice [start, end] (end-inclusive) of the sequence in a FASTA file.
    # Each chromosome FASTA file is assumed to contain a single record.
    ret_sequence = ""
    with open(filename) as in_handle:
        for title, seq in SimpleFastaParser(in_handle):
            ret_sequence = seq[start:end+1]
    return ret_sequence
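
# Worked example of the slicing convention used by get_sequence (illustrative):
# for seq = "ACGTACGT", start = 2 and end = 4 give seq[2:5] == "GTA", i.e. start and
# end are treated as 0-based and end-inclusive. If the annotation coordinates are
# 1-based and end-inclusive (as in GFF/GTF), the equivalent slice would be
# seq[start-1:end].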

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    parser.add_argument('--fastaPath', dest='fastaPath',
                        help='Path for FASTA files')
    parser.add_argument('--hgaPath', dest='hgaPath',
                        help='Path for Human Genome Annotation file')
    parser.add_argument('--hgaFile', dest='hgaFile',
                        help='Human Genome Annotation file')
    parser.add_argument('--outputPath', dest='outputPath',
                        help='Output path')
    parser.add_argument('--outputFile', dest='outputFile',
                        help='Output file')
    parser.add_argument('--feature', dest='feature',
                        help='Feature (gene, exon)')
    args = parser.parse_args()

    list_rows = []
    # Read HGA csv file
    with open(os.path.join(args.hgaPath, args.hgaFile), mode="r", encoding="utf-8") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            # print(row)
            filename = os.path.join(args.fastaPath, "Homo_sapiens.GRCh38.dna.chromosome.{}.fa".format(row['seqname']))
            sequence = get_sequence(filename, int(row['start']), int(row['end']))
            if row['feature'] == args.feature:
                label = row['feature']
            else:
                label = "other"
            new_row = "{}\t{}\t{}\t{}\t{}\n".format(row['seqname'], row['start'], row['end'], sequence, label)
            list_rows.append(new_row)
    with open(os.path.join(args.outputPath, args.outputFile), mode="w") as oFile:
        oFile.write("seqname\tstart\tend\tsequence\tlabel\n")
        for elem in list_rows:
            oFile.write(elem)

# Get training and test data set for deep learning from sequence data set
# obtained from FASTA and HGA data sets (see script get-hga-data-set.py)
# Input tab-separated format:
# Sequences: hga-sequences.txt
# Output one-hot encoding format:
# Each sequence encoded as a one-hot matrix (sequence length x number of bases)
# Run:
# c:\Anaconda3\python get-hga-training-test.py
# --inputFile hga-sequences.txt
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
# --outputTraining hga-sequences-training.txt
# --outputTest hga-sequences-test.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\DEEP_LEARNING\gitlab-deep-learning-workshop\data-sets\human-genome-annotation
import argparse
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
from sklearn.model_selection import train_test_split
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get training and test data sets for Human Genome Annotation.')
    parser.add_argument('--inputFile', dest='inputFile',
                        help='Input file')
    parser.add_argument('--inputPath', dest='inputPath',
                        help='Input path')
    parser.add_argument('--outputTraining', dest='outputTraining',
                        help='Output training file')
    parser.add_argument('--outputValidation', dest='outputValidation',
                        help='Output validation file')
    parser.add_argument('--outputTest', dest='outputTest',
                        help='Output test file')
    parser.add_argument('--outputPath', dest='outputPath',
                        help='Output path for training, validation, and testing')
    args = parser.parse_args()

    # One-hot encoding approach taken from: https://colab.research.google.com/drive/17E4h5aAOioh5DiTo7MZg4hpL6Z_0FyWr#scrollTo=IPJD6PuDnaS6
    # The LabelEncoder encodes a sequence of bases as a sequence of integers.
    integer_encoder = LabelEncoder()
    # The OneHotEncoder converts an array of integers to a sparse matrix where
    # each row corresponds to one possible value of each feature.
    one_hot_encoder = OneHotEncoder(categories='auto')
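    # Illustrative sketch (assumes sequences contain only the bases A, C, G, T): for a
    # short string such as "ACGT", integer_encoder.fit_transform(list("ACGT")) returns
    # [0, 1, 2, 3] (classes sorted alphabetically), and one_hot_encoder then maps each
    # integer to a 4-dimensional indicator vector, so a sequence of length L becomes an
    # L x 4 matrix. Sequences containing other symbols (e.g. 'N') would add extra
    # columns, and all sequences must have the same length for np.stack below to work.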
    input_features = []
    sequences = []
    # Read file with sequences
    with open(os.path.join(args.inputPath, args.inputFile), mode="r", encoding="utf-8") as tabfile:
        df = pd.read_csv(tabfile, delimiter='\t')
        sequences = df['sequence']
        labels = df['label']
    # One-hot-encoding of sequences
    for sequence in sequences:
        integer_encoded = integer_encoder.fit_transform(list(sequence))
        integer_encoded = np.array(integer_encoded).reshape(-1, 1)
        one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
        # Collect the dense one-hot matrix for this sequence
        input_features.append(one_hot_encoded.toarray())
    # Print first sequence and one-hot-encoding
    np.set_printoptions(threshold=40)
    input_features = np.stack(input_features)
print("Example sequence\n-----------------------")
print('DNA Sequence #1:\n', sequences[0][:10], '...', sequences[0][-10:])
print('One hot encoding of Sequence #1:\n', input_features[0].T)
# One-hot-encoding of labels
one_hot_encoder = OneHotEncoder(categories='auto')
labels = np.array(labels).reshape(-1, 1)
input_labels = one_hot_encoder.fit_transform(labels).toarray()
# Print labels and one-hot-encoding
print('Labels:\n', labels.T)
print('One-hot encoded labels:\n', input_labels.T)
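    # Illustrative note (assuming the only labels present are 'gene' and 'other'):
    # OneHotEncoder sorts the categories, so 'gene' is encoded as [1., 0.] and
    # 'other' as [0., 1.].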
    # Split the one-hot encoded data into training and test data sets
    train_features, test_features, train_labels, test_labels = train_test_split(
        input_features, input_labels, test_size=0.25, random_state=42)
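    # The --outputValidation argument is declared above but no validation split is
    # produced here. One possible approach (a sketch, not executed) would be a second
    # split of the training portion, e.g.:
    #   train_features, val_features, train_labels, val_labels = train_test_split(
    #       train_features, train_labels, test_size=0.25, random_state=42)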
    # Persist the splits. NumPy .npz archives are used here because the one-hot
    # features are multi-dimensional arrays (one possible choice, not the only one).
    np.savez(os.path.join(args.outputPath, args.outputTraining),
             features=train_features, labels=train_labels)
    np.savez(os.path.join(args.outputPath, args.outputTest),
             features=test_features, labels=test_labels)

# Get source data set by combining Human Genome Annotation data set (csv)
# with FASTA files to obtain sequences corresponding to objects in the human genome
# using the "start" and "end" columns from human-genome-annotation
# Input files:
# FASTA all chromosomes: /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# Output tab-separated format:
# Start End Sequence Feature
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Get source data set for Human Genome Annotation.')
    parser.add_argument('--fastaPath', dest='fastaPath',
                        help='Path for FASTA files')
    args = parser.parse_args()
    print(args.fastaPath)