Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

......@@ -27,7 +27,7 @@
# --hgaFile Homo_sapiens.GRCh38.92.csv
# --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation
# --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences-toy.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
# python3 get-hga-sequences.py --feature gene --outputFile hga-sequences.txt --outputPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --hgaFile Homo_sapiens.GRCh38.92.csv --hgaPath /home/cmendezc/gitlab-deep-learning-workshop/data-sets/human-genome-annotation --fastaPath /home/cmendezc/data-FASTA-Homo_sapiens.GRCh38.dna
import argparse
# from Bio import SeqIO
......
......@@ -60,12 +60,28 @@ if __name__ == "__main__":
print("df: {}".format(df))
sequences = df['sequence']
labels = df['label']
max_exon_length = 0
max_utr_length = 0
# Find the longest exon and utr sequence lengths (the one-hot-encoding of sequences below is currently disabled)
for sequence in sequences:
for sequence, label in zip(sequences, labels):
if label == "exon":
if len(sequence) > max_exon_length:
max_exon_length = len(sequence)
elif label == "utr":
if len(sequence) > max_utr_length:
max_utr_length = len(sequence)
'''
integer_encoded = integer_encoder.fit_transform(list(sequence))
integer_encoded = np.array(integer_encoded).reshape(-1, 1)
one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
input_features.append(one_hot_encoded.toarray())
'''
print("Max exon length: {}".format(max_exon_length))
print("Max utr length: {}".format(max_utr_length))
exit()
# Print first sequence and one-hot-encoding
np.set_printoptions(threshold=40)
......