Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

......@@ -62,18 +62,20 @@ if __name__ == "__main__":
# each row corresponds to one possible value of each feature.
one_hot_encoder = OneHotEncoder(categories='auto')
input_features = []
sequences = []
# Read file with sequences
with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile:
df = pd.read_csv(tabfile, delimiter='\t')
print("df: {}".format(df))
sequences = df['sequence']
labels = df['label']
# Keep only the rows labelled "exon" or "utr".
# Series.isin builds the element-wise boolean mask that .loc expects; the
# plain `in` operator would try to reduce the whole Series to a single
# bool and raise "ValueError: The truth value of a Series is ambiguous".
print("All rows in df: {}".format(len(df.index)))
df_filtered = df.loc[df['label'].isin(["exon", "utr"])]
print("Only exon and utr rows in df: {}".format(len(df_filtered.index)))
# print("df: {}".format(df))
# From here on, work only with the filtered sequences and their labels.
sequences = df_filtered['sequence']
labels = df_filtered['label']
max_exon_length = 0
max_utr_length = 0
# One-hot-encoding of sequences
# Getting the max length of sequences
for sequence, label in zip(sequences, labels):
if label == "exon":
if len(sequence) > max_exon_length:
......@@ -81,17 +83,24 @@ if __name__ == "__main__":
elif label == "utr":
if len(sequence) > max_utr_length:
max_utr_length = len(sequence)
'''
print("Max exon length: {}".format(max_exon_length))
print("Max utr length: {}".format(max_utr_length))
quit()
# Fill sequence with X char to get max length
# One-hot-encoding of sequences
for sequence, label in zip(sequences, labels):
if label == "exon":
if len(sequence) < max_exon_length:
sequence.ljust(max_exon_length + len(sequence), 'X')
elif label == "utr":
if len(sequence) < max_utr_length:
sequence.ljust(max_utr_length + len(sequence), 'X')
integer_encoded = integer_encoder.fit_transform(list(sequence))
integer_encoded = np.array(integer_encoded).reshape(-1, 1)
one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
input_features.append(one_hot_encoded.toarray())
'''
print("Max exon length: {}".format(max_exon_length))
print("Max utr length: {}".format(max_utr_length))
exit()
# Print first sequence and one-hot-encoding
np.set_printoptions(threshold=40)
......