Carlos-Francisco Méndez-Cruz

Deep Learning Workshop

...@@ -62,18 +62,20 @@ if __name__ == "__main__": ...@@ -62,18 +62,20 @@ if __name__ == "__main__":
62 # each row corresponds to one possible value of each feature. 62 # each row corresponds to one possible value of each feature.
63 one_hot_encoder = OneHotEncoder(categories='auto') 63 one_hot_encoder = OneHotEncoder(categories='auto')
64 input_features = [] 64 input_features = []
65 - sequences = []
66 65
67 # Read file with sequences 66 # Read file with sequences
68 with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile: 67 with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile:
69 df = pd.read_csv(tabfile, delimiter='\t') 68 df = pd.read_csv(tabfile, delimiter='\t')
70 - print("df: {}".format(df)) 69 + print("All rows in df: {}".format(len(df.index)))
71 - sequences = df['sequence'] 70 + df_filtered = df.loc[df['label'] in ["exon", "utr"]]
72 - labels = df['label'] 71 + print("Only exon and utr rows in df: {}".format(len(df_filtered.index)))
72 + # print("df: {}".format(df))
73 + sequences = df_filtered['sequence']
74 + labels = df_filtered['label']
73 75
74 max_exon_length = 0 76 max_exon_length = 0
75 max_utr_length = 0 77 max_utr_length = 0
76 - # One-hot-encoding of sequences 78 + # Getting the max length of sequences
77 for sequence, label in zip(sequences, labels): 79 for sequence, label in zip(sequences, labels):
78 if label == "exon": 80 if label == "exon":
79 if len(sequence) > max_exon_length: 81 if len(sequence) > max_exon_length:
...@@ -81,17 +83,24 @@ if __name__ == "__main__": ...@@ -81,17 +83,24 @@ if __name__ == "__main__":
81 elif label == "utr": 83 elif label == "utr":
82 if len(sequence) > max_utr_length: 84 if len(sequence) > max_utr_length:
83 max_utr_length = len(sequence) 85 max_utr_length = len(sequence)
84 - ''' 86 + print("Max exon length: {}".format(max_exon_length))
87 + print("Max utr length: {}".format(max_utr_length))
88 +
89 + quit()
90 +
91 + # Fill sequence with X char to get max length
92 + # One-hot-encoding of sequences
93 + for sequence, label in zip(sequences, labels):
94 + if label == "exon":
95 + if len(sequence) < max_exon_length:
96 + sequence.ljust(max_exon_length + len(sequence), 'X')
97 + elif label == "utr":
98 + if len(sequence) < max_utr_length:
99 + sequence.ljust(max_utr_length + len(sequence), 'X')
85 integer_encoded = integer_encoder.fit_transform(list(sequence)) 100 integer_encoded = integer_encoder.fit_transform(list(sequence))
86 integer_encoded = np.array(integer_encoded).reshape(-1, 1) 101 integer_encoded = np.array(integer_encoded).reshape(-1, 1)
87 one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded) 102 one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
88 input_features.append(one_hot_encoded.toarray()) 103 input_features.append(one_hot_encoded.toarray())
89 - '''
90 -
91 - print("Max exon length: {}".format(max_exon_length))
92 - print("Max utr length: {}".format(max_utr_length))
93 -
94 - exit()
95 104
96 # Print first sequence and one-hot-encoding 105 # Print first sequence and one-hot-encoding
97 np.set_printoptions(threshold=40) 106 np.set_printoptions(threshold=40)
......