Showing 1 changed file with 21 additions and 12 deletions
... | @@ -62,18 +62,20 @@ if __name__ == "__main__": | ... | @@ -62,18 +62,20 @@ if __name__ == "__main__": |
62 | # each row corresponds to one possible value of each feature. | 62 | # each row corresponds to one possible value of each feature. |
63 | one_hot_encoder = OneHotEncoder(categories='auto') | 63 | one_hot_encoder = OneHotEncoder(categories='auto') |
64 | input_features = [] | 64 | input_features = [] |
65 | - sequences = [] | ||
66 | 65 | ||
67 | # Read file with sequences | 66 | # Read file with sequences |
68 | with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile: | 67 | with open(os.path.join(args.inputPath, args.inputFile), mode="r") as tabfile: |
69 | df = pd.read_csv(tabfile, delimiter='\t') | 68 | df = pd.read_csv(tabfile, delimiter='\t') |
70 | - print("df: {}".format(df)) | 69 | + print("All rows in df: {}".format(len(df.index))) |
71 | - sequences = df['sequence'] | 70 | + df_filtered = df.loc[df['label'] in ["exon", "utr"]] |
72 | - labels = df['label'] | 71 | + print("Only exon and utr rows in df: {}".format(len(df_filtered.index))) |
72 | + # print("df: {}".format(df)) | ||
73 | + sequences = df_filtered['sequence'] | ||
74 | + labels = df_filtered['label'] | ||
73 | 75 | ||
74 | max_exon_length = 0 | 76 | max_exon_length = 0 |
75 | max_utr_length = 0 | 77 | max_utr_length = 0 |
76 | - # One-hot-encoding of sequences | 78 | + # Getting the max length of sequences |
77 | for sequence, label in zip(sequences, labels): | 79 | for sequence, label in zip(sequences, labels): |
78 | if label == "exon": | 80 | if label == "exon": |
79 | if len(sequence) > max_exon_length: | 81 | if len(sequence) > max_exon_length: |
... | @@ -81,17 +83,24 @@ if __name__ == "__main__": | ... | @@ -81,17 +83,24 @@ if __name__ == "__main__": |
81 | elif label == "utr": | 83 | elif label == "utr": |
82 | if len(sequence) > max_utr_length: | 84 | if len(sequence) > max_utr_length: |
83 | max_utr_length = len(sequence) | 85 | max_utr_length = len(sequence) |
84 | - ''' | 86 | + print("Max exon length: {}".format(max_exon_length)) |
87 | + print("Max utr length: {}".format(max_utr_length)) | ||
88 | + | ||
89 | + quit() | ||
90 | + | ||
91 | + # Fill sequence with X char to get max length | ||
92 | + # One-hot-encoding of sequences | ||
93 | + for sequence, label in zip(sequences, labels): | ||
94 | + if label == "exon": | ||
95 | + if len(sequence) < max_exon_length: | ||
96 | + sequence.ljust(max_exon_length + len(sequence), 'X') | ||
97 | + elif label == "utr": | ||
98 | + if len(sequence) < max_utr_length: | ||
99 | + sequence.ljust(max_utr_length + len(sequence), 'X') | ||
85 | integer_encoded = integer_encoder.fit_transform(list(sequence)) | 100 | integer_encoded = integer_encoder.fit_transform(list(sequence)) |
86 | integer_encoded = np.array(integer_encoded).reshape(-1, 1) | 101 | integer_encoded = np.array(integer_encoded).reshape(-1, 1) |
87 | one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded) | 102 | one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded) |
88 | input_features.append(one_hot_encoded.toarray()) | 103 | input_features.append(one_hot_encoded.toarray()) |
89 | - ''' | ||
90 | - | ||
91 | - print("Max exon length: {}".format(max_exon_length)) | ||
92 | - print("Max utr length: {}".format(max_utr_length)) | ||
93 | - | ||
94 | - exit() | ||
95 | 104 | ||
96 | # Print first sequence and one-hot-encoding | 105 | # Print first sequence and one-hot-encoding |
97 | np.set_printoptions(threshold=40) | 106 | np.set_printoptions(threshold=40) | ... | ... |
-
Please register or log in to post a comment