Carlos-Francisco Méndez-Cruz

Training, cross-validation and testing dataset

# -*- encoding: utf-8 -*-

import os
from time import time
import argparse
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report
from sklearn.externals import joblib
from sklearn import model_selection
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import scipy.stats
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks, \
    OneSidedSelection, RandomUnderSampler, NeighbourhoodCleaningRule, \
    InstanceHardnessThreshold, ClusterCentroids
from imblearn.ensemble import EasyEnsemble, BalanceCascade
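# NOTE (version assumptions): this script targets the older library APIs it was
# written against. sklearn.externals.joblib was removed in scikit-learn 0.23
# (newer code should "import joblib" directly), and the SMOTE kind= variants,
# fit_sample(), EasyEnsemble and BalanceCascade belong to the pre-0.4
# imbalanced-learn API.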

__author__ = 'CMendezC'

# Goal: training, cross-validation and testing of the binding thrombin data set
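# Data format (assumed, following the KDD Cup 2001 thrombin task): each line of
# the training file is "<class>,<f1>,<f2>,..." where <class> is 'A' (active) or
# 'I' (inactive) and the features are binary indicators, e.g.:
#   A,0,0,1,0,1
# The testing file has the same layout; its true classes come in a separate file.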

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --inputTrainingData File to read training data.
# 3) --inputTestingData File to read testing data.
# 4) --inputTestingClasses File to read testing classes.
# 5) --outputModelPath Path to place output model.
# 6) --outputModelFile File to place output model.
# 7) --outputReportPath Path to place evaluation report.
# 8) --outputReportFile File to place evaluation report.
# 9) --classifier Classifier: BernoulliNB, SVM, kNN.
# 10) --saveData Save matrices.
# 11) --kernel Kernel.
# 12) --reduction Feature selection or dimensionality reduction.
# 13) --imbalanced Imbalanced method.

# Output:
# 1) Classification model and evaluation report.

# Execution:

# python training-crossvalidation-testing-binding-thrombin.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset
# --inputTrainingData thrombin.data
# --inputTestingData Thrombin.testset
# --inputTestingClasses Thrombin.testset.class
# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models
# --outputModelFile SVM-lineal-model.mod
# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports
# --outputReportFile SVM-lineal.txt
# --classifier SVM
# --saveData
# --kernel linear
# --imbalanced RandomUS

# source activate python3
# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-lineal-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-lineal.txt --classifier SVM --kernel linear --imbalanced RandomUS
# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel rbf --reduction SVD200
# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile kNN-CHI2100-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile kNN-CHI2100.txt --classifier kNN --reduction CHI2100
# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-rbf-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-rbf.txt --classifier SVM --kernel rbf

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = argparse.ArgumentParser(
        description='Training, cross-validation and testing on the binding thrombin dataset.')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to read input files", metavar="PATH")
    parser.add_argument("--inputTrainingData", dest="inputTrainingData",
                        help="File to read training data", metavar="FILE")
    parser.add_argument("--inputTestingData", dest="inputTestingData",
                        help="File to read testing data", metavar="FILE")
    parser.add_argument("--inputTestingClasses", dest="inputTestingClasses",
                        help="File to read testing classes", metavar="FILE")
    parser.add_argument("--outputModelPath", dest="outputModelPath",
                        help="Path to place output model", metavar="PATH")
    parser.add_argument("--outputModelFile", dest="outputModelFile",
                        help="File to place output model", metavar="FILE")
    parser.add_argument("--outputReportPath", dest="outputReportPath",
                        help="Path to place evaluation report", metavar="PATH")
    parser.add_argument("--outputReportFile", dest="outputReportFile",
                        help="File to place evaluation report", metavar="FILE")
    parser.add_argument("--classifier", dest="classifier",
                        help="Classifier", metavar="NAME",
                        choices=('BernoulliNB', 'SVM', 'kNN'), default='SVM')
    parser.add_argument("--saveData", dest="saveData", action='store_true',
                        help="Save matrices")
    parser.add_argument("--kernel", dest="kernel",
                        help="Kernel SVM", metavar="NAME",
                        choices=('linear', 'rbf', 'poly'), default='linear')
    parser.add_argument("--reduction", dest="reduction",
                        help="Feature selection or dimensionality reduction", metavar="NAME",
                        choices=('SVD200', 'SVD300', 'CHI250', 'CHI2100'), default=None)
    parser.add_argument("--imbalanced", dest="imbalanced",
                        choices=('RandomUS', 'ENN', 'Tomek', 'NCR', 'IHT', 'OSS',
                                 'ClusterC', 'RandomOS', 'ADASYN', 'SMOTE_reg',
                                 'SMOTE_svm', 'SMOTE_b1', 'SMOTE_b2',
                                 'SMOTE+ENN', 'SMOTE+Tomek', 'Balanced', 'Easy'),
                        default=None,
                        help="Undersampling: RandomUS, ENN, Tomek, Neighbourhood Cleaning Rule (NCR), "
                             "Instance Hardness Threshold (IHT), One Sided Selection (OSS), ClusterC. "
                             "Oversampling: RandomOS, ADASYN, SMOTE_reg, "
                             "SMOTE_svm, SMOTE_b1, SMOTE_b2. Combined: "
                             "SMOTE+ENN, SMOTE+Tomek. Ensemble: Balanced, Easy", metavar="TEXT")

    args = parser.parse_args()

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("File to read training data: " + str(args.inputTrainingData))
    print("File to read testing data: " + str(args.inputTestingData))
    print("File to read testing classes: " + str(args.inputTestingClasses))
    print("Path to place output model: " + str(args.outputModelPath))
    print("File to place output model: " + str(args.outputModelFile))
    print("Path to place evaluation report: " + str(args.outputReportPath))
    print("File to place evaluation report: " + str(args.outputReportFile))
    print("Classifier: " + str(args.classifier))
    print("Save matrices: " + str(args.saveData))
    print("Kernel: " + str(args.kernel))
    print("Reduction: " + str(args.reduction))
    print("Imbalanced: " + str(args.imbalanced))

    # Start time
    t0 = time()

131 + print("Reading training data and true classes...")
132 + X_train = None
133 + if args.saveData:
134 + y_train = []
135 + trainingData = []
136 + with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \
137 + as iFile:
138 + for line in iFile:
139 + line = line.strip('\r\n')
140 + listLine = line.split(',')
141 + y_train.append(listLine[0])
142 + trainingData.append(listLine[1:])
143 + # X_train = np.matrix(trainingData)
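        # The feature vectors are binary and very high-dimensional, so a sparse
        # CSR matrix is much cheaper in memory than the dense alternative above.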
        X_train = csr_matrix(trainingData, dtype='double')
        print(" Saving matrix and classes...")
        joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
        joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
        print(" Done!")
    else:
        print(" Loading matrix and classes...")
        X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
        y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
        print(" Done!")

    print(" Number of training classes: {}".format(len(y_train)))
    print(" Number of training class A: {}".format(y_train.count('A')))
    print(" Number of training class I: {}".format(y_train.count('I')))
    print(" Shape of training matrix: {}".format(X_train.shape))

160 + print("Reading testing data and true classes...")
161 + X_test = None
162 + if args.saveData:
163 + y_test = []
164 + testingData = []
165 + with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \
166 + as iFile:
167 + for line in iFile:
168 + line = line.strip('\r\n')
169 + listLine = line.split(',')
170 + testingData.append(listLine[1:])
171 + X_test = csr_matrix(testingData, dtype='double')
172 + with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \
173 + as iFile:
174 + for line in iFile:
175 + line = line.strip('\r\n')
176 + y_test.append(line)
177 + print(" Saving matrix and classes...")
178 + joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
179 + joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
180 + print(" Done!")
181 + else:
182 + print(" Loading matrix and classes...")
183 + X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
184 + y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
185 + print(" Done!")
186 +
187 + print(" Number of testing classes: {}".format(len(y_test)))
188 + print(" Number of testing class A: {}".format(y_test.count('A')))
189 + print(" Number of testing class I: {}".format(y_test.count('I')))
190 + print(" Shape of testing matrix: {}".format(X_test.shape))
191 +
    # Feature selection and dimensionality reduction
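    # NOTE: chi2 requires non-negative features (true for these binary
    # indicators), and TruncatedSVD, unlike PCA, works directly on sparse input.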
    if args.reduction is not None:
        print('Performing dimensionality reduction or feature selection...', args.reduction)
        if args.reduction == 'SVD200':
            reduc = TruncatedSVD(n_components=200, random_state=42)
            X_train = reduc.fit_transform(X_train)
        elif args.reduction == 'SVD300':
            reduc = TruncatedSVD(n_components=300, random_state=42)
            X_train = reduc.fit_transform(X_train)
        elif args.reduction == 'CHI250':
            reduc = SelectKBest(chi2, k=50)
            X_train = reduc.fit_transform(X_train, y_train)
        elif args.reduction == 'CHI2100':
            reduc = SelectKBest(chi2, k=100)
            X_train = reduc.fit_transform(X_train, y_train)
        print(" Done!")
        print(' New shape of training matrix: ', X_train.shape)

    if args.imbalanced is not None:
        t1 = time()
        # Combined over- and under-sampling
        jobs = 15
        if args.imbalanced == "SMOTE+ENN":
            sm = SMOTEENN(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "SMOTE+Tomek":
            sm = SMOTETomek(random_state=42, n_jobs=jobs)
        # Over-sampling
        elif args.imbalanced == "SMOTE_reg":
            sm = SMOTE(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "SMOTE_svm":
            sm = SMOTE(random_state=42, n_jobs=jobs, kind='svm')
        elif args.imbalanced == "SMOTE_b1":
            sm = SMOTE(random_state=42, n_jobs=jobs, kind='borderline1')
        elif args.imbalanced == "SMOTE_b2":
            sm = SMOTE(random_state=42, n_jobs=jobs, kind='borderline2')
        elif args.imbalanced == "RandomOS":
            sm = RandomOverSampler(random_state=42)
        # Under-sampling
        elif args.imbalanced == "ENN":
            sm = EditedNearestNeighbours(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "Tomek":
            sm = TomekLinks(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "OSS":
            sm = OneSidedSelection(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "RandomUS":
            sm = RandomUnderSampler(random_state=42)
        elif args.imbalanced == "NCR":
            sm = NeighbourhoodCleaningRule(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "IHT":
            sm = InstanceHardnessThreshold(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "ClusterC":
            sm = ClusterCentroids(random_state=42, n_jobs=jobs)
        elif args.imbalanced == "Balanced":
            sm = BalanceCascade(random_state=42)
        elif args.imbalanced == "Easy":
            sm = EasyEnsemble(random_state=42, n_subsets=3)
        elif args.imbalanced == "ADASYN":
            sm = ADASYN(random_state=42, n_jobs=jobs)

        # Apply the resampling to the training data only
        X_train, y_train = sm.fit_sample(X_train, y_train)

        print(" After transformation with {}".format(args.imbalanced))
        print(" Number of training classes: {}".format(len(y_train)))
        print(" Number of training class A: {}".format(list(y_train).count('A')))
        print(" Number of training class I: {}".format(list(y_train).count('I')))
        print(" Shape of training matrix: {}".format(X_train.shape))
        print(" Data transformation done in: %fs" % (time() - t1))

    jobs = -1
    paramGrid = []
    nIter = 20
    crossV = 10
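    # RandomizedSearchCV samples nIter parameter settings from the distributions
    # in paramGrid and evaluates each one with crossV-fold cross-validation,
    # keeping the best estimator found on the training data.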
265 + print("Defining randomized grid search...")
266 + if args.classifier == 'SVM':
267 + # SVM
268 + classifier = SVC()
269 + if args.kernel == 'rbf':
270 + paramGrid = {'C': scipy.stats.expon(scale=100),
271 + 'gamma': scipy.stats.expon(scale=.1),
272 + 'kernel': ['rbf'], 'class_weight': ['balanced', None]}
273 + elif args.kernel == 'linear':
274 + paramGrid = {'C': scipy.stats.expon(scale=100),
275 + 'kernel': ['linear'],
276 + 'class_weight': ['balanced', None]}
277 + elif args.kernel == 'poly':
278 + paramGrid = {'C': scipy.stats.expon(scale=100),
279 + 'gamma': scipy.stats.expon(scale=.1), 'degree': [2, 3],
280 + 'kernel': ['poly'], 'class_weight': ['balanced', None]}
281 + myClassifier = model_selection.RandomizedSearchCV(classifier,
282 + paramGrid, n_iter=nIter,
283 + cv=crossV, n_jobs=jobs, verbose=3)
284 + elif args.classifier == 'BernoulliNB':
285 + # BernoulliNB
286 + classifier = BernoulliNB()
287 + paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
288 + myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter,
289 + cv=crossV, n_jobs=jobs, verbose=3)
290 + # elif args.classifier == 'kNN':
291 + # # kNN
292 + # k_range = list(range(1, 7, 2))
293 + # classifier = KNeighborsClassifier()
294 + # paramGrid = {'n_neighbors ': k_range}
295 + # myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=3,
296 + # cv=crossV, n_jobs=jobs, verbose=3)
297 + else:
298 + print("Bad classifier")
299 + exit()
300 + print(" Done!")

    print("Training...")
    myClassifier.fit(X_train, y_train)
    print(" Done!")

306 + print("Testing (prediction in new data)...")
307 + if args.reduction is not None:
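        # Reuse the reducer fitted on the training data; refitting on the test
        # set would produce a different (leaky) feature space.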
        X_test = reduc.transform(X_test)
    y_pred = myClassifier.predict(X_test)
    best_parameters = myClassifier.best_estimator_.get_params()
    print(" Done!")

313 + print("Saving report...")
314 + with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile:
315 + oFile.write('********** EVALUATION REPORT **********\n')
316 + oFile.write('Reduction: {}\n'.format(args.reduction))
317 + oFile.write('Classifier: {}\n'.format(args.classifier))
318 + oFile.write('Kernel: {}\n'.format(args.kernel))
319 + oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred)))
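        # 'weighted' averages per-class scores weighted by support, a reasonable
        # choice given the skewed A/I class distribution.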
        oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted')))
        oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted')))
        oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted')))
        oFile.write('Confusion matrix: \n')
        oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n')
        oFile.write('Classification report: \n')
        oFile.write(classification_report(y_test, y_pred) + '\n')
        oFile.write('Best parameters: \n')
        for param in sorted(best_parameters.keys()):
            oFile.write("\t%s: %r\n" % (param, best_parameters[param]))
    print(" Done!")

    print("Training and testing done in: %fs" % (time() - t0))