Carlos-Francisco Méndez-Cruz

Training, crossvalidation and testing dataset

1 +# -*- encoding: utf-8 -*-
2 +
3 +import os
4 +from time import time
5 +import argparse
6 +from sklearn.naive_bayes import BernoulliNB
7 +from sklearn.svm import SVC
8 +from sklearn.neighbors import KNeighborsClassifier
9 +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
10 + classification_report
11 +from sklearn.externals import joblib
12 +from sklearn import model_selection
13 +from scipy.sparse import csr_matrix
14 +import scipy
15 +from imblearn.under_sampling import RandomUnderSampler
16 +from imblearn.over_sampling import RandomOverSampler
17 +
18 +__author__ = 'CMendezC'
19 +
20 +# Goal: training, crossvalidation and testing binding thrombin data set
21 +
22 +# Parameters:
23 +# 1) --inputPath Path to read input files.
24 +# 2) --inputTrainingData File to read training data.
25 +# 3) --inputTestingData File to read testing data.
26 +# 4) --inputTestingClasses File to read testing classes.
27 +# 5) --outputModelPath Path to place output model.
28 +# 6) --outputModelFile File to place output model.
29 +# 7) --outputReportPath Path to place evaluation report.
30 +# 8) --outputReportFile File to place evaluation report.
31 +# 9) --classifier Classifier: BernoulliNB, SVM, kNN.
32 +# 10) --saveData Save matrices
33 +# 11) --kernel Kernel
34 +# 12) --imbalanced Imbalanced method
35 +
36 +# Output:
37 +# 1) Classification model and evaluation report.
38 +
39 +# Execution:
40 +
41 +# python training-crossvalidation-testing-binding-thrombin.py
42 +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset
43 +# --inputTrainingData thrombin.data
44 +# --inputTestingData Thrombin.testset
45 +# --inputTestingClasses Thrombin.testset.class
46 +# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models
47 +# --outputModelFile SVM-lineal-model.mod
48 +# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports
49 +# --outputReportFile SVM-lineal.txt
50 +# --classifier SVM
51 +# --saveData
52 +# --kernel linear
53 +# --imbalanced RandomUS
54 +
55 +# source activate python3
56 +# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-lineal-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-lineal.txt --classifier SVM --kernel linear --imbalanced RandomUS
57 +
58 +###########################################################
59 +# MAIN PROGRAM #
60 +###########################################################
61 +
def build_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser exposing the input/output locations,
        the classifier name, the SVM kernel, the resampling method and the
        ``--saveData`` flag.
    """
    parser = argparse.ArgumentParser(description='Training validation Binding Thrombin Dataset.')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to read input files", metavar="PATH")
    parser.add_argument("--inputTrainingData", dest="inputTrainingData",
                        help="File to read training data", metavar="FILE")
    parser.add_argument("--inputTestingData", dest="inputTestingData",
                        help="File to read testing data", metavar="FILE")
    parser.add_argument("--inputTestingClasses", dest="inputTestingClasses",
                        help="File to read testing classes", metavar="FILE")
    parser.add_argument("--outputModelPath", dest="outputModelPath",
                        help="Path to place output model", metavar="PATH")
    parser.add_argument("--outputModelFile", dest="outputModelFile",
                        help="File to place output model", metavar="FILE")
    parser.add_argument("--outputReportPath", dest="outputReportPath",
                        help="Path to place evaluation report", metavar="PATH")
    parser.add_argument("--outputReportFile", dest="outputReportFile",
                        help="File to place evaluation report", metavar="FILE")
    parser.add_argument("--classifier", dest="classifier",
                        help="Classifier", metavar="NAME",
                        choices=('BernoulliNB', 'SVM', 'kNN'), default='SVM')
    parser.add_argument("--saveData", dest="saveData", action='store_true',
                        help="Save matrices")
    parser.add_argument("--kernel", dest="kernel",
                        help="Kernel SVM", metavar="NAME",
                        choices=('linear', 'rbf', 'poly'), default='linear')
    parser.add_argument("--imbalanced", dest="imbalanced",
                        choices=('RandomUS', 'RandomOS'), default=None,
                        help="Undersampling: RandomUS. Oversampling: RandomOS", metavar="TEXT")
    return parser


def print_parameters(args):
    """Echo every parsed command-line value to standard output."""
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("File to read training data: " + str(args.inputTrainingData))
    print("File to read testing data: " + str(args.inputTestingData))
    print("File to read testing classes: " + str(args.inputTestingClasses))
    print("Path to place output model: " + str(args.outputModelPath))
    print("File to place output model: " + str(args.outputModelFile))
    print("Path to place evaluation report: " + str(args.outputReportPath))
    print("File to place evaluation report: " + str(args.outputReportFile))
    print("Classifier: " + str(args.classifier))
    print("Save matrices: " + str(args.saveData))
    print("Kernel: " + str(args.kernel))
    print("Imbalanced: " + str(args.imbalanced))


def read_data_file(path):
    """Read a comma-separated data file.

    Every line is split on commas; the first field is collected on its own
    and the remaining fields become one row of the matrix.

    Args:
        path: location of the text file (read as UTF-8).

    Returns:
        tuple: ``(first_fields, matrix)`` where ``first_fields`` is the list
        of first-column strings and ``matrix`` is a ``csr_matrix`` built from
        the remaining columns with ``dtype='double'``.
    """
    first_fields = []
    rows = []
    with open(path, encoding='utf8', mode='r') as iFile:
        for line in iFile:
            fields = line.strip('\r\n').split(',')
            first_fields.append(fields[0])
            rows.append(fields[1:])
    return first_fields, csr_matrix(rows, dtype='double')


def read_class_file(path):
    """Return the lines of *path* (UTF-8) as a list, without line endings."""
    with open(path, encoding='utf8', mode='r') as iFile:
        return [line.strip('\r\n') for line in iFile]


def store_or_load(save_data, read, matrix_file, classes_file):
    """Build the matrix/classes pair from text and cache it, or load the cache.

    Args:
        save_data: when true, call ``read()`` and dump both results with
            joblib; otherwise load both joblib files instead.
        read: zero-argument callable returning ``(matrix, classes)``. It is
            only called when ``save_data`` is true, so the input text files
            are not needed on cached runs.
        matrix_file: joblib file holding the matrix.
        classes_file: joblib file holding the classes.

    Returns:
        tuple: ``(matrix, classes)``.
    """
    if save_data:
        matrix, classes = read()
        print(" Saving matrix and classes...")
        joblib.dump(matrix, matrix_file)
        joblib.dump(classes, classes_file)
    else:
        print(" Loading matrix and classes...")
        matrix = joblib.load(matrix_file)
        classes = joblib.load(classes_file)
    print(" Done!")
    return matrix, classes


def rebalance(method, X_train, y_train):
    """Resample the training set with a random over- or under-sampler.

    Args:
        method: ``"RandomOS"`` (RandomOverSampler) or ``"RandomUS"``
            (RandomUnderSampler); both are created with ``random_state=42``.
        X_train: training matrix.
        y_train: training classes.

    Returns:
        tuple: the resampled ``(X_train, y_train)``.

    Raises:
        ValueError: if *method* is neither of the two supported names.
    """
    if method == "RandomOS":
        sampler = RandomOverSampler(random_state=42)
    elif method == "RandomUS":
        sampler = RandomUnderSampler(random_state=42)
    else:
        raise ValueError("Bad imbalanced method: {}".format(method))
    # Prefer fit_resample when the installed imbalanced-learn provides it and
    # fall back to the fit_sample name this script was written against.
    resample = getattr(sampler, "fit_resample", None) or sampler.fit_sample
    return resample(X_train, y_train)


def build_classifier(name, kernel):
    """Instantiate the estimator selected on the command line.

    Args:
        name: ``'SVM'``, ``'BernoulliNB'`` or ``'kNN'``.
        kernel: kernel name, only used for ``'SVM'``.

    Returns:
        An unfitted scikit-learn estimator.

    Raises:
        ValueError: if *name* is not one of the supported classifiers (not
            reachable from the command line, where argparse validates it).
    """
    if name == 'SVM':
        # The kernel must be passed by keyword: the first positional
        # parameter of SVC is C, not kernel.
        return SVC(kernel=kernel)
    if name == 'BernoulliNB':
        return BernoulliNB()
    if name == 'kNN':
        return KNeighborsClassifier()
    raise ValueError("Bad classifier: {}".format(name))


def write_report(report_file, args, y_test, y_pred, parameters):
    """Write the evaluation report as UTF-8 text.

    Args:
        report_file: destination path.
        args: parsed command-line namespace (imbalanced, classifier and
            kernel are echoed in the report).
        y_test: true testing classes.
        y_pred: predicted testing classes.
        parameters: dict of estimator parameters, listed sorted by name.
    """
    with open(report_file, mode='w', encoding='utf8') as oFile:
        oFile.write('********** EVALUATION REPORT **********\n')
        # The script has no "reduction" option; report the resampling method.
        oFile.write('Imbalanced: {}\n'.format(args.imbalanced))
        oFile.write('Classifier: {}\n'.format(args.classifier))
        oFile.write('Kernel: {}\n'.format(args.kernel))
        oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred)))
        oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted')))
        oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted')))
        oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted')))
        oFile.write('Confusion matrix: \n')
        oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n')
        oFile.write('Classification report: \n')
        oFile.write(classification_report(y_test, y_pred) + '\n')
        oFile.write('Best parameters: \n')
        for param in sorted(parameters):
            oFile.write("\t%s: %r\n" % (param, parameters[param]))


def main(argv=None):
    """Train the chosen classifier, evaluate it on the test set and report.

    Steps: parse the options, read (or load cached) training data, optionally
    resample it, read (or load cached) testing data, fit the classifier, save
    the model when ``--outputModelFile`` is given, predict the testing set and
    write the evaluation report.

    Args:
        argv: argument list for the parser; ``None`` means ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    print_parameters(args)

    # Start time
    t0 = time()

    def read_training():
        # The first column of the training file holds the class.
        classes, matrix = read_data_file(os.path.join(args.inputPath, args.inputTrainingData))
        return matrix, classes

    def read_testing():
        # The first column of the testing file is dropped; the true classes
        # come from the separate classes file.
        _, matrix = read_data_file(os.path.join(args.inputPath, args.inputTestingData))
        classes = read_class_file(os.path.join(args.inputPath, args.inputTestingClasses))
        return matrix, classes

    print("Reading training data and true classes...")
    X_train, y_train = store_or_load(
        args.saveData, read_training,
        os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'),
        os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))

    print(" Number of training classes: {}".format(len(y_train)))
    print(" Number of training class A: {}".format(y_train.count('A')))
    print(" Number of training class I: {}".format(y_train.count('I')))
    print(" Shape of training matrix: {}".format(X_train.shape))

    if args.imbalanced is not None:
        t1 = time()
        X_train, y_train = rebalance(args.imbalanced, X_train, y_train)

        print(" After transformtion with {}".format(args.imbalanced))
        print(" Number of training classes: {}".format(len(y_train)))
        # list() because the sampler may not return a plain list.
        print(" Number of training class A: {}".format(list(y_train).count('A')))
        print(" Number of training class I: {}".format(list(y_train).count('I')))
        print(" Shape of training matrix: {}".format(X_train.shape))
        print(" Data transformation done in : %fs" % (time() - t1))

    print("Reading testing data and true classes...")
    X_test, y_test = store_or_load(
        args.saveData, read_testing,
        os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'),
        os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))

    print(" Number of testing classes: {}".format(len(y_test)))
    print(" Number of testing class A: {}".format(y_test.count('A')))
    print(" Number of testing class I: {}".format(y_test.count('I')))
    print(" Shape of testing matrix: {}".format(X_test.shape))

    # NOTE: the message is kept for output compatibility; no grid search is
    # performed, a single estimator with default hyper-parameters is fitted.
    print("Defining randomized grid search...")
    classifier = build_classifier(args.classifier, args.kernel)
    print(" Done!")

    print("Training...")
    classifier.fit(X_train, y_train)
    if args.outputModelFile:
        # Persist the fitted model, as announced by --outputModelFile.
        print(" Saving model...")
        joblib.dump(classifier, os.path.join(args.outputModelPath, args.outputModelFile))
    print(" Done!")

    y_pred = classifier.predict(X_test)
    # A plain estimator has no best_estimator_; report its own parameters.
    best_parameters = classifier.get_params()
    print(" Done!")

    print("Saving report...")
    write_report(os.path.join(args.outputReportPath, args.outputReportFile),
                 args, y_test, y_pred, best_parameters)
    print(" Done!")

    print("Training and testing done in: %fs" % (time() - t0))


if __name__ == "__main__":
    main()