Carlos-Francisco Méndez-Cruz

Training, cross-validation and testing on the binding thrombin dataset

...@@ -5,11 +5,13 @@ from time import time ...@@ -5,11 +5,13 @@ from time import time
5 import argparse 5 import argparse
6 from sklearn.naive_bayes import BernoulliNB 6 from sklearn.naive_bayes import BernoulliNB
7 from sklearn.svm import SVC 7 from sklearn.svm import SVC
8 -from sklearn.neighbors import NearestCentroid 8 +from sklearn.neighbors import KNeighborsClassifier
9 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ 9 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
10 classification_report 10 classification_report
11 from sklearn.externals import joblib 11 from sklearn.externals import joblib
12 +from sklearn import model_selection
12 from scipy.sparse import csr_matrix 13 from scipy.sparse import csr_matrix
14 +import scipy
13 15
14 __author__ = 'CMendezC' 16 __author__ = 'CMendezC'
15 17
...@@ -47,7 +49,7 @@ __author__ = 'CMendezC' ...@@ -47,7 +49,7 @@ __author__ = 'CMendezC'
47 # --kernel linear 49 # --kernel linear
48 50
49 # source activate python3 51 # source activate python3
50 -# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM 52 +# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear
51 53
52 ########################################################### 54 ###########################################################
53 # MAIN PROGRAM # 55 # MAIN PROGRAM #
...@@ -77,6 +79,9 @@ if __name__ == "__main__": ...@@ -77,6 +79,9 @@ if __name__ == "__main__":
77 choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') 79 choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM')
78 parser.add_argument("--saveData", dest="saveData", action='store_true', 80 parser.add_argument("--saveData", dest="saveData", action='store_true',
79 help="Save matrices") 81 help="Save matrices")
82 + parser.add_argument("--kernel", dest="kernel",
83 + help="Kernel SVM", metavar="NAME",
84 + choices=('linear', 'rbf', 'poly'), default='linear')
80 85
81 args = parser.parse_args() 86 args = parser.parse_args()
82 87
...@@ -92,6 +97,7 @@ if __name__ == "__main__": ...@@ -92,6 +97,7 @@ if __name__ == "__main__":
92 print("File to place evaluation report: " + str(args.outputReportFile)) 97 print("File to place evaluation report: " + str(args.outputReportFile))
93 print("Classifier: " + str(args.classifier)) 98 print("Classifier: " + str(args.classifier))
94 print("Save matrices: " + str(args.saveData)) 99 print("Save matrices: " + str(args.saveData))
100 + print("Kernel: " + str(args.kernel))
95 101
96 # Start time 102 # Start time
97 t0 = time() 103 t0 = time()
...@@ -157,15 +163,45 @@ if __name__ == "__main__": ...@@ -157,15 +163,45 @@ if __name__ == "__main__":
157 print(" Number of testing class I: {}".format(y_test.count('I'))) 163 print(" Number of testing class I: {}".format(y_test.count('I')))
158 print(" Shape of testing matrix: {}".format(X_test.shape)) 164 print(" Shape of testing matrix: {}".format(X_test.shape))
159 165
160 - if args.classifier == "BernoulliNB": 166 + jobs = -1
161 - classifier = BernoulliNB() 167 + paramGrid = []
162 - elif args.classifier == "SVM": 168 + nIter = 20
169 + crossV = 10
170 + print("Defining randomized grid search...")
171 + if args.classifier == 'SVM':
172 + # SVM
163 classifier = SVC() 173 classifier = SVC()
164 - elif args.classifier == "NearestCentroid": 174 + if args.kernel == 'rbf':
165 - classifier = NearestCentroid() 175 + paramGrid = {'C': scipy.stats.expon(scale=100),
176 + 'gamma': scipy.stats.expon(scale=.1),
177 + 'kernel': ['rbf'], 'class_weight': ['balanced', None]}
178 + elif args.kernel == 'linear':
179 + paramGrid = {'C': scipy.stats.expon(scale=100),
180 + 'kernel': ['linear'],
181 + 'class_weight': ['balanced', None]}
182 + elif args.kernel == 'poly':
183 + paramGrid = {'C': scipy.stats.expon(scale=100),
184 + 'gamma': scipy.stats.expon(scale=.1), 'degree': [2, 3],
185 + 'kernel': ['poly'], 'class_weight': ['balanced', None]}
186 + myClassifier = model_selection.RandomizedSearchCV(classifier,
187 + paramGrid, n_iter=nIter,
188 + cv=crossV, n_jobs=jobs, verbose=3)
189 + elif args.classifier == 'BernoulliNB':
190 + # BernoulliNB
191 + classifier = BernoulliNB()
192 + paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
193 + myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter,
194 + cv=crossV, n_jobs=jobs, verbose=3)
195 + elif args.classifier == 'kNN':
196 + # kNN
197 + classifier = KNeighborsClassifier()
198 + paramGrid = {'n_neighbors ': [3, 5, 7]}
199 + myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter,
200 + cv=crossV, n_jobs=jobs, verbose=3)
166 else: 201 else:
167 print("Bad classifier") 202 print("Bad classifier")
168 exit() 203 exit()
204 + print(" Done!")
169 205
170 print("Training...") 206 print("Training...")
171 classifier.fit(X_train, y_train) 207 classifier.fit(X_train, y_train)
......