Training, cross-validation and testing binding thrombin dataset
Showing 1 changed file with 43 additions and 7 deletions
... | @@ -5,11 +5,13 @@ from time import time | ... | @@ -5,11 +5,13 @@ from time import time |
5 | import argparse | 5 | import argparse |
6 | from sklearn.naive_bayes import BernoulliNB | 6 | from sklearn.naive_bayes import BernoulliNB |
7 | from sklearn.svm import SVC | 7 | from sklearn.svm import SVC |
8 | -from sklearn.neighbors import NearestCentroid | 8 | +from sklearn.neighbors import KNeighborsClassifier |
9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ | 9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ |
10 | classification_report | 10 | classification_report |
11 | from sklearn.externals import joblib | 11 | from sklearn.externals import joblib |
12 | +from sklearn import model_selection | ||
12 | from scipy.sparse import csr_matrix | 13 | from scipy.sparse import csr_matrix |
14 | +import scipy | ||
13 | 15 | ||
14 | __author__ = 'CMendezC' | 16 | __author__ = 'CMendezC' |
15 | 17 | ||
... | @@ -47,7 +49,7 @@ __author__ = 'CMendezC' | ... | @@ -47,7 +49,7 @@ __author__ = 'CMendezC' |
47 | # --kernel linear | 49 | # --kernel linear |
48 | 50 | ||
49 | # source activate python3 | 51 | # source activate python3 |
50 | -# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM | 52 | +# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear |
51 | 53 | ||
52 | ########################################################### | 54 | ########################################################### |
53 | # MAIN PROGRAM # | 55 | # MAIN PROGRAM # |
... | @@ -77,6 +79,9 @@ if __name__ == "__main__": | ... | @@ -77,6 +79,9 @@ if __name__ == "__main__": |
77 | choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') | 79 | choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') |
78 | parser.add_argument("--saveData", dest="saveData", action='store_true', | 80 | parser.add_argument("--saveData", dest="saveData", action='store_true', |
79 | help="Save matrices") | 81 | help="Save matrices") |
82 | + parser.add_argument("--kernel", dest="kernel", | ||
83 | + help="Kernel SVM", metavar="NAME", | ||
84 | + choices=('linear', 'rbf', 'poly'), default='linear') | ||
80 | 85 | ||
81 | args = parser.parse_args() | 86 | args = parser.parse_args() |
82 | 87 | ||
... | @@ -92,6 +97,7 @@ if __name__ == "__main__": | ... | @@ -92,6 +97,7 @@ if __name__ == "__main__": |
92 | print("File to place evaluation report: " + str(args.outputReportFile)) | 97 | print("File to place evaluation report: " + str(args.outputReportFile)) |
93 | print("Classifier: " + str(args.classifier)) | 98 | print("Classifier: " + str(args.classifier)) |
94 | print("Save matrices: " + str(args.saveData)) | 99 | print("Save matrices: " + str(args.saveData)) |
100 | + print("Kernel: " + str(args.kernel)) | ||
95 | 101 | ||
96 | # Start time | 102 | # Start time |
97 | t0 = time() | 103 | t0 = time() |
... | @@ -157,15 +163,45 @@ if __name__ == "__main__": | ... | @@ -157,15 +163,45 @@ if __name__ == "__main__": |
157 | print(" Number of testing class I: {}".format(y_test.count('I'))) | 163 | print(" Number of testing class I: {}".format(y_test.count('I'))) |
158 | print(" Shape of testing matrix: {}".format(X_test.shape)) | 164 | print(" Shape of testing matrix: {}".format(X_test.shape)) |
159 | 165 | ||
160 | - if args.classifier == "BernoulliNB": | 166 | + jobs = -1 |
161 | - classifier = BernoulliNB() | 167 | + paramGrid = [] |
162 | - elif args.classifier == "SVM": | 168 | + nIter = 20 |
169 | + crossV = 10 | ||
170 | + print("Defining randomized grid search...") | ||
171 | + if args.classifier == 'SVM': | ||
172 | + # SVM | ||
163 | classifier = SVC() | 173 | classifier = SVC() |
164 | - elif args.classifier == "NearestCentroid": | 174 | + if args.kernel == 'rbf': |
165 | - classifier = NearestCentroid() | 175 | + paramGrid = {'C': scipy.stats.expon(scale=100), |
176 | + 'gamma': scipy.stats.expon(scale=.1), | ||
177 | + 'kernel': ['rbf'], 'class_weight': ['balanced', None]} | ||
178 | + elif args.kernel == 'linear': | ||
179 | + paramGrid = {'C': scipy.stats.expon(scale=100), | ||
180 | + 'kernel': ['linear'], | ||
181 | + 'class_weight': ['balanced', None]} | ||
182 | + elif args.kernel == 'poly': | ||
183 | + paramGrid = {'C': scipy.stats.expon(scale=100), | ||
184 | + 'gamma': scipy.stats.expon(scale=.1), 'degree': [2, 3], | ||
185 | + 'kernel': ['poly'], 'class_weight': ['balanced', None]} | ||
186 | + myClassifier = model_selection.RandomizedSearchCV(classifier, | ||
187 | + paramGrid, n_iter=nIter, | ||
188 | + cv=crossV, n_jobs=jobs, verbose=3) | ||
189 | + elif args.classifier == 'BernoulliNB': | ||
190 | + # BernoulliNB | ||
191 | + classifier = BernoulliNB() | ||
192 | + paramGrid = {'alpha': scipy.stats.expon(scale=1.0)} | ||
193 | + myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter, | ||
194 | + cv=crossV, n_jobs=jobs, verbose=3) | ||
195 | + elif args.classifier == 'kNN': | ||
196 | + # kNN | ||
197 | + classifier = KNeighborsClassifier() | ||
198 | + paramGrid = {'n_neighbors ': [3, 5, 7]} | ||
199 | + myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter, | ||
200 | + cv=crossV, n_jobs=jobs, verbose=3) | ||
166 | else: | 201 | else: |
167 | print("Bad classifier") | 202 | print("Bad classifier") |
168 | exit() | 203 | exit() |
204 | + print(" Done!") | ||
169 | 205 | ||
170 | print("Training...") | 206 | print("Training...") |
171 | classifier.fit(X_train, y_train) | 207 | classifier.fit(X_train, y_train) | ... | ... |
-
Please register or log in to post a comment