Carlos-Francisco Méndez-Cruz

Training, cross-validation, and testing on the thrombin binding dataset

......@@ -5,11 +5,13 @@ from time import time
import argparse
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
classification_report
from sklearn.externals import joblib
from sklearn import model_selection
from scipy.sparse import csr_matrix
import scipy
__author__ = 'CMendezC'
......@@ -47,7 +49,7 @@ __author__ = 'CMendezC'
# --kernel linear
# source activate python3
# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM
# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear
###########################################################
# MAIN PROGRAM #
......@@ -77,6 +79,9 @@ if __name__ == "__main__":
choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM')
# Boolean flag: args.saveData is True only when --saveData is given.
parser.add_argument("--saveData", dest="saveData", action='store_true',
help="Save matrices")
# Kernel name for the SVM classifier; restricted to linear, rbf or poly and
# defaulting to linear when the option is omitted.
parser.add_argument("--kernel", dest="kernel",
help="Kernel SVM", metavar="NAME",
choices=('linear', 'rbf', 'poly'), default='linear')
# Parse the command line; every option is read from `args` afterwards.
args = parser.parse_args()
......@@ -92,6 +97,7 @@ if __name__ == "__main__":
# Echo the parsed command-line options to stdout.
print("File to place evaluation report: " + str(args.outputReportFile))
print("Classifier: " + str(args.classifier))
print("Save matrices: " + str(args.saveData))
# The kernel is printed for every classifier.
# NOTE(review): it appears to be consulted only when building the SVM search space - confirm.
print("Kernel: " + str(args.kernel))
# Start time
t0 = time()
......@@ -157,15 +163,45 @@ if __name__ == "__main__":
print(" Number of testing class I: {}".format(y_test.count('I')))
print(" Shape of testing matrix: {}".format(X_test.shape))
# Select the estimator named by --classifier, wrap it in a randomized
# hyper-parameter search where a search space is defined, and train it.
#
# On exit from this section:
#   myClassifier - the fitted search object (or the fitted bare estimator
#                  when no search space exists for the chosen classifier)
#   classifier   - the fitted estimator to use for prediction / persistence

# Settings shared by every search.
jobs = -1      # forwarded as n_jobs (-1 = use all available processors)
nIter = 20     # number of parameter settings sampled per search
crossV = 10    # number of cross-validation folds
paramGrid = None   # stays None when the classifier has no search space
print("Defining randomized grid search...")
if args.classifier == 'SVM':
    # SVM: C (and gamma, for the kernels that use it) are sampled from
    # exponential distributions; the kernel itself is fixed by --kernel.
    classifier = SVC()
    if args.kernel == 'rbf':
        paramGrid = {'C': scipy.stats.expon(scale=100),
                     'gamma': scipy.stats.expon(scale=.1),
                     'kernel': ['rbf'], 'class_weight': ['balanced', None]}
    elif args.kernel == 'linear':
        paramGrid = {'C': scipy.stats.expon(scale=100),
                     'kernel': ['linear'],
                     'class_weight': ['balanced', None]}
    elif args.kernel == 'poly':
        paramGrid = {'C': scipy.stats.expon(scale=100),
                     'gamma': scipy.stats.expon(scale=.1), 'degree': [2, 3],
                     'kernel': ['poly'], 'class_weight': ['balanced', None]}
elif args.classifier == 'BernoulliNB':
    # BernoulliNB: only the smoothing parameter alpha is searched.
    classifier = BernoulliNB()
    paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
elif args.classifier == 'kNN':
    # kNN
    # NOTE(review): 'kNN' is not among the --classifier choices visible where
    # the parser is defined, so this branch is unreachable until it is added.
    classifier = KNeighborsClassifier()
    # The key must be exactly 'n_neighbors' (no trailing space); otherwise the
    # estimator rejects it as an invalid parameter.
    paramGrid = {'n_neighbors': [3, 5, 7]}
    # The grid is finite and smaller than nIter: never ask the sampler for
    # more settings than exist.
    nIter = min(nIter, len(paramGrid['n_neighbors']))
elif args.classifier == 'NearestCentroid':
    # Still offered as a --classifier choice; no search space is defined for
    # it here, so the bare estimator is trained directly.
    classifier = NearestCentroid()
else:
    print("Bad classifier")
    # Non-zero status so calling scripts can detect the failure.
    raise SystemExit(1)

if paramGrid is None:
    myClassifier = classifier
else:
    myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid,
                                                      n_iter=nIter, cv=crossV,
                                                      n_jobs=jobs, verbose=3)
print(" Done!")
print("Training...")
# Fit the search object, not the untouched base estimator, so the
# hyper-parameter search actually runs.
myClassifier.fit(X_train, y_train)
# Keep `classifier` pointing at the fitted model: the refitted best estimator
# when a search was run, otherwise the estimator that was just fitted.
classifier = getattr(myClassifier, 'best_estimator_', myClassifier)
......