Carlos-Francisco Méndez-Cruz

Training, crossvalidation and testing binding thrombin dataset

......@@ -10,7 +10,9 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc
classification_report
from sklearn.externals import joblib
from sklearn import model_selection
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
import scipy
__author__ = 'CMendezC'
......@@ -29,13 +31,14 @@ __author__ = 'CMendezC'
# 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid.
# 10) --saveData Save matrices
# 11) --kernel Kernel
# 12) --reduction Feature selection or dimensionality reduction: SVD200, SVD300, CHI250, CHI2100.
# Output:
# 1) Classification model and evaluation report.
# Execution:
# python training-testing-binding-thrombin.py
# python training-crossvalidation-testing-binding-thrombin.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset
# --inputTrainingData thrombin.data
# --inputTestingData Thrombin.testset
......@@ -47,9 +50,10 @@ __author__ = 'CMendezC'
# --classifier SVM
# --saveData
# --kernel linear
# --reduction SVD200
# source activate python3
# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear
# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear --reduction SVD200
###########################################################
# MAIN PROGRAM #
......@@ -82,6 +86,9 @@ if __name__ == "__main__":
# --kernel: kernel name for the SVM; restricted to 'linear', 'rbf' or 'poly',
# and 'linear' is used when the option is omitted.
parser.add_argument("--kernel", dest="kernel",
help="Kernel SVM", metavar="NAME",
choices=('linear', 'rbf', 'poly'), default='linear')
# --reduction: which feature-selection / dimensionality-reduction method to
# apply. Only the four listed names are accepted. Because the default is
# 'SVD200', the parsed value is never None when the option is omitted.
parser.add_argument("--reduction", dest="reduction",
help="Feature selection or dimensionality reduction", metavar="NAME",
choices=('SVD200', 'SVD300', 'CHI250', 'CHI2100'), default='SVD200')
# Parse the command line into the `args` namespace used by the rest of the script.
args = parser.parse_args()
......@@ -98,6 +105,7 @@ if __name__ == "__main__":
print("Classifier: " + str(args.classifier))
print("Save matrices: " + str(args.saveData))
print("Kernel: " + str(args.kernel))
print("Reduction: " + str(args.reduction))
# Start time
t0 = time()
......@@ -163,6 +171,24 @@ if __name__ == "__main__":
print(" Number of testing class I: {}".format(y_test.count('I')))
print(" Shape of testing matrix: {}".format(X_test.shape))
# Feature selection / dimensionality reduction of the training matrix.
# The --reduction value picks both the method and its size:
#   SVD<n>  -> TruncatedSVD keeping n components (fixed random_state=42)
#   CHI2<k> -> SelectKBest with the chi2 score keeping the k best features
if args.reduction is not None:
    print('Performing dimensionality reduction or feature selection...', args.reduction)
    # Option name -> number of components / number of features to keep.
    svd_sizes = {'SVD200': 200, 'SVD300': 300}
    chi2_sizes = {'CHI250': 50, 'CHI2100': 100}
    if args.reduction in svd_sizes:
        # SVD is fitted on the training matrix alone; no labels are passed.
        reduc = TruncatedSVD(n_components=svd_sizes[args.reduction], random_state=42)
        X_train = reduc.fit_transform(X_train)
    elif args.reduction in chi2_sizes:
        # chi2 scoring ranks features against the training labels.
        reduc = SelectKBest(chi2, k=chi2_sizes[args.reduction])
        X_train = reduc.fit_transform(X_train, y_train)
    # NOTE(review): only X_train is reduced here; X_test presumably has to go
    # through reduc.transform(...) before prediction -- confirm this happens
    # further down the script.
    print(" Done!")
    print(' New shape of training matrix: ', X_train.shape)
jobs = -1
paramGrid = []
nIter = 20
......