# training-validation-binding-thrombin.py 7.79 KB
# -*- encoding: utf-8 -*-

import os
from time import time
import argparse
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report
import sys
from scipy.sparse import csr_matrix
import numpy as np

__author__ = 'CMendezC'

# Goal: train a classifier on the binding thrombin data set and validate it on the test set

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --inputTrainingData File to read training data.
# 3) --inputTestingData File to read testing data.
# 4) --inputTestingClasses File to read testing classes.
# 5) --outputModelPath Path to place output model.
# 6) --outputModelFile File to place output model.
# 7) --outputReportPath Path to place evaluation report.
# 8) --outputReportFile File to place evaluation report.
# 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid.

# Output:
# 1) Classification model and evaluation report.

# Execution:

# python training-validation-binding-thrombin.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset
# --inputTrainingData thrombin.data
# --inputTestingData Thrombin.testset
# --inputTestingClasses Thrombin.testset.class
# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models
# --outputModelFile SVM-model.mod
# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports
# --outputReportFile SVM.txt
# --classifier SVM

# source activate python3
# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def _iter_labeled_rows(path):
    """Yield ``(first_field, remaining_fields)`` for each non-blank line.

    The file at *path* is opened as UTF-8 text. Every line has its CR/LF
    characters stripped and is split on commas; all fields stay strings.
    Lines that contain only whitespace are skipped so that a trailing empty
    line does not produce a ragged row. Rows are yielded one at a time so
    the caller decides what to keep in memory.
    """
    with open(path, encoding='utf8', mode='r') as iFile:
        for line in iFile:
            line = line.strip('\r\n')
            if not line.strip():
                continue
            fields = line.split(',')
            yield fields[0], fields[1:]


if __name__ == "__main__":
    # Parameter definition
    parser = argparse.ArgumentParser(description='Training validation Binding Thrombin Dataset.')
    parser.add_argument("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_argument("--inputTrainingData", dest="inputTrainingData",
                      help="File to read training data", metavar="FILE")
    parser.add_argument("--inputTestingData", dest="inputTestingData",
                      help="File to read testing data", metavar="FILE")
    parser.add_argument("--inputTestingClasses", dest="inputTestingClasses",
                      help="File to read testing classes", metavar="FILE")
    parser.add_argument("--outputModelPath", dest="outputModelPath",
                      help="Path to place output model", metavar="PATH")
    parser.add_argument("--outputModelFile", dest="outputModelFile",
                      help="File to place output model", metavar="FILE")
    parser.add_argument("--outputReportPath", dest="outputReportPath",
                      help="Path to place evaluation report", metavar="PATH")
    parser.add_argument("--outputReportFile", dest="outputReportFile",
                      help="File to place evaluation report", metavar="FILE")
    parser.add_argument("--classifier", dest="classifier",
                      help="Classifier", metavar="NAME",
                      choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM')

    args = parser.parse_args()

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("File to read training data: " + str(args.inputTrainingData))
    print("File to read testing data: " + str(args.inputTestingData))
    print("File to read testing classes: " + str(args.inputTestingClasses))
    print("Path to place output model: " + str(args.outputModelPath))
    print("File to place output model: " + str(args.outputModelFile))
    print("Path to place evaluation report: " + str(args.outputReportPath))
    print("File to place evaluation report: " + str(args.outputReportFile))
    print("Classifier: " + str(args.classifier))

    # Start time
    t0 = time()

    # Training file: the first field of each row is the class label and the
    # remaining fields are the feature values.
    print("   Reading training data and true classes...")
    trainingClasses = []
    trainingData = []
    trainingPath = os.path.join(args.inputPath, args.inputTrainingData)
    for label, features in _iter_labeled_rows(trainingPath):
        trainingClasses.append(label)
        trainingData.append(features)
    trainingMatrix = csr_matrix(trainingData, dtype='double')

    print("Number of training classes: {}".format(len(trainingClasses)))
    print("Number of training class A: {}".format(trainingClasses.count('A')))
    print("Number of training class I: {}".format(trainingClasses.count('I')))
    print("Shape of training matrix: {}".format(trainingMatrix.shape))

    # Testing file: the first field of each row is discarded; the true
    # labels are read from the separate testing-classes file, one per line.
    print("   Reading testing data and true classes...")
    testingPath = os.path.join(args.inputPath, args.inputTestingData)
    testingData = [features for _, features in _iter_labeled_rows(testingPath)]
    testingMatrix = csr_matrix(testingData, dtype='double')
    testingClassesPath = os.path.join(args.inputPath, args.inputTestingClasses)
    with open(testingClassesPath, encoding='utf8', mode='r') as iFile:
        # Skip blank lines so the label count matches the number of test rows.
        testingClasses = [line.strip('\r\n') for line in iFile if line.strip()]

    # Report the distribution of the *testing* labels (the counts were
    # previously taken from the training labels by mistake).
    print("Number of testing classes: {}".format(len(testingClasses)))
    print("Number of testing class A: {}".format(testingClasses.count('A')))
    print("Number of testing class I: {}".format(testingClasses.count('I')))
    print("Shape of testing matrix: {}".format(testingMatrix.shape))

    # Keys mirror the argparse ``choices`` above, so every accepted
    # --classifier value maps to an estimator class.
    classifierFactories = {
        'BernoulliNB': BernoulliNB,
        'SVM': SVC,
        'NearestCentroid': NearestCentroid,
    }
    classifier = classifierFactories[args.classifier]()

    print("   Training...")
    classifier.fit(trainingMatrix, trainingClasses)
    print("      Done!")

    # NOTE(review): --outputModelPath / --outputModelFile are parsed and
    # printed, but this script never writes the fitted model to disk.

    print("   Testing (prediction in new data)...")
    y_pred = classifier.predict(testingMatrix)
    print("      Done!")

    print("   Saving report...")
    reportPath = os.path.join(args.outputReportPath, args.outputReportFile)
    with open(reportPath, mode='w', encoding='utf8') as oFile:
        oFile.write('**********        EVALUATION REPORT     **********\n')
        oFile.write('Classifier: {}\n'.format(args.classifier))
        oFile.write('Accuracy: {}\n'.format(accuracy_score(testingClasses, y_pred)))
        oFile.write('Precision: {}\n'.format(precision_score(testingClasses, y_pred, average='weighted')))
        oFile.write('Recall: {}\n'.format(recall_score(testingClasses, y_pred, average='weighted')))
        oFile.write('F-score: {}\n'.format(f1_score(testingClasses, y_pred, average='weighted')))
        oFile.write('Confusion matrix: \n')
        oFile.write(str(confusion_matrix(testingClasses, y_pred)) + '\n')
        oFile.write('Classification report: \n')
        oFile.write(classification_report(testingClasses, y_pred) + '\n')
    print("      Done!")

    print("Training and testing done in: %fs" % (time() - t0))