Carlos-Francisco Méndez-Cruz

Training and testing binding thrombin dataset

......@@ -8,9 +8,8 @@ from sklearn.svm import SVC
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
classification_report
import sys
from sklearn.externals import joblib
from scipy.sparse import csr_matrix
import numpy as np
__author__ = 'CMendezC'
......@@ -26,6 +25,7 @@ __author__ = 'CMendezC'
# 7) --outputReportPath Path to place evaluation report.
# 8) --outputReportFile File to place evaluation report.
# 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid.
# 10) --saveData Save matrices
# Output:
# 1) Classification model and evaluation report.
......@@ -42,9 +42,10 @@ __author__ = 'CMendezC'
# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports
# --outputReportFile SVM.txt
# --classifier SVM
# --saveData
# source activate python3
# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM
# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData
###########################################################
# MAIN PROGRAM #
......@@ -72,6 +73,8 @@ if __name__ == "__main__":
# Classifier selection, restricted to the three supported model names.
parser.add_argument("--classifier", dest="classifier",
                    help="Classifier", metavar="NAME",
                    choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM')
# NOTE(review): despite the name, --saveData means "parse the raw data files
# and cache the matrices with joblib"; without it, previously cached .jlb
# files are loaded instead — confirm this is the intended flag semantics.
parser.add_argument("--saveData", dest="saveData", action='store_true',
                    help="Save matrices")
args = parser.parse_args()
......@@ -86,48 +89,63 @@ if __name__ == "__main__":
# Pre-refactor loader (the diff keeps it alongside the joblib version below):
# parses the comma-separated data files into CSR matrices on every run.
print("Path to place evaluation report: " + str(args.outputReportPath))
print("File to place evaluation report: " + str(args.outputReportFile))
print("Classifier: " + str(args.classifier))
print("Save matrices: " + str(args.saveData))
# Start time of the whole run; reported at the end of the script.
t0 = time()
print(" Reading training data and true classes...")
trainingClasses = []
trainingData = []
with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \
        as iFile:
    for line in iFile:
        line = line.strip('\r\n')
        # First column is the class label ('A'/'I'); the rest are features.
        listLine = line.split(',')
        trainingClasses.append(listLine[0])
        trainingData.append(listLine[1:])
# trainingMatrix = np.matrix(trainingData)
trainingMatrix = csr_matrix(trainingData, dtype='double')
print("Number of training classes: {}".format(len(trainingClasses)))
print("Number of training class A: {}".format(trainingClasses.count('A')))
print("Number of training class I: {}".format(trainingClasses.count('I')))
print("Shape of training matrix: {}".format(trainingMatrix.shape))
print(" Reading testing data and true classes...")
testingClasses = []
testingData = []
with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \
        as iFile:
    for line in iFile:
        line = line.strip('\r\n')
        listLine = line.split(',')
        # NOTE(review): listLine[0] is skipped here — presumably the test
        # file's first column is a placeholder label; verify against the data.
        testingData.append(listLine[1:])
testingMatrix = csr_matrix(testingData, dtype='double')
# True test classes come from a separate single-label-per-line file.
with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \
        as iFile:
    for line in iFile:
        line = line.strip('\r\n')
        testingClasses.append(line)
print("Number of testing classes: {}".format(len(testingClasses)))
# BUG FIX: the A/I counts below previously used trainingClasses (copy-paste
# error), so the printed "testing" class distribution was actually the
# training-class distribution.
print("Number of testing class A: {}".format(testingClasses.count('A')))
print("Number of testing class I: {}".format(testingClasses.count('I')))
print("Shape of testing matrix: {}".format(testingMatrix.shape))
# Load the training matrix and labels either by parsing the raw file
# (--saveData given) or from a previously cached joblib dump.
print("Reading training data and true classes...")
X_train = None
# NOTE(review): despite its name, --saveData means "parse and cache"; without
# it the .jlb files must already exist under outputModelPath — confirm.
if args.saveData:
    y_train = []
    trainingData = []
    with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \
            as iFile:
        for line in iFile:
            line = line.strip('\r\n')
            # First column is the class label ('A'/'I'); the rest are features.
            listLine = line.split(',')
            y_train.append(listLine[0])
            trainingData.append(listLine[1:])
    # X_train = np.matrix(trainingData)
    X_train = csr_matrix(trainingData, dtype='double')
    # Cache both matrix and labels next to the models for later runs.
    joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
    joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
else:
    X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
    y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
print(" Number of training classes: {}".format(len(y_train)))
print(" Number of training class A: {}".format(y_train.count('A')))
print(" Number of training class I: {}".format(y_train.count('I')))
print(" Shape of training matrix: {}".format(X_train.shape))
# Same parse-or-load pattern for the test set; its true classes come from a
# separate single-label-per-line file.
print("Reading testing data and true classes...")
X_test = None
if args.saveData:
    y_test = []
    testingData = []
    with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \
            as iFile:
        for line in iFile:
            line = line.strip('\r\n')
            listLine = line.split(',')
            # NOTE(review): listLine[0] is skipped — presumably a placeholder
            # label column in the test file; verify against the data.
            testingData.append(listLine[1:])
    X_test = csr_matrix(testingData, dtype='double')
    with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \
            as iFile:
        for line in iFile:
            line = line.strip('\r\n')
            y_test.append(line)
    joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
    joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
else:
    X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
    y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
print(" Number of testing classes: {}".format(len(y_test)))
print(" Number of testing class A: {}".format(y_test.count('A')))
print(" Number of testing class I: {}".format(y_test.count('I')))
print(" Shape of testing matrix: {}".format(X_test.shape))
# BUG FIX: this branch previously tested for "MultinomialNB", a value that
# --classifier can never take (argparse choices are BernoulliNB, SVM,
# NearestCentroid), which made the BernoulliNB branch unreachable.
if args.classifier == "BernoulliNB":
    classifier = BernoulliNB()
......@@ -136,26 +154,26 @@ if __name__ == "__main__":
elif args.classifier == "NearestCentroid":
classifier = NearestCentroid()
# The diff left both the pre- and post-refactor fit/predict statements in
# place; keep only the X_train/y_train versions — trainingMatrix and
# testingMatrix are no longer defined after the joblib refactor, so the old
# lines would raise NameError (and would fit/predict twice).
print("Training...")
classifier.fit(X_train, y_train)
print(" Done!")
print("Testing (prediction in new data)...")
y_pred = classifier.predict(X_test)
print(" Done!")
# The diff left both report writers in place; keep only the new one —
# argparse defines outputReportPath/outputReportFile (not outputPath /
# outputFile, which would raise AttributeError) and the true test labels are
# in y_test (testingClasses is no longer defined after the joblib refactor).
print("Saving report...")
with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile:
    oFile.write('********** EVALUATION REPORT **********\n')
    oFile.write('Classifier: {}\n'.format(args.classifier))
    oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred)))
    # Weighted averaging: per-class scores weighted by class support.
    oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted')))
    oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted')))
    oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted')))
    oFile.write('Confusion matrix: \n')
    oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n')
    oFile.write('Classification report: \n')
    oFile.write(classification_report(y_test, y_pred) + '\n')
print(" Done!")
print("Training and testing done in: %fs" % (time() - t0))
......
......@@ -2,9 +2,7 @@
import os
from time import time
# from optparse import OptionParser
import argparse
import sys
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
......