Showing
2 changed files
with
201 additions
and
5 deletions
| ... | @@ -13,7 +13,7 @@ from scipy.sparse import csr_matrix | ... | @@ -13,7 +13,7 @@ from scipy.sparse import csr_matrix |
| 13 | 13 | ||
| 14 | __author__ = 'CMendezC' | 14 | __author__ = 'CMendezC' |
| 15 | 15 | ||
| 16 | -# Goal: training and validation binding thrombin data set | 16 | +# Goal: training, cross-validation and testing on the binding thrombin data set |
| 17 | 17 | ||
| 18 | # Parameters: | 18 | # Parameters: |
| 19 | # 1) --inputPath Path to read input files. | 19 | # 1) --inputPath Path to read input files. |
| ... | @@ -26,13 +26,14 @@ __author__ = 'CMendezC' | ... | @@ -26,13 +26,14 @@ __author__ = 'CMendezC' |
| 26 | # 8) --outputReportFile File to place evaluation report. | 26 | # 8) --outputReportFile File to place evaluation report. |
| 27 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. | 27 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. |
| 28 | # 10) --saveData Save matrices | 28 | # 10) --saveData Save matrices |
| 29 | +# 11) --kernel Kernel type, e.g. linear (the example below uses it with --classifier SVM). | | |
| 29 | 30 | ||
| 30 | # Output: | 31 | # Output: |
| 31 | # 1) Classification model and evaluation report. | 32 | # 1) Classification model and evaluation report. |
| 32 | 33 | ||
| 33 | # Execution: | 34 | # Execution: |
| 34 | 35 | ||
| 35 | -# python training-validation-binding-thrombin.py | 36 | +# python training-testing-binding-thrombin.py |
| 36 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset | 37 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset |
| 37 | # --inputTrainingData thrombin.data | 38 | # --inputTrainingData thrombin.data |
| 38 | # --inputTestingData Thrombin.testset | 39 | # --inputTestingData Thrombin.testset |
| ... | @@ -43,9 +44,10 @@ __author__ = 'CMendezC' | ... | @@ -43,9 +44,10 @@ __author__ = 'CMendezC' |
| 43 | # --outputReportFile SVM.txt | 44 | # --outputReportFile SVM.txt |
| 44 | # --classifier SVM | 45 | # --classifier SVM |
| 45 | # --saveData | 46 | # --saveData |
| 47 | +# --kernel linear | ||
| 46 | 48 | ||
| 47 | # source activate python3 | 49 | # source activate python3 |
| 48 | -# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData | 50 | +# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM |
| 49 | 51 | ||
| 50 | ########################################################### | 52 | ########################################################### |
| 51 | # MAIN PROGRAM # | 53 | # MAIN PROGRAM # |
| ... | @@ -108,10 +110,12 @@ if __name__ == "__main__": | ... | @@ -108,10 +110,12 @@ if __name__ == "__main__": |
| 108 | trainingData.append(listLine[1:]) | 110 | trainingData.append(listLine[1:]) |
| 109 | # X_train = np.matrix(trainingData) | 111 | # X_train = np.matrix(trainingData) |
| 110 | X_train = csr_matrix(trainingData, dtype='double') | 112 | X_train = csr_matrix(trainingData, dtype='double') |
| 113 | + print(" Saving matrix and classes...") | ||
| 111 | joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) | 114 | joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) |
| 112 | joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) | 115 | joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) |
| 116 | + print(" Done!") | ||
| 113 | else: | 117 | else: |
| 114 | - print(" Saving matrix and classes...") | 118 | + print(" Loading matrix and classes...") |
| 115 | X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) | 119 | X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) |
| 116 | y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) | 120 | y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) |
| 117 | print(" Done!") | 121 | print(" Done!") |
| ... | @@ -138,10 +142,12 @@ if __name__ == "__main__": | ... | @@ -138,10 +142,12 @@ if __name__ == "__main__": |
| 138 | for line in iFile: | 142 | for line in iFile: |
| 139 | line = line.strip('\r\n') | 143 | line = line.strip('\r\n') |
| 140 | y_test.append(line) | 144 | y_test.append(line) |
| 145 | + print(" Saving matrix and classes...") | ||
| 141 | joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | 146 | joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) |
| 142 | joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | 147 | joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) |
| 148 | + print(" Done!") | ||
| 143 | else: | 149 | else: |
| 144 | - print(" Saving matrix and classes...") | 150 | + print(" Loading matrix and classes...") |
| 145 | X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | 151 | X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) |
| 146 | y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | 152 | y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) |
| 147 | print(" Done!") | 153 | print(" Done!") | ... | ... |
| 1 | +# -*- encoding: utf-8 -*- | ||
| 2 | + | ||
| 3 | +import os | ||
| 4 | +from time import time | ||
| 5 | +import argparse | ||
| 6 | +from sklearn.naive_bayes import BernoulliNB | ||
| 7 | +from sklearn.svm import SVC | ||
| 8 | +from sklearn.neighbors import KNeighborsClassifier | ||
| 9 | +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ | ||
| 10 | + classification_report | ||
| 11 | +from sklearn.externals import joblib | ||
| 12 | +from scipy.sparse import csr_matrix | ||
| 13 | + | ||
| 14 | +__author__ = 'CMendezC' | ||
| 15 | + | ||
| 16 | +# Goal: training and testing a classifier on the binding thrombin data set | | |
| 17 | + | ||
| 18 | +# Parameters: | ||
| 19 | +# 1) --inputPath Path to read input files. | ||
| 20 | +# 2) --inputTrainingData File to read training data. | ||
| 21 | +# 3) --inputTestingData File to read testing data. | ||
| 22 | +# 4) --inputTestingClasses File to read testing classes. | ||
| 23 | +# 5) --outputModelPath Path to place output model. | ||
| 24 | +# 6) --outputModelFile File to place output model. | ||
| 25 | +# 7) --outputReportPath Path to place evaluation report. | ||
| 26 | +# 8) --outputReportFile File to place evaluation report. | ||
| 27 | +# 9) --classifier Classifier: BernoulliNB, SVM, kNN. | ||
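| 28 | +# 10) --saveData Parse the raw input files and save the matrices and classes; if omitted, previously saved ones are loaded from --outputModelPath. | | |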
| 29 | + | ||
| 30 | +# Output: | | |
| 31 | +# 1) Classification model and evaluation report. | ||
| 32 | + | ||
| 33 | +# Execution: | ||
| 34 | + | ||
| 35 | +# python training-testing-binding-thrombin.py | ||
| 36 | +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset | ||
| 37 | +# --inputTrainingData thrombin.data | ||
| 38 | +# --inputTestingData Thrombin.testset | ||
| 39 | +# --inputTestingClasses Thrombin.testset.class | ||
| 40 | +# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models | ||
| 41 | +# --outputModelFile SVM-model.mod | ||
| 42 | +# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports | ||
| 43 | +# --outputReportFile SVM.txt | ||
| 44 | +# --classifier SVM | ||
| 45 | +# --saveData | ||
| 46 | + | ||
| 47 | +# source activate python3 | ||
| 48 | +# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData | ||
| 49 | + | ||
| 50 | +########################################################### | ||
| 51 | +# MAIN PROGRAM # | ||
| 52 | +########################################################### | ||
| 53 | + | ||
| 54 | +if __name__ == "__main__": | ||
| 55 | + # Parameter definition | ||
| 56 | + parser = argparse.ArgumentParser(description='Training and testing Binding Thrombin Dataset.') | ||
| 57 | + parser.add_argument("--inputPath", dest="inputPath", | ||
| 58 | + help="Path to read input files", metavar="PATH") | ||
| 59 | + parser.add_argument("--inputTrainingData", dest="inputTrainingData", | ||
| 60 | + help="File to read training data", metavar="FILE") | ||
| 61 | + parser.add_argument("--inputTestingData", dest="inputTestingData", | ||
| 62 | + help="File to read testing data", metavar="FILE") | ||
| 63 | + parser.add_argument("--inputTestingClasses", dest="inputTestingClasses", | ||
| 64 | + help="File to read testing classes", metavar="FILE") | ||
| 65 | + parser.add_argument("--outputModelPath", dest="outputModelPath", | ||
| 66 | + help="Path to place output model", metavar="PATH") | ||
| 67 | + parser.add_argument("--outputModelFile", dest="outputModelFile", | ||
| 68 | + help="File to place output model", metavar="FILE") | ||
| 69 | + parser.add_argument("--outputReportPath", dest="outputReportPath", | ||
| 70 | + help="Path to place evaluation report", metavar="PATH") | ||
| 71 | + parser.add_argument("--outputReportFile", dest="outputReportFile", | ||
| 72 | + help="File to place evaluation report", metavar="FILE") | ||
| 73 | + parser.add_argument("--classifier", dest="classifier", | ||
| 74 | + help="Classifier", metavar="NAME", | ||
| 75 | + choices=('BernoulliNB', 'SVM', 'kNN'), default='SVM') | ||
| 76 | + parser.add_argument("--saveData", dest="saveData", action='store_true', | ||
| 77 | + help="Save matrices") | ||
| 78 | + | ||
| 79 | + args = parser.parse_args() | ||
| 80 | + | ||
| 81 | + # Printing parameter values | ||
| 82 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
| 83 | + print("Path to read input files: " + str(args.inputPath)) | ||
| 84 | + print("File to read training data: " + str(args.inputTrainingData)) | ||
| 85 | + print("File to read testing data: " + str(args.inputTestingData)) | ||
| 86 | + print("File to read testing classes: " + str(args.inputTestingClasses)) | ||
| 87 | + print("Path to place output model: " + str(args.outputModelPath)) | ||
| 88 | + print("File to place output model: " + str(args.outputModelFile)) | ||
| 89 | + print("Path to place evaluation report: " + str(args.outputReportPath)) | ||
| 90 | + print("File to place evaluation report: " + str(args.outputReportFile)) | ||
| 91 | + print("Classifier: " + str(args.classifier)) | ||
| 92 | + print("Save matrices: " + str(args.saveData)) | ||
| 93 | + | ||
| 94 | + # Start time | ||
| 95 | + t0 = time() | ||
| 96 | + | ||
| 97 | + print("Reading training data and true classes...") | ||
| 98 | + X_train = None | ||
| 99 | + if args.saveData: | ||
| 100 | + y_train = [] | ||
| 101 | + trainingData = [] | ||
| 102 | + with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \ | ||
| 103 | + as iFile: | ||
| 104 | + for line in iFile: | ||
| 105 | + line = line.strip('\r\n') | ||
| 106 | + listLine = line.split(',') | ||
| 107 | + y_train.append(listLine[0]) | ||
| 108 | + trainingData.append(listLine[1:]) | ||
| 109 | + # X_train = np.matrix(trainingData) | ||
| 110 | + X_train = csr_matrix(trainingData, dtype='double') | ||
| 111 | + print(" Saving matrix and classes...") | ||
| 112 | + joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) | ||
| 113 | + joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) | ||
| 114 | + print(" Done!") | ||
| 115 | + else: | ||
| 116 | + print(" Loading matrix and classes...") | ||
| 117 | + X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) | ||
| 118 | + y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) | ||
| 119 | + print(" Done!") | ||
| 120 | + | ||
| 121 | + print(" Number of training classes: {}".format(len(y_train))) | ||
| 122 | + print(" Number of training class A: {}".format(y_train.count('A'))) | ||
| 123 | + print(" Number of training class I: {}".format(y_train.count('I'))) | ||
| 124 | + print(" Shape of training matrix: {}".format(X_train.shape)) | ||
| 125 | + | ||
| 126 | + print("Reading testing data and true classes...") | ||
| 127 | + X_test = None | ||
| 128 | + if args.saveData: | ||
| 129 | + y_test = [] | ||
| 130 | + testingData = [] | ||
| 131 | + with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \ | ||
| 132 | + as iFile: | ||
| 133 | + for line in iFile: | ||
| 134 | + line = line.strip('\r\n') | ||
| 135 | + listLine = line.split(',') | ||
| 136 | + testingData.append(listLine[1:]) | ||
| 137 | + X_test = csr_matrix(testingData, dtype='double') | ||
| 138 | + with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \ | ||
| 139 | + as iFile: | ||
| 140 | + for line in iFile: | ||
| 141 | + line = line.strip('\r\n') | ||
| 142 | + y_test.append(line) | ||
| 143 | + print(" Saving matrix and classes...") | ||
| 144 | + joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | ||
| 145 | + joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | ||
| 146 | + print(" Done!") | ||
| 147 | + else: | ||
| 148 | + print(" Loading matrix and classes...") | ||
| 149 | + X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | ||
| 150 | + y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | ||
| 151 | + print(" Done!") | ||
| 152 | + | ||
| 153 | + print(" Number of testing classes: {}".format(len(y_test))) | ||
| 154 | + print(" Number of testing class A: {}".format(y_test.count('A'))) | ||
| 155 | + print(" Number of testing class I: {}".format(y_test.count('I'))) | ||
| 156 | + print(" Shape of testing matrix: {}".format(X_test.shape)) | ||
| 157 | + | ||
| 158 | + if args.classifier == "BernoulliNB": | ||
| 159 | + classifier = BernoulliNB() | ||
| 160 | + elif args.classifier == "SVM": | ||
| 161 | + classifier = SVC() | ||
| 162 | + elif args.classifier == "kNN": | ||
| 163 | + classifier = KNeighborsClassifier() | ||
| 164 | + else: | ||
| 165 | + print("Bad classifier") | ||
| 166 | + exit() | ||
| 167 | + | ||
| 168 | + print("Training...") | ||
| 169 | + classifier.fit(X_train, y_train) | ||
| 170 | + print(" Done!") | ||
| 171 | + | ||
| 172 | + print("Testing (prediction in new data)...") | ||
| 173 | + y_pred = classifier.predict(X_test) | ||
| 174 | + print(" Done!") | ||
| 175 | + | ||
| 176 | + print("Saving report...") | ||
| 177 | + with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile: | ||
| 178 | + oFile.write('********** EVALUATION REPORT **********\n') | ||
| 179 | + oFile.write('Classifier: {}\n'.format(args.classifier)) | ||
| 180 | + oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred))) | ||
| 181 | + oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted'))) | ||
| 182 | + oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted'))) | ||
| 183 | + oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted'))) | ||
| 184 | + oFile.write('Confusion matrix: \n') | ||
| 185 | + oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n') | ||
| 186 | + oFile.write('Classification report: \n') | ||
| 187 | + oFile.write(classification_report(y_test, y_pred) + '\n') | ||
| 188 | + print(" Done!") | ||
| 189 | + | ||
| 190 | + print("Training and testing done in: %fs" % (time() - t0)) |
-
Please register or login to post a comment