Showing 2 changed files with 53 additions and 37 deletions
| ... | @@ -8,9 +8,8 @@ from sklearn.svm import SVC | ... | @@ -8,9 +8,8 @@ from sklearn.svm import SVC |
| 8 | from sklearn.neighbors import NearestCentroid | 8 | from sklearn.neighbors import NearestCentroid |
| 9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ | 9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ |
| 10 | classification_report | 10 | classification_report |
| 11 | -import sys | 11 | +from sklearn.externals import joblib |
| 12 | from scipy.sparse import csr_matrix | 12 | from scipy.sparse import csr_matrix |
| 13 | -import numpy as np | ||
| 14 | 13 | ||
| 15 | __author__ = 'CMendezC' | 14 | __author__ = 'CMendezC' |
| 16 | 15 | ||
| ... | @@ -26,6 +25,7 @@ __author__ = 'CMendezC' | ... | @@ -26,6 +25,7 @@ __author__ = 'CMendezC' |
| 26 | # 7) --outputReportPath Path to place evaluation report. | 25 | # 7) --outputReportPath Path to place evaluation report. |
| 27 | # 8) --outputReportFile File to place evaluation report. | 26 | # 8) --outputReportFile File to place evaluation report. |
| 28 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. | 27 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. |
| 28 | +# 10) --saveData Save matrices | ||
| 29 | 29 | ||
| 30 | # Output: | 30 | # Output: |
| 31 | # 1) Classification model and evaluation report. | 31 | # 1) Classification model and evaluation report. |
| ... | @@ -42,9 +42,10 @@ __author__ = 'CMendezC' | ... | @@ -42,9 +42,10 @@ __author__ = 'CMendezC' |
| 42 | # --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports | 42 | # --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports |
| 43 | # --outputReportFile SVM.txt | 43 | # --outputReportFile SVM.txt |
| 44 | # --classifier SVM | 44 | # --classifier SVM |
| 45 | +# --saveData | ||
| 45 | 46 | ||
| 46 | # source activate python3 | 47 | # source activate python3 |
| 47 | -# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM | 48 | +# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData |
| 48 | 49 | ||
| 49 | ########################################################### | 50 | ########################################################### |
| 50 | # MAIN PROGRAM # | 51 | # MAIN PROGRAM # |
| ... | @@ -72,6 +73,8 @@ if __name__ == "__main__": | ... | @@ -72,6 +73,8 @@ if __name__ == "__main__": |
| 72 | parser.add_argument("--classifier", dest="classifier", | 73 | parser.add_argument("--classifier", dest="classifier", |
| 73 | help="Classifier", metavar="NAME", | 74 | help="Classifier", metavar="NAME", |
| 74 | choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') | 75 | choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') |
| 76 | + parser.add_argument("--saveData", dest="saveData", action='store_true', | ||
| 77 | + help="Save matrices") | ||
| 75 | 78 | ||
| 76 | args = parser.parse_args() | 79 | args = parser.parse_args() |
| 77 | 80 | ||
| ... | @@ -86,30 +89,40 @@ if __name__ == "__main__": | ... | @@ -86,30 +89,40 @@ if __name__ == "__main__": |
| 86 | print("Path to place evaluation report: " + str(args.outputReportPath)) | 89 | print("Path to place evaluation report: " + str(args.outputReportPath)) |
| 87 | print("File to place evaluation report: " + str(args.outputReportFile)) | 90 | print("File to place evaluation report: " + str(args.outputReportFile)) |
| 88 | print("Classifier: " + str(args.classifier)) | 91 | print("Classifier: " + str(args.classifier)) |
| 92 | + print("Save matrices: " + str(args.saveData)) | ||
| 89 | 93 | ||
| 90 | # Start time | 94 | # Start time |
| 91 | t0 = time() | 95 | t0 = time() |
| 92 | 96 | ||
| 93 | - print(" Reading training data and true classes...") | 97 | + print("Reading training data and true classes...") |
| 94 | - trainingClasses = [] | 98 | + X_train = None |
| 99 | + if args.saveData: | ||
| 100 | + y_train = [] | ||
| 95 | trainingData = [] | 101 | trainingData = [] |
| 96 | with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \ | 102 | with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \ |
| 97 | as iFile: | 103 | as iFile: |
| 98 | for line in iFile: | 104 | for line in iFile: |
| 99 | line = line.strip('\r\n') | 105 | line = line.strip('\r\n') |
| 100 | listLine = line.split(',') | 106 | listLine = line.split(',') |
| 101 | - trainingClasses.append(listLine[0]) | 107 | + y_train.append(listLine[0]) |
| 102 | trainingData.append(listLine[1:]) | 108 | trainingData.append(listLine[1:]) |
| 103 | - # trainingMatrix = np.matrix(trainingData) | 109 | + # X_train = np.matrix(trainingData) |
| 104 | - trainingMatrix = csr_matrix(trainingData, dtype='double') | 110 | + X_train = csr_matrix(trainingData, dtype='double') |
| 105 | - | 111 | + joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) |
| 106 | - print("Number of training classes: {}".format(len(trainingClasses))) | 112 | + joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) |
| 107 | - print("Number of training class A: {}".format(trainingClasses.count('A'))) | 113 | + else: |
| 108 | - print("Number of training class I: {}".format(trainingClasses.count('I'))) | 114 | + X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) |
| 109 | - print("Shape of training matrix: {}".format(trainingMatrix.shape)) | 115 | + y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) |
| 110 | - | 116 | + |
| 111 | - print(" Reading testing data and true classes...") | 117 | + print(" Number of training classes: {}".format(len(y_train))) |
| 112 | - testingClasses = [] | 118 | + print(" Number of training class A: {}".format(y_train.count('A'))) |
| 119 | + print(" Number of training class I: {}".format(y_train.count('I'))) | ||
| 120 | + print(" Shape of training matrix: {}".format(X_train.shape)) | ||
| 121 | + | ||
| 122 | + print("Reading testing data and true classes...") | ||
| 123 | + X_test = None | ||
| 124 | + if args.saveData: | ||
| 125 | + y_test = [] | ||
| 113 | testingData = [] | 126 | testingData = [] |
| 114 | with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \ | 127 | with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \ |
| 115 | as iFile: | 128 | as iFile: |
| ... | @@ -117,17 +130,22 @@ if __name__ == "__main__": | ... | @@ -117,17 +130,22 @@ if __name__ == "__main__": |
| 117 | line = line.strip('\r\n') | 130 | line = line.strip('\r\n') |
| 118 | listLine = line.split(',') | 131 | listLine = line.split(',') |
| 119 | testingData.append(listLine[1:]) | 132 | testingData.append(listLine[1:]) |
| 120 | - testingMatrix = csr_matrix(testingData, dtype='double') | 133 | + X_test = csr_matrix(testingData, dtype='double') |
| 121 | with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \ | 134 | with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \ |
| 122 | as iFile: | 135 | as iFile: |
| 123 | for line in iFile: | 136 | for line in iFile: |
| 124 | line = line.strip('\r\n') | 137 | line = line.strip('\r\n') |
| 125 | - testingClasses.append(line) | 138 | + y_test.append(line) |
| 126 | - | 139 | + joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) |
| 127 | - print("Number of testing classes: {}".format(len(testingClasses))) | 140 | + joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) |
| 128 | - print("Number of testing class A: {}".format(trainingClasses.count('A'))) | 141 | + else: |
| 129 | - print("Number of testing class I: {}".format(trainingClasses.count('I'))) | 142 | + X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) |
| 130 | - print("Shape of testing matrix: {}".format(testingMatrix.shape)) | 143 | + y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) |
| 144 | + | ||
| 145 | + print(" Number of testing classes: {}".format(len(y_test))) | ||
| 146 | + print(" Number of testing class A: {}".format(y_test.count('A'))) | ||
| 147 | + print(" Number of testing class I: {}".format(y_test.count('I'))) | ||
| 148 | + print(" Shape of testing matrix: {}".format(X_test.shape)) | ||
| 131 | 149 | ||
| 132 | if args.classifier == "BernoulliNB": | 150 | if args.classifier == "BernoulliNB": |
| 133 | classifier = BernoulliNB() | 151 | classifier = BernoulliNB() |
| ... | @@ -136,26 +154,26 @@ if __name__ == "__main__": | ... | @@ -136,26 +154,26 @@ if __name__ == "__main__": |
| 136 | elif args.classifier == "NearestCentroid": | 154 | elif args.classifier == "NearestCentroid": |
| 137 | classifier = NearestCentroid() | 155 | classifier = NearestCentroid() |
| 138 | 156 | ||
| 139 | - print(" Training...") | 157 | + print("Training...") |
| 140 | - classifier.fit(trainingMatrix, trainingClasses) | 158 | + classifier.fit(X_train, y_train) |
| 141 | print(" Done!") | 159 | print(" Done!") |
| 142 | 160 | ||
| 143 | - print(" Testing (prediction in new data)...") | 161 | + print("Testing (prediction in new data)...") |
| 144 | - y_pred = classifier.predict(testingMatrix) | 162 | + y_pred = classifier.predict(X_test) |
| 145 | print(" Done!") | 163 | print(" Done!") |
| 146 | 164 | ||
| 147 | - print(" Saving report...") | 165 | + print("Saving report...") |
| 148 | - with open(os.path.join(args.outputPath, args.outputFile), mode='w', encoding='utf8') as oFile: | 166 | + with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile: |
| 149 | oFile.write('********** EVALUATION REPORT **********\n') | 167 | oFile.write('********** EVALUATION REPORT **********\n') |
| 150 | oFile.write('Classifier: {}\n'.format(args.classifier)) | 168 | oFile.write('Classifier: {}\n'.format(args.classifier)) |
| 151 | - oFile.write('Accuracy: {}\n'.format(accuracy_score(testingClasses, y_pred))) | 169 | + oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred))) |
| 152 | - oFile.write('Precision: {}\n'.format(precision_score(testingClasses, y_pred, average='weighted'))) | 170 | + oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted'))) |
| 153 | - oFile.write('Recall: {}\n'.format(recall_score(testingClasses, y_pred, average='weighted'))) | 171 | + oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted'))) |
| 154 | - oFile.write('F-score: {}\n'.format(f1_score(testingClasses, y_pred, average='weighted'))) | 172 | + oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted'))) |
| 155 | oFile.write('Confusion matrix: \n') | 173 | oFile.write('Confusion matrix: \n') |
| 156 | - oFile.write(str(confusion_matrix(testingClasses, y_pred)) + '\n') | 174 | + oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n') |
| 157 | oFile.write('Classification report: \n') | 175 | oFile.write('Classification report: \n') |
| 158 | - oFile.write(classification_report(testingClasses, y_pred) + '\n') | 176 | + oFile.write(classification_report(y_test, y_pred) + '\n') |
| 159 | print(" Done!") | 177 | print(" Done!") |
| 160 | 178 | ||
| 161 | print("Training and testing done in: %fs" % (time() - t0)) | 179 | print("Training and testing done in: %fs" % (time() - t0)) | ... | ... |
| ... | @@ -2,9 +2,7 @@ | ... | @@ -2,9 +2,7 @@ |
| 2 | 2 | ||
| 3 | import os | 3 | import os |
| 4 | from time import time | 4 | from time import time |
| 5 | -# from optparse import OptionParser | ||
| 6 | import argparse | 5 | import argparse |
| 7 | -import sys | ||
| 8 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | 6 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
| 9 | from scipy.sparse import csr_matrix | 7 | from scipy.sparse import csr_matrix |
| 10 | from sklearn.metrics.pairwise import cosine_similarity | 8 | from sklearn.metrics.pairwise import cosine_similarity |
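The second file's visible hunk only drops unused imports (`optparse`, `sys`); its body is not part of this diff. Purely as an illustration of how the imports it keeps are usually combined, here is a minimal, hypothetical TF-IDF and cosine-similarity example; the document strings and variable names are invented, not taken from the repository.

```python
# Hypothetical illustration of the retained imports: vectorize a small corpus
# with TF-IDF, then score a query against it with cosine similarity.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

documents = [
    "thrombin binds the active site",
    "no binding activity was observed",
]
query = ["thrombin binding"]

vectorizer = TfidfVectorizer()
doc_matrix = vectorizer.fit_transform(documents)  # sparse TF-IDF matrix, one row per document
query_matrix = vectorizer.transform(query)        # reuse the vocabulary fitted on the corpus

# One similarity score per document; the highest score is the closest match.
print(cosine_similarity(query_matrix, doc_matrix))
```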