Showing
2 changed files
with
74 additions
and
58 deletions
... | @@ -8,9 +8,8 @@ from sklearn.svm import SVC | ... | @@ -8,9 +8,8 @@ from sklearn.svm import SVC |
8 | from sklearn.neighbors import NearestCentroid | 8 | from sklearn.neighbors import NearestCentroid |
9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ | 9 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ |
10 | classification_report | 10 | classification_report |
11 | -import sys | 11 | +from sklearn.externals import joblib |
12 | from scipy.sparse import csr_matrix | 12 | from scipy.sparse import csr_matrix |
13 | -import numpy as np | ||
14 | 13 | ||
15 | __author__ = 'CMendezC' | 14 | __author__ = 'CMendezC' |
16 | 15 | ||
... | @@ -26,6 +25,7 @@ __author__ = 'CMendezC' | ... | @@ -26,6 +25,7 @@ __author__ = 'CMendezC' |
26 | # 7) --outputReportPath Path to place evaluation report. | 25 | # 7) --outputReportPath Path to place evaluation report. |
27 | # 8) --outputReportFile File to place evaluation report. | 26 | # 8) --outputReportFile File to place evaluation report. |
28 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. | 27 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. |
28 | +# 10) --saveData Save matrices | ||
29 | 29 | ||
30 | # Output: | 30 | # Output: |
31 | # 1) Classification model and evaluation report. | 31 | # 1) Classification model and evaluation report. |
... | @@ -42,9 +42,10 @@ __author__ = 'CMendezC' | ... | @@ -42,9 +42,10 @@ __author__ = 'CMendezC' |
42 | # --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports | 42 | # --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports |
43 | # --outputReportFile SVM.txt | 43 | # --outputReportFile SVM.txt |
44 | # --classifier SVM | 44 | # --classifier SVM |
45 | +# --saveData | ||
45 | 46 | ||
46 | # source activate python3 | 47 | # source activate python3 |
47 | -# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM | 48 | +# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData |
48 | 49 | ||
49 | ########################################################### | 50 | ########################################################### |
50 | # MAIN PROGRAM # | 51 | # MAIN PROGRAM # |
... | @@ -72,6 +73,8 @@ if __name__ == "__main__": | ... | @@ -72,6 +73,8 @@ if __name__ == "__main__": |
72 | parser.add_argument("--classifier", dest="classifier", | 73 | parser.add_argument("--classifier", dest="classifier", |
73 | help="Classifier", metavar="NAME", | 74 | help="Classifier", metavar="NAME", |
74 | choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') | 75 | choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') |
76 | + parser.add_argument("--saveData", dest="saveData", action='store_true', | ||
77 | + help="Save matrices") | ||
75 | 78 | ||
76 | args = parser.parse_args() | 79 | args = parser.parse_args() |
77 | 80 | ||
... | @@ -86,48 +89,63 @@ if __name__ == "__main__": | ... | @@ -86,48 +89,63 @@ if __name__ == "__main__": |
86 | print("Path to place evaluation report: " + str(args.outputReportPath)) | 89 | print("Path to place evaluation report: " + str(args.outputReportPath)) |
87 | print("File to place evaluation report: " + str(args.outputReportFile)) | 90 | print("File to place evaluation report: " + str(args.outputReportFile)) |
88 | print("Classifier: " + str(args.classifier)) | 91 | print("Classifier: " + str(args.classifier)) |
92 | + print("Save matrices: " + str(args.saveData)) | ||
89 | 93 | ||
90 | # Start time | 94 | # Start time |
91 | t0 = time() | 95 | t0 = time() |
92 | 96 | ||
93 | - print(" Reading training data and true classes...") | 97 | + print("Reading training data and true classes...") |
94 | - trainingClasses = [] | 98 | + X_train = None |
95 | - trainingData = [] | 99 | + if args.saveData: |
96 | - with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \ | 100 | + y_train = [] |
97 | - as iFile: | 101 | + trainingData = [] |
98 | - for line in iFile: | 102 | + with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \ |
99 | - line = line.strip('\r\n') | 103 | + as iFile: |
100 | - listLine = line.split(',') | 104 | + for line in iFile: |
101 | - trainingClasses.append(listLine[0]) | 105 | + line = line.strip('\r\n') |
102 | - trainingData.append(listLine[1:]) | 106 | + listLine = line.split(',') |
103 | - # trainingMatrix = np.matrix(trainingData) | 107 | + y_train.append(listLine[0]) |
104 | - trainingMatrix = csr_matrix(trainingData, dtype='double') | 108 | + trainingData.append(listLine[1:]) |
105 | - | 109 | + # X_train = np.matrix(trainingData) |
106 | - print("Number of training classes: {}".format(len(trainingClasses))) | 110 | + X_train = csr_matrix(trainingData, dtype='double') |
107 | - print("Number of training class A: {}".format(trainingClasses.count('A'))) | 111 | + joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) |
108 | - print("Number of training class I: {}".format(trainingClasses.count('I'))) | 112 | + joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) |
109 | - print("Shape of training matrix: {}".format(trainingMatrix.shape)) | 113 | + else: |
110 | - | 114 | + X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) |
111 | - print(" Reading testing data and true classes...") | 115 | + y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) |
112 | - testingClasses = [] | 116 | + |
113 | - testingData = [] | 117 | + print(" Number of training classes: {}".format(len(y_train))) |
114 | - with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \ | 118 | + print(" Number of training class A: {}".format(y_train.count('A'))) |
115 | - as iFile: | 119 | + print(" Number of training class I: {}".format(y_train.count('I'))) |
116 | - for line in iFile: | 120 | + print(" Shape of training matrix: {}".format(X_train.shape)) |
117 | - line = line.strip('\r\n') | 121 | + |
118 | - listLine = line.split(',') | 122 | + print("Reading testing data and true classes...") |
119 | - testingData.append(listLine[1:]) | 123 | + X_test = None |
120 | - testingMatrix = csr_matrix(testingData, dtype='double') | 124 | + if args.saveData: |
121 | - with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \ | 125 | + y_test = [] |
122 | - as iFile: | 126 | + testingData = [] |
123 | - for line in iFile: | 127 | + with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \ |
124 | - line = line.strip('\r\n') | 128 | + as iFile: |
125 | - testingClasses.append(line) | 129 | + for line in iFile: |
126 | - | 130 | + line = line.strip('\r\n') |
127 | - print("Number of testing classes: {}".format(len(testingClasses))) | 131 | + listLine = line.split(',') |
128 | - print("Number of testing class A: {}".format(trainingClasses.count('A'))) | 132 | + testingData.append(listLine[1:]) |
129 | - print("Number of testing class I: {}".format(trainingClasses.count('I'))) | 133 | + X_test = csr_matrix(testingData, dtype='double') |
130 | - print("Shape of testing matrix: {}".format(testingMatrix.shape)) | 134 | + with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \ |
135 | + as iFile: | ||
136 | + for line in iFile: | ||
137 | + line = line.strip('\r\n') | ||
138 | + y_test.append(line) | ||
139 | + joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | ||
140 | + joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | ||
141 | + else: | ||
142 | + X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | ||
143 | + y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | ||
144 | + | ||
145 | + print(" Number of testing classes: {}".format(len(y_test))) | ||
146 | + print(" Number of testing class A: {}".format(y_test.count('A'))) | ||
147 | + print(" Number of testing class I: {}".format(y_test.count('I'))) | ||
148 | + print(" Shape of testing matrix: {}".format(X_test.shape)) | ||
131 | 149 | ||
132 | if args.classifier == "MultinomialNB": | 150 | if args.classifier == "MultinomialNB": |
133 | classifier = BernoulliNB() | 151 | classifier = BernoulliNB() |
... | @@ -136,26 +154,26 @@ if __name__ == "__main__": | ... | @@ -136,26 +154,26 @@ if __name__ == "__main__": |
136 | elif args.classifier == "NearestCentroid": | 154 | elif args.classifier == "NearestCentroid": |
137 | classifier = NearestCentroid() | 155 | classifier = NearestCentroid() |
138 | 156 | ||
139 | - print(" Training...") | 157 | + print("Training...") |
140 | - classifier.fit(trainingMatrix, trainingClasses) | 158 | + classifier.fit(X_train, y_train) |
141 | - print(" Done!") | 159 | + print(" Done!") |
142 | 160 | ||
143 | - print(" Testing (prediction in new data)...") | 161 | + print("Testing (prediction in new data)...") |
144 | - y_pred = classifier.predict(testingMatrix) | 162 | + y_pred = classifier.predict(X_test) |
145 | - print(" Done!") | 163 | + print(" Done!") |
146 | 164 | ||
147 | - print(" Saving report...") | 165 | + print("Saving report...") |
148 | - with open(os.path.join(args.outputPath, args.outputFile), mode='w', encoding='utf8') as oFile: | 166 | + with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile: |
149 | oFile.write('********** EVALUATION REPORT **********\n') | 167 | oFile.write('********** EVALUATION REPORT **********\n') |
150 | oFile.write('Classifier: {}\n'.format(args.classifier)) | 168 | oFile.write('Classifier: {}\n'.format(args.classifier)) |
151 | - oFile.write('Accuracy: {}\n'.format(accuracy_score(testingClasses, y_pred))) | 169 | + oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred))) |
152 | - oFile.write('Precision: {}\n'.format(precision_score(testingClasses, y_pred, average='weighted'))) | 170 | + oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted'))) |
153 | - oFile.write('Recall: {}\n'.format(recall_score(testingClasses, y_pred, average='weighted'))) | 171 | + oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted'))) |
154 | - oFile.write('F-score: {}\n'.format(f1_score(testingClasses, y_pred, average='weighted'))) | 172 | + oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted'))) |
155 | oFile.write('Confusion matrix: \n') | 173 | oFile.write('Confusion matrix: \n') |
156 | - oFile.write(str(confusion_matrix(testingClasses, y_pred)) + '\n') | 174 | + oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n') |
157 | oFile.write('Classification report: \n') | 175 | oFile.write('Classification report: \n') |
158 | - oFile.write(classification_report(testingClasses, y_pred) + '\n') | 176 | + oFile.write(classification_report(y_test, y_pred) + '\n') |
159 | - print(" Done!") | 177 | + print(" Done!") |
160 | 178 | ||
161 | print("Training and testing done in: %fs" % (time() - t0)) | 179 | print("Training and testing done in: %fs" % (time() - t0)) | ... | ... |
... | @@ -2,9 +2,7 @@ | ... | @@ -2,9 +2,7 @@ |
2 | 2 | ||
3 | import os | 3 | import os |
4 | from time import time | 4 | from time import time |
5 | -# from optparse import OptionParser | ||
6 | import argparse | 5 | import argparse |
7 | -import sys | ||
8 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | 6 | from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer |
9 | from scipy.sparse import csr_matrix | 7 | from scipy.sparse import csr_matrix |
10 | from sklearn.metrics.pairwise import cosine_similarity | 8 | from sklearn.metrics.pairwise import cosine_similarity | ... | ... |
-
Please register or login to post a comment