Carlos-Francisco Méndez-Cruz

Training and testing binding thrombin dataset

...@@ -8,9 +8,8 @@ from sklearn.svm import SVC ...@@ -8,9 +8,8 @@ from sklearn.svm import SVC
8 from sklearn.neighbors import NearestCentroid 8 from sklearn.neighbors import NearestCentroid
9 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ 9 from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
10 classification_report 10 classification_report
11 -import sys 11 +from sklearn.externals import joblib
12 from scipy.sparse import csr_matrix 12 from scipy.sparse import csr_matrix
13 -import numpy as np
14 13
15 __author__ = 'CMendezC' 14 __author__ = 'CMendezC'
16 15
...@@ -26,6 +25,7 @@ __author__ = 'CMendezC' ...@@ -26,6 +25,7 @@ __author__ = 'CMendezC'
26 # 7) --outputReportPath Path to place evaluation report. 25 # 7) --outputReportPath Path to place evaluation report.
27 # 8) --outputReportFile File to place evaluation report. 26 # 8) --outputReportFile File to place evaluation report.
28 # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. 27 # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid.
28 +# 10) --saveData Save matrices
29 29
30 # Output: 30 # Output:
31 # 1) Classification model and evaluation report. 31 # 1) Classification model and evaluation report.
...@@ -42,9 +42,10 @@ __author__ = 'CMendezC' ...@@ -42,9 +42,10 @@ __author__ = 'CMendezC'
42 # --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports 42 # --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports
43 # --outputReportFile SVM.txt 43 # --outputReportFile SVM.txt
44 # --classifier SVM 44 # --classifier SVM
45 +# --saveData
45 46
46 # source activate python3 47 # source activate python3
47 -# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM 48 +# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData
48 49
49 ########################################################### 50 ###########################################################
50 # MAIN PROGRAM # 51 # MAIN PROGRAM #
...@@ -72,6 +73,8 @@ if __name__ == "__main__": ...@@ -72,6 +73,8 @@ if __name__ == "__main__":
72 parser.add_argument("--classifier", dest="classifier", 73 parser.add_argument("--classifier", dest="classifier",
73 help="Classifier", metavar="NAME", 74 help="Classifier", metavar="NAME",
74 choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM') 75 choices=('BernoulliNB', 'SVM', 'NearestCentroid'), default='SVM')
76 + parser.add_argument("--saveData", dest="saveData", action='store_true',
77 + help="Save matrices")
75 78
76 args = parser.parse_args() 79 args = parser.parse_args()
77 80
...@@ -86,48 +89,63 @@ if __name__ == "__main__": ...@@ -86,48 +89,63 @@ if __name__ == "__main__":
86 print("Path to place evaluation report: " + str(args.outputReportPath)) 89 print("Path to place evaluation report: " + str(args.outputReportPath))
87 print("File to place evaluation report: " + str(args.outputReportFile)) 90 print("File to place evaluation report: " + str(args.outputReportFile))
88 print("Classifier: " + str(args.classifier)) 91 print("Classifier: " + str(args.classifier))
92 + print("Save matrices: " + str(args.saveData))
89 93
90 # Start time 94 # Start time
91 t0 = time() 95 t0 = time()
92 96
93 - print(" Reading training data and true classes...") 97 + print("Reading training data and true classes...")
94 - trainingClasses = [] 98 + X_train = None
95 - trainingData = [] 99 + if args.saveData:
96 - with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \ 100 + y_train = []
97 - as iFile: 101 + trainingData = []
98 - for line in iFile: 102 + with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \
99 - line = line.strip('\r\n') 103 + as iFile:
100 - listLine = line.split(',') 104 + for line in iFile:
101 - trainingClasses.append(listLine[0]) 105 + line = line.strip('\r\n')
102 - trainingData.append(listLine[1:]) 106 + listLine = line.split(',')
103 - # trainingMatrix = np.matrix(trainingData) 107 + y_train.append(listLine[0])
104 - trainingMatrix = csr_matrix(trainingData, dtype='double') 108 + trainingData.append(listLine[1:])
105 - 109 + # X_train = np.matrix(trainingData)
106 - print("Number of training classes: {}".format(len(trainingClasses))) 110 + X_train = csr_matrix(trainingData, dtype='double')
107 - print("Number of training class A: {}".format(trainingClasses.count('A'))) 111 + joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
108 - print("Number of training class I: {}".format(trainingClasses.count('I'))) 112 + joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
109 - print("Shape of training matrix: {}".format(trainingMatrix.shape)) 113 + else:
110 - 114 + X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
111 - print(" Reading testing data and true classes...") 115 + y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
112 - testingClasses = [] 116 +
113 - testingData = [] 117 + print(" Number of training classes: {}".format(len(y_train)))
114 - with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \ 118 + print(" Number of training class A: {}".format(y_train.count('A')))
115 - as iFile: 119 + print(" Number of training class I: {}".format(y_train.count('I')))
116 - for line in iFile: 120 + print(" Shape of training matrix: {}".format(X_train.shape))
117 - line = line.strip('\r\n') 121 +
118 - listLine = line.split(',') 122 + print("Reading testing data and true classes...")
119 - testingData.append(listLine[1:]) 123 + X_test = None
120 - testingMatrix = csr_matrix(testingData, dtype='double') 124 + if args.saveData:
121 - with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \ 125 + y_test = []
122 - as iFile: 126 + testingData = []
123 - for line in iFile: 127 + with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \
124 - line = line.strip('\r\n') 128 + as iFile:
125 - testingClasses.append(line) 129 + for line in iFile:
126 - 130 + line = line.strip('\r\n')
127 - print("Number of testing classes: {}".format(len(testingClasses))) 131 + listLine = line.split(',')
128 - print("Number of testing class A: {}".format(trainingClasses.count('A'))) 132 + testingData.append(listLine[1:])
129 - print("Number of testing class I: {}".format(trainingClasses.count('I'))) 133 + X_test = csr_matrix(testingData, dtype='double')
130 - print("Shape of testing matrix: {}".format(testingMatrix.shape)) 134 + with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \
135 + as iFile:
136 + for line in iFile:
137 + line = line.strip('\r\n')
138 + y_test.append(line)
139 + joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
140 + joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
141 + else:
142 + X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
143 + y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
144 +
145 + print(" Number of testing classes: {}".format(len(y_test)))
146 + print(" Number of testing class A: {}".format(y_test.count('A')))
147 + print(" Number of testing class I: {}".format(y_test.count('I')))
148 + print(" Shape of testing matrix: {}".format(X_test.shape))
131 149
132 if args.classifier == "MultinomialNB": 150 if args.classifier == "MultinomialNB":
133 classifier = BernoulliNB() 151 classifier = BernoulliNB()
...@@ -136,26 +154,26 @@ if __name__ == "__main__": ...@@ -136,26 +154,26 @@ if __name__ == "__main__":
136 elif args.classifier == "NearestCentroid": 154 elif args.classifier == "NearestCentroid":
137 classifier = NearestCentroid() 155 classifier = NearestCentroid()
138 156
139 - print(" Training...") 157 + print("Training...")
140 - classifier.fit(trainingMatrix, trainingClasses) 158 + classifier.fit(X_train, y_train)
141 - print(" Done!") 159 + print(" Done!")
142 160
143 - print(" Testing (prediction in new data)...") 161 + print("Testing (prediction in new data)...")
144 - y_pred = classifier.predict(testingMatrix) 162 + y_pred = classifier.predict(X_test)
145 - print(" Done!") 163 + print(" Done!")
146 164
147 - print(" Saving report...") 165 + print("Saving report...")
148 - with open(os.path.join(args.outputPath, args.outputFile), mode='w', encoding='utf8') as oFile: 166 + with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile:
149 oFile.write('********** EVALUATION REPORT **********\n') 167 oFile.write('********** EVALUATION REPORT **********\n')
150 oFile.write('Classifier: {}\n'.format(args.classifier)) 168 oFile.write('Classifier: {}\n'.format(args.classifier))
151 - oFile.write('Accuracy: {}\n'.format(accuracy_score(testingClasses, y_pred))) 169 + oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred)))
152 - oFile.write('Precision: {}\n'.format(precision_score(testingClasses, y_pred, average='weighted'))) 170 + oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted')))
153 - oFile.write('Recall: {}\n'.format(recall_score(testingClasses, y_pred, average='weighted'))) 171 + oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted')))
154 - oFile.write('F-score: {}\n'.format(f1_score(testingClasses, y_pred, average='weighted'))) 172 + oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted')))
155 oFile.write('Confusion matrix: \n') 173 oFile.write('Confusion matrix: \n')
156 - oFile.write(str(confusion_matrix(testingClasses, y_pred)) + '\n') 174 + oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n')
157 oFile.write('Classification report: \n') 175 oFile.write('Classification report: \n')
158 - oFile.write(classification_report(testingClasses, y_pred) + '\n') 176 + oFile.write(classification_report(y_test, y_pred) + '\n')
159 - print(" Done!") 177 + print(" Done!")
160 178
161 print("Training and testing done in: %fs" % (time() - t0)) 179 print("Training and testing done in: %fs" % (time() - t0))
......
...@@ -2,9 +2,7 @@ ...@@ -2,9 +2,7 @@
2 2
3 import os 3 import os
4 from time import time 4 from time import time
5 -# from optparse import OptionParser
6 import argparse 5 import argparse
7 -import sys
8 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 6 from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
9 from scipy.sparse import csr_matrix 7 from scipy.sparse import csr_matrix
10 from sklearn.metrics.pairwise import cosine_similarity 8 from sklearn.metrics.pairwise import cosine_similarity
......