Carlos-Francisco Méndez-Cruz

Training and testing binding thrombin dataset

...@@ -13,7 +13,7 @@ from scipy.sparse import csr_matrix ...@@ -13,7 +13,7 @@ from scipy.sparse import csr_matrix
13 13
14 __author__ = 'CMendezC' 14 __author__ = 'CMendezC'
15 15
16 -# Goal: training and validation binding thrombin data set 16 +# Goal: training, crossvalidation and testing binding thrombin data set
17 17
18 # Parameters: 18 # Parameters:
19 # 1) --inputPath Path to read input files. 19 # 1) --inputPath Path to read input files.
...@@ -26,13 +26,14 @@ __author__ = 'CMendezC' ...@@ -26,13 +26,14 @@ __author__ = 'CMendezC'
26 # 8) --outputReportFile File to place evaluation report. 26 # 8) --outputReportFile File to place evaluation report.
27 # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. 27 # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid.
28 # 10) --saveData Save matrices 28 # 10) --saveData Save matrices
29 +# 11) --kernel Kernel
29 30
30 # Output: 31 # Output:
31 # 1) Classification model and evaluation report. 32 # 1) Classification model and evaluation report.
32 33
33 # Execution: 34 # Execution:
34 35
35 -# python training-validation-binding-thrombin.py 36 +# python training-testing-binding-thrombin.py
36 # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset 37 # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset
37 # --inputTrainingData thrombin.data 38 # --inputTrainingData thrombin.data
38 # --inputTestingData Thrombin.testset 39 # --inputTestingData Thrombin.testset
...@@ -43,9 +44,10 @@ __author__ = 'CMendezC' ...@@ -43,9 +44,10 @@ __author__ = 'CMendezC'
43 # --outputReportFile SVM.txt 44 # --outputReportFile SVM.txt
44 # --classifier SVM 45 # --classifier SVM
45 # --saveData 46 # --saveData
47 +# --kernel linear
46 48
47 # source activate python3 49 # source activate python3
48 -# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData 50 +# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM
49 51
50 ########################################################### 52 ###########################################################
51 # MAIN PROGRAM # 53 # MAIN PROGRAM #
...@@ -108,10 +110,12 @@ if __name__ == "__main__": ...@@ -108,10 +110,12 @@ if __name__ == "__main__":
108 trainingData.append(listLine[1:]) 110 trainingData.append(listLine[1:])
109 # X_train = np.matrix(trainingData) 111 # X_train = np.matrix(trainingData)
110 X_train = csr_matrix(trainingData, dtype='double') 112 X_train = csr_matrix(trainingData, dtype='double')
113 + print(" Saving matrix and classes...")
111 joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) 114 joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
112 joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) 115 joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
116 + print(" Done!")
113 else: 117 else:
114 - print(" Saving matrix and classes...") 118 + print(" Loading matrix and classes...")
115 X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) 119 X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
116 y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) 120 y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
117 print(" Done!") 121 print(" Done!")
...@@ -138,10 +142,12 @@ if __name__ == "__main__": ...@@ -138,10 +142,12 @@ if __name__ == "__main__":
138 for line in iFile: 142 for line in iFile:
139 line = line.strip('\r\n') 143 line = line.strip('\r\n')
140 y_test.append(line) 144 y_test.append(line)
145 + print(" Saving matrix and classes...")
141 joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) 146 joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
142 joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) 147 joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
148 + print(" Done!")
143 else: 149 else:
144 - print(" Saving matrix and classes...") 150 + print(" Loading matrix and classes...")
145 X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) 151 X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
146 y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) 152 y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
147 print(" Done!") 153 print(" Done!")
......
1 +# -*- encoding: utf-8 -*-
2 +
3 +import os
4 +from time import time
5 +import argparse
6 +from sklearn.naive_bayes import BernoulliNB
7 +from sklearn.svm import SVC
8 +from sklearn.neighbors import KNeighborsClassifier
9 +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
10 + classification_report
11 +from sklearn.externals import joblib
12 +from scipy.sparse import csr_matrix
13 +
14 +__author__ = 'CMendezC'
15 +
16 +# Goal: training and testing binding thrombin data set
17 +
18 +# Parameters:
19 +# 1) --inputPath Path to read input files.
20 +# 2) --inputTrainingData File to read training data.
21 +# 3) --inputTestingData File to read testing data.
22 +# 4) --inputTestingClasses File to read testing classes.
23 +# 5) --outputModelPath Path to place output model.
24 +# 6) --outputModelFile File to place output model.
25 +# 7) --outputReportPath Path to place evaluation report.
26 +# 8) --outputReportFile File to place evaluation report.
27 +# 9) --classifier Classifier: BernoulliNB, SVM, kNN.
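28 +# 10) --saveData Read the raw input files and save the resulting matrices and classes; without this flag, previously saved matrices and classes are loaded from --outputModelPath.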
29 +
30 +# Output:
31 +# 1) Classification model and evaluation report.
32 +
33 +# Execution:
34 +
35 +# python training-testing-binding-thrombin.py
36 +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset
37 +# --inputTrainingData thrombin.data
38 +# --inputTestingData Thrombin.testset
39 +# --inputTestingClasses Thrombin.testset.class
40 +# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models
41 +# --outputModelFile SVM-model.mod
42 +# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports
43 +# --outputReportFile SVM.txt
44 +# --classifier SVM
45 +# --saveData
46 +
47 +# source activate python3
48 +# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM --saveData
49 +
50 +###########################################################
51 +# MAIN PROGRAM #
52 +###########################################################
53 +
54 +if __name__ == "__main__":
55 + # Parameter definition
56 + parser = argparse.ArgumentParser(description='Training and testing Binding Thrombin Dataset.')
57 + parser.add_argument("--inputPath", dest="inputPath",
58 + help="Path to read input files", metavar="PATH")
59 + parser.add_argument("--inputTrainingData", dest="inputTrainingData",
60 + help="File to read training data", metavar="FILE")
61 + parser.add_argument("--inputTestingData", dest="inputTestingData",
62 + help="File to read testing data", metavar="FILE")
63 + parser.add_argument("--inputTestingClasses", dest="inputTestingClasses",
64 + help="File to read testing classes", metavar="FILE")
65 + parser.add_argument("--outputModelPath", dest="outputModelPath",
66 + help="Path to place output model", metavar="PATH")
67 + parser.add_argument("--outputModelFile", dest="outputModelFile",
68 + help="File to place output model", metavar="FILE")
69 + parser.add_argument("--outputReportPath", dest="outputReportPath",
70 + help="Path to place evaluation report", metavar="PATH")
71 + parser.add_argument("--outputReportFile", dest="outputReportFile",
72 + help="File to place evaluation report", metavar="FILE")
73 + parser.add_argument("--classifier", dest="classifier",
74 + help="Classifier", metavar="NAME",
75 + choices=('BernoulliNB', 'SVM', 'kNN'), default='SVM')
76 + parser.add_argument("--saveData", dest="saveData", action='store_true',
77 + help="Save matrices")
78 +
79 + args = parser.parse_args()
80 +
81 + # Printing parameter values
82 + print('-------------------------------- PARAMETERS --------------------------------')
83 + print("Path to read input files: " + str(args.inputPath))
84 + print("File to read training data: " + str(args.inputTrainingData))
85 + print("File to read testing data: " + str(args.inputTestingData))
86 + print("File to read testing classes: " + str(args.inputTestingClasses))
87 + print("Path to place output model: " + str(args.outputModelPath))
88 + print("File to place output model: " + str(args.outputModelFile))
89 + print("Path to place evaluation report: " + str(args.outputReportPath))
90 + print("File to place evaluation report: " + str(args.outputReportFile))
91 + print("Classifier: " + str(args.classifier))
92 + print("Save matrices: " + str(args.saveData))
93 +
94 + # Start time
95 + t0 = time()
96 +
97 + print("Reading training data and true classes...")
98 + X_train = None
99 + if args.saveData:
100 + y_train = []
101 + trainingData = []
102 + with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \
103 + as iFile:
104 + for line in iFile:
105 + line = line.strip('\r\n')
106 + listLine = line.split(',')
107 + y_train.append(listLine[0])
108 + trainingData.append(listLine[1:])
109 + # X_train = np.matrix(trainingData)
110 + X_train = csr_matrix(trainingData, dtype='double')
111 + print(" Saving matrix and classes...")
112 + joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
113 + joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
114 + print(" Done!")
115 + else:
116 + print(" Loading matrix and classes...")
117 + X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
118 + y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
119 + print(" Done!")
120 +
121 + print(" Number of training classes: {}".format(len(y_train)))
122 + print(" Number of training class A: {}".format(y_train.count('A')))
123 + print(" Number of training class I: {}".format(y_train.count('I')))
124 + print(" Shape of training matrix: {}".format(X_train.shape))
125 +
126 + print("Reading testing data and true classes...")
127 + X_test = None
128 + if args.saveData:
129 + y_test = []
130 + testingData = []
131 + with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \
132 + as iFile:
133 + for line in iFile:
134 + line = line.strip('\r\n')
135 + listLine = line.split(',')
136 + testingData.append(listLine[1:])
137 + X_test = csr_matrix(testingData, dtype='double')
138 + with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \
139 + as iFile:
140 + for line in iFile:
141 + line = line.strip('\r\n')
142 + y_test.append(line)
143 + print(" Saving matrix and classes...")
144 + joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
145 + joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
146 + print(" Done!")
147 + else:
148 + print(" Loading matrix and classes...")
149 + X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
150 + y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
151 + print(" Done!")
152 +
153 + print(" Number of testing classes: {}".format(len(y_test)))
154 + print(" Number of testing class A: {}".format(y_test.count('A')))
155 + print(" Number of testing class I: {}".format(y_test.count('I')))
156 + print(" Shape of testing matrix: {}".format(X_test.shape))
157 +
158 + if args.classifier == "BernoulliNB":
159 + classifier = BernoulliNB()
160 + elif args.classifier == "SVM":
161 + classifier = SVC()
162 + elif args.classifier == "kNN":
163 + classifier = KNeighborsClassifier()
164 + else:
165 + print("Bad classifier")
166 + exit()
167 +
168 + print("Training...")
169 + classifier.fit(X_train, y_train)
170 + print(" Done!")
171 +
172 + print("Testing (prediction in new data)...")
173 + y_pred = classifier.predict(X_test)
174 + print(" Done!")
175 +
176 + print("Saving report...")
177 + with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile:
178 + oFile.write('********** EVALUATION REPORT **********\n')
179 + oFile.write('Classifier: {}\n'.format(args.classifier))
180 + oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred)))
181 + oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted')))
182 + oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted')))
183 + oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted')))
184 + oFile.write('Confusion matrix: \n')
185 + oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n')
186 + oFile.write('Classification report: \n')
187 + oFile.write(classification_report(y_test, y_pred) + '\n')
188 + print(" Done!")
189 +
190 + print("Training and testing done in: %fs" % (time() - t0))