Carlos-Francisco Méndez-Cruz

Training, crossvalidation and testing binding thrombin dataset

...@@ -10,7 +10,9 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc ...@@ -10,7 +10,9 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc
10 classification_report 10 classification_report
11 from sklearn.externals import joblib 11 from sklearn.externals import joblib
12 from sklearn import model_selection 12 from sklearn import model_selection
13 +from sklearn.feature_selection import SelectKBest, chi2
13 from scipy.sparse import csr_matrix 14 from scipy.sparse import csr_matrix
15 +from sklearn.decomposition import TruncatedSVD
14 import scipy 16 import scipy
15 17
16 __author__ = 'CMendezC' 18 __author__ = 'CMendezC'
...@@ -29,13 +31,14 @@ __author__ = 'CMendezC' ...@@ -29,13 +31,14 @@ __author__ = 'CMendezC'
29 # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. 31 # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid.
30 # 10) --saveData Save matrices 32 # 10) --saveData Save matrices
31 # 11) --kernel Kernel 33 # 11) --kernel Kernel
34 +# 12) --reduction Feature selection or dimensionality reduction
32 35
33 # Output: 36 # Output:
34 # 1) Classification model and evaluation report. 37 # 1) Classification model and evaluation report.
35 38
36 # Execution: 39 # Execution:
37 40
38 -# python training-testing-binding-thrombin.py 41 +# python training-crossvalidation-testing-binding-thrombin.py
39 # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset 42 # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset
40 # --inputTrainingData thrombin.data 43 # --inputTrainingData thrombin.data
41 # --inputTestingData Thrombin.testset 44 # --inputTestingData Thrombin.testset
...@@ -47,9 +50,10 @@ __author__ = 'CMendezC' ...@@ -47,9 +50,10 @@ __author__ = 'CMendezC'
47 # --classifier SVM 50 # --classifier SVM
48 # --saveData 51 # --saveData
49 # --kernel linear 52 # --kernel linear
53 +# --reduction SVD200
50 54
51 # source activate python3 55 # source activate python3
52 -# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear 56 +# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear --reduction SVD200
53 57
54 ########################################################### 58 ###########################################################
55 # MAIN PROGRAM # 59 # MAIN PROGRAM #
...@@ -82,6 +86,9 @@ if __name__ == "__main__": ...@@ -82,6 +86,9 @@ if __name__ == "__main__":
82 parser.add_argument("--kernel", dest="kernel", 86 parser.add_argument("--kernel", dest="kernel",
83 help="Kernel SVM", metavar="NAME", 87 help="Kernel SVM", metavar="NAME",
84 choices=('linear', 'rbf', 'poly'), default='linear') 88 choices=('linear', 'rbf', 'poly'), default='linear')
89 + parser.add_argument("--reduction", dest="reduction",
90 + help="Feature selection or dimensionality reduction", metavar="NAME",
91 + choices=('SVD200', 'SVD300', 'CHI250', 'CHI2100'), default='SVD200')
85 92
86 args = parser.parse_args() 93 args = parser.parse_args()
87 94
...@@ -98,6 +105,7 @@ if __name__ == "__main__": ...@@ -98,6 +105,7 @@ if __name__ == "__main__":
98 print("Classifier: " + str(args.classifier)) 105 print("Classifier: " + str(args.classifier))
99 print("Save matrices: " + str(args.saveData)) 106 print("Save matrices: " + str(args.saveData))
100 print("Kernel: " + str(args.kernel)) 107 print("Kernel: " + str(args.kernel))
108 + print("Reduction: " + str(args.reduction))
101 109
102 # Start time 110 # Start time
103 t0 = time() 111 t0 = time()
...@@ -163,6 +171,24 @@ if __name__ == "__main__": ...@@ -163,6 +171,24 @@ if __name__ == "__main__":
163 print(" Number of testing class I: {}".format(y_test.count('I'))) 171 print(" Number of testing class I: {}".format(y_test.count('I')))
164 print(" Shape of testing matrix: {}".format(X_test.shape)) 172 print(" Shape of testing matrix: {}".format(X_test.shape))
165 173
174 + # Feature selection and dimensionality reduction
175 + if args.reduction is not None:
176 + print('Performing dimensionality reduction or feature selection...', args.reduction)
177 + if args.reduction == 'SVD200':
178 + reduc = TruncatedSVD(n_components=200, random_state=42)
179 + X_train = reduc.fit_transform(X_train)
180 + if args.reduction == 'SVD300':
181 + reduc = TruncatedSVD(n_components=300, random_state=42)
182 + X_train = reduc.fit_transform(X_train)
183 + elif args.reduction == 'CHI250':
184 + reduc = SelectKBest(chi2, k=50)
185 + X_train = reduc.fit_transform(X_train, y_train)
186 + elif args.reduction == 'CHI2100':
187 + reduc = SelectKBest(chi2, k=100)
188 + X_train = reduc.fit_transform(X_train, y_train)
189 + print(" Done!")
190 + print(' New shape of training matrix: ', X_train.shape)
191 +
166 jobs = -1 192 jobs = -1
167 paramGrid = [] 193 paramGrid = []
168 nIter = 20 194 nIter = 20
......