Training, crossvalidation and testing binding thrombin dataset
Showing
1 changed file
with
28 additions
and
2 deletions
... | @@ -10,7 +10,9 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc | ... | @@ -10,7 +10,9 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc |
10 | classification_report | 10 | classification_report |
11 | from sklearn.externals import joblib | 11 | from sklearn.externals import joblib |
12 | from sklearn import model_selection | 12 | from sklearn import model_selection |
13 | +from sklearn.feature_selection import SelectKBest, chi2 | ||
13 | from scipy.sparse import csr_matrix | 14 | from scipy.sparse import csr_matrix |
15 | +from sklearn.decomposition import TruncatedSVD | ||
14 | import scipy | 16 | import scipy |
15 | 17 | ||
16 | __author__ = 'CMendezC' | 18 | __author__ = 'CMendezC' |
... | @@ -29,13 +31,14 @@ __author__ = 'CMendezC' | ... | @@ -29,13 +31,14 @@ __author__ = 'CMendezC' |
29 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. | 31 | # 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid. |
30 | # 10) --saveData Save matrices | 32 | # 10) --saveData Save matrices |
31 | # 11) --kernel Kernel | 33 | # 11) --kernel Kernel |
34 | +# 12) --reduction Feature selection or dimensionality reduction | ||
32 | 35 | ||
33 | # Output: | 36 | # Output: |
34 | # 1) Classification model and evaluation report. | 37 | # 1) Classification model and evaluation report. |
35 | 38 | ||
36 | # Execution: | 39 | # Execution: |
37 | 40 | ||
38 | -# python training-testing-binding-thrombin.py | 41 | +# python training-crossvalidation-testing-binding-thrombin.py |
39 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset | 42 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset |
40 | # --inputTrainingData thrombin.data | 43 | # --inputTrainingData thrombin.data |
41 | # --inputTestingData Thrombin.testset | 44 | # --inputTestingData Thrombin.testset |
... | @@ -47,9 +50,10 @@ __author__ = 'CMendezC' | ... | @@ -47,9 +50,10 @@ __author__ = 'CMendezC' |
47 | # --classifier SVM | 50 | # --classifier SVM |
48 | # --saveData | 51 | # --saveData |
49 | # --kernel linear | 52 | # --kernel linear |
53 | +# --reduction SVD200 | ||
50 | 54 | ||
51 | # source activate python3 | 55 | # source activate python3 |
52 | -# python training-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear | 56 | +# python training-crossvalidation-testing-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-linear-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear --reduction SVD200 |
53 | 57 | ||
54 | ########################################################### | 58 | ########################################################### |
55 | # MAIN PROGRAM # | 59 | # MAIN PROGRAM # |
... | @@ -82,6 +86,9 @@ if __name__ == "__main__": | ... | @@ -82,6 +86,9 @@ if __name__ == "__main__": |
82 | parser.add_argument("--kernel", dest="kernel", | 86 | parser.add_argument("--kernel", dest="kernel", |
83 | help="Kernel SVM", metavar="NAME", | 87 | help="Kernel SVM", metavar="NAME", |
84 | choices=('linear', 'rbf', 'poly'), default='linear') | 88 | choices=('linear', 'rbf', 'poly'), default='linear') |
89 | + parser.add_argument("--reduction", dest="reduction", | ||
90 | + help="Feature selection or dimensionality reduction", metavar="NAME", | ||
91 | + choices=('SVD200', 'SVD300', 'CHI250', 'CHI2100'), default='SVD200') | ||
85 | 92 | ||
86 | args = parser.parse_args() | 93 | args = parser.parse_args() |
87 | 94 | ||
... | @@ -98,6 +105,7 @@ if __name__ == "__main__": | ... | @@ -98,6 +105,7 @@ if __name__ == "__main__": |
98 | print("Classifier: " + str(args.classifier)) | 105 | print("Classifier: " + str(args.classifier)) |
99 | print("Save matrices: " + str(args.saveData)) | 106 | print("Save matrices: " + str(args.saveData)) |
100 | print("Kernel: " + str(args.kernel)) | 107 | print("Kernel: " + str(args.kernel)) |
108 | + print("Reduction: " + str(args.reduction)) | ||
101 | 109 | ||
102 | # Start time | 110 | # Start time |
103 | t0 = time() | 111 | t0 = time() |
... | @@ -163,6 +171,24 @@ if __name__ == "__main__": | ... | @@ -163,6 +171,24 @@ if __name__ == "__main__": |
163 | print(" Number of testing class I: {}".format(y_test.count('I'))) | 171 | print(" Number of testing class I: {}".format(y_test.count('I'))) |
164 | print(" Shape of testing matrix: {}".format(X_test.shape)) | 172 | print(" Shape of testing matrix: {}".format(X_test.shape)) |
165 | 173 | ||
174 | + # Feature selection and dimensionality reduction | ||
175 | + if args.reduction is not None: | ||
176 | + print('Performing dimensionality reduction or feature selection...', args.reduction) | ||
177 | + if args.reduction == 'SVD200': | ||
178 | + reduc = TruncatedSVD(n_components=200, random_state=42) | ||
179 | + X_train = reduc.fit_transform(X_train) | ||
180 | + if args.reduction == 'SVD300': | ||
181 | + reduc = TruncatedSVD(n_components=300, random_state=42) | ||
182 | + X_train = reduc.fit_transform(X_train) | ||
183 | + elif args.reduction == 'CHI250': | ||
184 | + reduc = SelectKBest(chi2, k=50) | ||
185 | + X_train = reduc.fit_transform(X_train, y_train) | ||
186 | + elif args.reduction == 'CHI2100': | ||
187 | + reduc = SelectKBest(chi2, k=100) | ||
188 | + X_train = reduc.fit_transform(X_train, y_train) | ||
189 | + print(" Done!") | ||
190 | + print(' New shape of training matrix: ', X_train.shape) | ||
191 | + | ||
166 | jobs = -1 | 192 | jobs = -1 |
167 | paramGrid = [] | 193 | paramGrid = [] |
168 | nIter = 20 | 194 | nIter = 20 | ... | ... |
-
Please register or login to post a comment