Showing
1 changed file
with
323 additions
and
0 deletions
clasificacion-automatica/structural-domain-dataset/training-crossvalidation-testing-dom-v02.py
0 → 100644
1 | +# -*- encoding: utf-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from time import time | ||
5 | +import argparse | ||
6 | +import scipy | ||
7 | +from sklearn import model_selection | ||
8 | +from sklearn.naive_bayes import MultinomialNB, BernoulliNB | ||
9 | +from sklearn.svm import SVC | ||
10 | +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ | ||
11 | + classification_report, make_scorer | ||
12 | +from sklearn.externals import joblib | ||
13 | +from nltk.corpus import stopwords | ||
14 | +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | ||
15 | +from sklearn.feature_selection import SelectKBest, chi2 | ||
16 | +from sklearn.decomposition import TruncatedSVD | ||
17 | +from scipy.sparse import csr_matrix | ||
18 | + | ||
19 | + | ||
20 | +__author__ = 'CMendezC' | ||
21 | + | ||
22 | +# Goal: training, crossvalidation and testing transcription factor structural domain sentences | ||
23 | + | ||
24 | +# Parameters: | ||
25 | +# 1) --inputPath Path to read input files. | ||
26 | +# 2) --inputTrainingData File to read training data. | ||
27 | +# 3) --inputTrainingClasses File to read training classes. | ||
28 | +# 4) --inputTestingData File to read testing data. | ||
29 | +# 5) --inputTestingClasses File to read testing classes. | ||
30 | +# 6) --outputModelPath Path to place output model. | ||
31 | +# 7) --outputModelFile File to place output model. | ||
32 | +# 8) --outputReportPath Path to place evaluation report. | ||
33 | +# 9) --outputReportFile File to place evaluation report. | ||
34 | +# 10) --classifier Classifier: BernoulliNB, SVM, kNN. | ||
35 | +# 11) --saveData Save matrices | ||
36 | +# 12) --kernel Kernel | ||
37 | +# 13) --reduction Feature selection or dimensionality reduction | ||
38 | +# 14) --removeStopWords Remove most frequent words | ||
39 | +# 15) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. | ||
40 | + | ||
41 | + | ||
42 | +# Output: | ||
43 | +# 1) Classification model and evaluation report. | ||
44 | + | ||
45 | +# Execution: | ||
46 | + | ||
47 | +# source activate python3 | ||
48 | +# python training-crossvalidation-testing-dom.py | ||
49 | +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset | ||
50 | +# --inputTrainingData trainData.txt | ||
51 | +# --inputTrainingClasses trainClasses.txt | ||
52 | +# --inputTestingData testData.txt | ||
53 | +# --inputTestingClasses testClasses.txt | ||
54 | +# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/models | ||
55 | +# --outputModelFile SVM-lineal-model.mod | ||
56 | +# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/reports | ||
57 | +# --outputReportFile SVM-linear.txt | ||
58 | +# --classifier SVM | ||
59 | +# --saveData | ||
60 | +# --kernel linear | ||
61 | +# --reduction SVD200 | ||
62 | +# --removeStopWords | ||
63 | +# --vectorizer b | ||
64 | +# --ngrinitial 2 | ||
65 | +# --ngrfinal 2 | ||
66 | +# python training-crossvalidation-testing-dom.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset --inputTrainingData trainData.txt --inputTrainingClasses trainClasses.txt --inputTestingData testData.txt --inputTestingClasses testClasses.txt --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/models --outputModelFile SVM-lineal-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear --saveData --vectorizer b --ngrinitial 2 --ngrfinal 2 --removeStopWords | ||
67 | +# --reduction SVD200 | ||
68 | +# --removeStopWords | ||
69 | + | ||
70 | +########################################################### | ||
71 | +# MAIN PROGRAM # | ||
72 | +########################################################### | ||
73 | + | ||
def _read_lines(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Carriage-return and line-feed characters are stripped from both ends of
    every line; empty lines are kept as empty strings.
    """
    with open(path, encoding='utf8', mode='r') as iFile:
        return [line.strip('\r\n') for line in iFile]


def _build_vectorizer(kind, ngram_range, stop_words):
    """Return the unfitted vectorizer selected by --vectorizer.

    kind: 'b' -> binary term presence, 'f' -> raw term frequency,
          anything else -> tf-idf.
    """
    if kind == 'b':
        # Binary vectorizer
        return CountVectorizer(ngram_range=ngram_range, binary=True, stop_words=stop_words)
    if kind == 'f':
        # Frequency vectorizer
        return CountVectorizer(ngram_range=ngram_range, stop_words=stop_words)
    # TF-IDF vectorizer
    return TfidfVectorizer(ngram_range=ngram_range, stop_words=stop_words)


def _build_reducer(name):
    """Return the unfitted reducer/selector for a --reduction value.

    Raises ValueError for a name that is not one of the supported choices.
    """
    if name == 'SVD200':
        return TruncatedSVD(n_components=200, random_state=42)
    if name == 'SVD300':
        return TruncatedSVD(n_components=300, random_state=42)
    if name == 'CHI250':
        return SelectKBest(chi2, k=50)
    if name == 'CHI2100':
        return SelectKBest(chi2, k=100)
    raise ValueError("Unknown reduction: {}".format(name))


def _build_search(classifier_name, kernel, scorer, n_iter=20, cv=10, n_jobs=-1):
    """Return an unfitted RandomizedSearchCV for the requested classifier.

    classifier_name: 'SVM', 'BernoulliNB' or 'MultinomialNB'.
    kernel: SVM kernel ('linear', 'rbf' or 'poly'); ignored by the NB models.
    scorer: scoring callable handed to the search.

    Raises ValueError for an unsupported classifier name.
    """
    # "import scipy" alone does not guarantee that the stats submodule is
    # loaded, so import it explicitly before sampling from it.
    import scipy.stats

    if classifier_name == 'SVM':
        estimator = SVC()
        paramGrid = {'C': scipy.stats.expon(scale=100),
                     'kernel': [kernel],
                     'class_weight': ['balanced', None]}
        if kernel in ('rbf', 'poly'):
            paramGrid['gamma'] = scipy.stats.expon(scale=.1)
        if kernel == 'poly':
            paramGrid['degree'] = [2, 3]
    elif classifier_name == 'BernoulliNB':
        estimator = BernoulliNB()
        paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
    elif classifier_name == 'MultinomialNB':
        estimator = MultinomialNB()
        paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
    else:
        raise ValueError("Bad classifier")
    return model_selection.RandomizedSearchCV(estimator, paramGrid, n_iter=n_iter,
                                              cv=cv, n_jobs=n_jobs, verbose=3, scoring=scorer)


if __name__ == "__main__":
    # Parameter definition
    parser = argparse.ArgumentParser(description='Training validation structural domain dataset.')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to read input files", metavar="PATH")
    parser.add_argument("--inputTrainingData", dest="inputTrainingData",
                        help="File to read training data", metavar="FILE")
    parser.add_argument("--inputTrainingClasses", dest="inputTrainingClasses",
                        help="File to read training classes", metavar="FILE")
    parser.add_argument("--inputTestingData", dest="inputTestingData",
                        help="File to read testing data", metavar="FILE")
    parser.add_argument("--inputTestingClasses", dest="inputTestingClasses",
                        help="File to read testing classes", metavar="FILE")
    parser.add_argument("--outputModelPath", dest="outputModelPath",
                        help="Path to place output model", metavar="PATH")
    parser.add_argument("--outputModelFile", dest="outputModelFile",
                        help="File to place output model", metavar="FILE")
    parser.add_argument("--outputReportPath", dest="outputReportPath",
                        help="Path to place evaluation report", metavar="PATH")
    parser.add_argument("--outputReportFile", dest="outputReportFile",
                        help="File to place evaluation report", metavar="FILE")
    # Only classifiers that are actually implemented below are offered
    # ('kNN' was listed before but had no implementation).
    parser.add_argument("--classifier", dest="classifier",
                        help="Classifier", metavar="NAME",
                        choices=('BernoulliNB', 'MultinomialNB', 'SVM'), default='SVM')
    parser.add_argument("--saveData", dest="saveData", action='store_true',
                        help="Save matrices")
    parser.add_argument("--kernel", dest="kernel",
                        help="Kernel SVM", metavar="NAME",
                        choices=('linear', 'rbf', 'poly'), default='linear')
    parser.add_argument("--reduction", dest="reduction",
                        help="Feature selection or dimensionality reduction", metavar="NAME",
                        choices=('SVD200', 'SVD300', 'CHI250', 'CHI2100'), default=None)
    parser.add_argument("--removeStopWords", default=False,
                        action="store_true", dest="removeStopWords",
                        help="Remove stop words")
    parser.add_argument("--ngrinitial", type=int,
                        dest="ngrinitial", default=1,
                        help="Initial n-gram", metavar="INTEGER")
    parser.add_argument("--ngrfinal", type=int,
                        dest="ngrfinal", default=1,
                        help="Final n-gram", metavar="INTEGER")
    # Not "required": the declared default ('b') is meant to apply when omitted.
    parser.add_argument("--vectorizer", dest="vectorizer",
                        help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                        choices=('b', 'f', 't'), default='b')

    args = parser.parse_args()

    # Fail early, with a readable message, on arguments that would otherwise
    # crash later inside os.path.join() or the vectorizer.
    needed = ['inputTrainingData', 'inputTestingData', 'inputTestingClasses',
              'outputModelPath', 'outputReportPath', 'outputReportFile']
    if args.saveData:
        # Raw text files are only read when the matrices are (re)built.
        needed += ['inputPath', 'inputTrainingClasses']
    missing = ['--' + name for name in needed if getattr(args, name) is None]
    if missing:
        parser.error('Missing required arguments: ' + ', '.join(missing))
    if args.ngrinitial < 1 or args.ngrinitial > args.ngrfinal:
        parser.error('--ngrinitial must be >= 1 and <= --ngrfinal')

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("File to read training data: " + str(args.inputTrainingData))
    print("File to read training classes: " + str(args.inputTrainingClasses))
    print("File to read testing data: " + str(args.inputTestingData))
    print("File to read testing classes: " + str(args.inputTestingClasses))
    print("Path to place output model: " + str(args.outputModelPath))
    print("File to place output model: " + str(args.outputModelFile))
    print("Path to place evaluation report: " + str(args.outputReportPath))
    print("File to place evaluation report: " + str(args.outputReportFile))
    print("Classifier: " + str(args.classifier))
    print("Save matrices: " + str(args.saveData))
    print("Kernel: " + str(args.kernel))
    print("Reduction: " + str(args.reduction))
    print("Remove stop words: " + str(args.removeStopWords))
    print("Initial ngram: " + str(args.ngrinitial))
    print("Final ngram: " + str(args.ngrfinal))
    print("Vectorizer: " + str(args.vectorizer))

    # Start time
    t0 = time()

    pf = stopwords.words('english') if args.removeStopWords else None

    # Cache files for the vectorized matrices and their classes. The names are
    # kept exactly as before so previously saved matrices can still be loaded.
    trainMatrixFile = os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')
    trainClassFile = os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')
    testMatrixFile = os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')
    testClassFile = os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')

    if args.saveData:
        print("Reading training data and true classes...")
        y_train = _read_lines(os.path.join(args.inputPath, args.inputTrainingClasses))
        trainingData = _read_lines(os.path.join(args.inputPath, args.inputTrainingData))
        print(" Done!")

        print("Reading testing data and true classes...")
        y_test = _read_lines(os.path.join(args.inputPath, args.inputTestingClasses))
        testingData = _read_lines(os.path.join(args.inputPath, args.inputTestingData))
        print(" Done!")

        # Create vectorizer
        print('Vectorization: {}'.format(args.vectorizer))
        vectorizer = _build_vectorizer(args.vectorizer, (args.ngrinitial, args.ngrfinal), pf)

        # The vocabulary is learned on the training set only and then applied
        # unchanged to the testing set.
        X_train = csr_matrix(vectorizer.fit_transform(trainingData), dtype='double')
        X_test = csr_matrix(vectorizer.transform(testingData), dtype='double')

        print(" Saving matrix and classes...")
        joblib.dump(X_train, trainMatrixFile)
        joblib.dump(y_train, trainClassFile)
        joblib.dump(X_test, testMatrixFile)
        joblib.dump(y_test, testClassFile)
        print(" Done!")
    else:
        print(" Loading matrix and classes...")
        X_train = joblib.load(trainMatrixFile)
        y_train = joblib.load(trainClassFile)
        X_test = joblib.load(testMatrixFile)
        y_test = joblib.load(testClassFile)
        print(" Done!")

    print(" Number of training classes: {}".format(len(y_train)))
    print(" Number of training class DOM: {}".format(y_train.count('DOM')))
    print(" Number of training class OTHER: {}".format(y_train.count('OTHER')))
    print(" Shape of training matrix: {}".format(X_train.shape))

    print(" Number of testing classes: {}".format(len(y_test)))
    print(" Number of testing class DOM: {}".format(y_test.count('DOM')))
    print(" Number of testing class OTHER: {}".format(y_test.count('OTHER')))
    print(" Shape of testing matrix: {}".format(X_test.shape))

    # Feature selection and dimensional reduction
    reduc = None
    if args.reduction is not None:
        print('Performing dimensionality reduction or feature selection...', args.reduction)
        reduc = _build_reducer(args.reduction)
        # y_train is required by the chi2 selector and ignored by TruncatedSVD.
        X_train = reduc.fit_transform(X_train, y_train)
        print(" Done!")
        print(' New shape of training matrix: ', X_train.shape)

    # New performance scorer
    myScorer = make_scorer(f1_score, average='weighted')
    print("Defining randomized grid search...")
    try:
        myClassifier = _build_search(args.classifier, args.kernel, myScorer,
                                     n_iter=20, cv=10, n_jobs=-1)
    except ValueError as err:
        # Exit with a non-zero status so callers can detect the failure.
        raise SystemExit(str(err))
    print(" Done!")

    print("Training...")
    myClassifier.fit(X_train, y_train)
    print(" Done!")

    print("Getting best model and hyperparameters")
    print('Best score {}: {}\n'.format(myScorer, myClassifier.best_score_))
    print('Best parameters:\n')
    theBestClassifier = myClassifier.best_estimator_
    best_parameters = theBestClassifier.get_params()
    for param in sorted(best_parameters.keys()):
        print("\t%s: %r\n" % (param, best_parameters[param]))
    print(str(theBestClassifier) + '\n')

    # Persist the best model when an output file was requested; before, the
    # --outputModelFile argument was accepted but never used.
    if args.outputModelFile is not None:
        print("Saving model...")
        joblib.dump(theBestClassifier, os.path.join(args.outputModelPath, args.outputModelFile))
        print(" Done!")

    print("Testing (prediction in new data)...")
    if reduc is not None:
        # Apply the reduction fitted on the training data; never refit on test data.
        X_test = reduc.transform(X_test)
    y_pred = myClassifier.predict(X_test)
    print(" Done!")

    print("Saving report...")
    with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile:
        oFile.write('********** EVALUATION REPORT **********\n')
        oFile.write('Reduction: {}\n'.format(args.reduction))
        oFile.write('Classifier: {}\n'.format(args.classifier))
        oFile.write('Kernel: {}\n'.format(args.kernel))
        oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred)))
        oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted')))
        oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted')))
        oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted')))
        oFile.write('Confusion matrix: \n')
        oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n')
        oFile.write('Classification report: \n')
        oFile.write(classification_report(y_test, y_pred) + '\n')
        oFile.write('Best parameters: \n')
        for param in sorted(best_parameters.keys()):
            oFile.write("\t%s: %r\n" % (param, best_parameters[param]))
        # Appended last so the layout of the existing report lines is unchanged.
        oFile.write('Best cross-validation score (weighted F1): {}\n'.format(myClassifier.best_score_))

    print(" Done!")

    print("Training and testing done in: %fs" % (time() - t0))
-
Please register or login to post a comment