Showing
1 changed file
with
323 additions
and
0 deletions
clasificacion-automatica/structural-domain-dataset/training-crossvalidation-testing-dom-v02.py
0 → 100644
| 1 | +# -*- encoding: utf-8 -*- | ||
| 2 | + | ||
| 3 | +import os | ||
| 4 | +from time import time | ||
| 5 | +import argparse | ||
| 6 | +import scipy | ||
| 7 | +from sklearn import model_selection | ||
| 8 | +from sklearn.naive_bayes import MultinomialNB, BernoulliNB | ||
| 9 | +from sklearn.svm import SVC | ||
| 10 | +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \ | ||
| 11 | + classification_report, make_scorer | ||
| 12 | +from sklearn.externals import joblib | ||
| 13 | +from nltk.corpus import stopwords | ||
| 14 | +from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | ||
| 15 | +from sklearn.feature_selection import SelectKBest, chi2 | ||
| 16 | +from sklearn.decomposition import TruncatedSVD | ||
| 17 | +from scipy.sparse import csr_matrix | ||
| 18 | + | ||
| 19 | + | ||
| 20 | +__author__ = 'CMendezC' | ||
| 21 | + | ||
| 22 | +# Goal: training, crossvalidation and testing transcription factor structural domain sentences | ||
| 23 | + | ||
| 24 | +# Parameters: | ||
| 25 | +# 1) --inputPath Path to read input files. | ||
| 26 | +# 2) --inputTrainingData File to read training data. | ||
| 27 | +# 3) --inputTrainingClasses File to read training classes. | ||
| 28 | +# 4) --inputTestingData File to read testing data. | ||
| 29 | +# 5) --inputTestingClasses File to read testing classes. | ||
| 30 | +# 6) --outputModelPath Path to place output model. | ||
| 31 | +# 7) --outputModelFile File to place output model. | ||
| 32 | +# 8) --outputReportPath Path to place evaluation report. | ||
| 33 | +# 9) --outputReportFile File to place evaluation report. | ||
| 34 | +# 10) --classifier Classifier: BernoulliNB, SVM, kNN. | ||
| 35 | +# 11) --saveData Vectorize the text files and save the matrices; without it, previously saved matrices are loaded. | ||
| 36 | +# 12) --kernel SVM kernel: linear, rbf, poly. | ||
| 37 | +# 13) --reduction Feature selection or dimensionality reduction: SVD200, SVD300, CHI250, CHI2100. | ||
| 38 | +# 14) --removeStopWords Remove English stop words (NLTK list). | ||
| 39 | +# 15) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. | ||
| 40 | +# 16) --ngrinitial, --ngrfinal Smallest and largest n-gram size (both default to 1). | ||
| 41 | + | ||
| 42 | +# Output: | ||
| 43 | +# 1) Classification model and evaluation report. | ||
| 44 | + | ||
| 45 | +# Execution: | ||
| 46 | + | ||
| 47 | +# source activate python3 | ||
| 48 | +# python training-crossvalidation-testing-dom-v02.py | ||
| 49 | +# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset | ||
| 50 | +# --inputTrainingData trainData.txt | ||
| 51 | +# --inputTrainingClasses trainClasses.txt | ||
| 52 | +# --inputTestingData testData.txt | ||
| 53 | +# --inputTestingClasses testClasses.txt | ||
| 54 | +# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/models | ||
| 55 | +# --outputModelFile SVM-lineal-model.mod | ||
| 56 | +# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/reports | ||
| 57 | +# --outputReportFile SVM-linear.txt | ||
| 58 | +# --classifier SVM | ||
| 59 | +# --saveData | ||
| 60 | +# --kernel linear | ||
| 61 | +# --reduction SVD200 | ||
| 62 | +# --removeStopWords | ||
| 63 | +# --vectorizer b | ||
| 64 | +# --ngrinitial 2 | ||
| 65 | +# --ngrfinal 2 | ||
| 66 | +# python training-crossvalidation-testing-dom-v02.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset --inputTrainingData trainData.txt --inputTrainingClasses trainClasses.txt --inputTestingData testData.txt --inputTestingClasses testClasses.txt --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/models --outputModelFile SVM-lineal-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear --saveData --vectorizer b --ngrinitial 2 --ngrfinal 2 --removeStopWords | ||
| 67 | +# --reduction SVD200 | ||
| 68 | +# --removeStopWords | ||
| 69 | + | ||
| 70 | +########################################################### | ||
| 71 | +# MAIN PROGRAM # | ||
| 72 | +########################################################### | ||
| 73 | + | ||
| 74 | +if __name__ == "__main__": | ||
| 75 | + # Parameter definition | ||
| 76 | + parser = argparse.ArgumentParser(description='Training validation structural domain dataset.') | ||
| 77 | + parser.add_argument("--inputPath", dest="inputPath", | ||
| 78 | + help="Path to read input files", metavar="PATH") | ||
| 79 | + parser.add_argument("--inputTrainingData", dest="inputTrainingData", | ||
| 80 | + help="File to read training data", metavar="FILE") | ||
| 81 | + parser.add_argument("--inputTrainingClasses", dest="inputTrainingClasses", | ||
| 82 | + help="File to read training classes", metavar="FILE") | ||
| 83 | + parser.add_argument("--inputTestingData", dest="inputTestingData", | ||
| 84 | + help="File to read testing data", metavar="FILE") | ||
| 85 | + parser.add_argument("--inputTestingClasses", dest="inputTestingClasses", | ||
| 86 | + help="File to read testing classes", metavar="FILE") | ||
| 87 | + parser.add_argument("--outputModelPath", dest="outputModelPath", | ||
| 88 | + help="Path to place output model", metavar="PATH") | ||
| 89 | + parser.add_argument("--outputModelFile", dest="outputModelFile", | ||
| 90 | + help="File to place output model", metavar="FILE") | ||
| 91 | + parser.add_argument("--outputReportPath", dest="outputReportPath", | ||
| 92 | + help="Path to place evaluation report", metavar="PATH") | ||
| 93 | + parser.add_argument("--outputReportFile", dest="outputReportFile", | ||
| 94 | + help="File to place evaluation report", metavar="FILE") | ||
| 95 | + parser.add_argument("--classifier", dest="classifier", | ||
| 96 | + help="Classifier", metavar="NAME", | ||
| 97 | + choices=('BernoulliNB', 'SVM', 'kNN'), default='SVM') | ||
| 98 | + parser.add_argument("--saveData", dest="saveData", action='store_true', | ||
| 99 | + help="Save matrices") | ||
| 100 | + parser.add_argument("--kernel", dest="kernel", | ||
| 101 | + help="Kernel SVM", metavar="NAME", | ||
| 102 | + choices=('linear', 'rbf', 'poly'), default='linear') | ||
| 103 | + parser.add_argument("--reduction", dest="reduction", | ||
| 104 | + help="Feature selection or dimensionality reduction", metavar="NAME", | ||
| 105 | + choices=('SVD200', 'SVD300', 'CHI250', 'CHI2100'), default=None) | ||
| 106 | + parser.add_argument("--removeStopWords", default=False, | ||
| 107 | + action="store_true", dest="removeStopWords", | ||
| 108 | + help="Remove stop words") | ||
| 109 | + parser.add_argument("--ngrinitial", type=int, | ||
| 110 | + dest="ngrinitial", default=1, | ||
| 111 | + help="Initial n-gram", metavar="INTEGER") | ||
| 112 | + parser.add_argument("--ngrfinal", type=int, | ||
| 113 | + dest="ngrfinal", default=1, | ||
| 114 | + help="Final n-gram", metavar="INTEGER") | ||
| 115 | + parser.add_argument("--vectorizer", dest="vectorizer", required=True, | ||
| 116 | + help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", | ||
| 117 | + choices=('b', 'f', 't'), default='b') | ||
| 118 | + | ||
| 119 | + args = parser.parse_args() | ||
| 120 | + | ||
| 121 | + # Printing parameter values | ||
| 122 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
| 123 | + print("Path to read input files: " + str(args.inputPath)) | ||
| 124 | + print("File to read training data: " + str(args.inputTrainingData)) | ||
| 125 | + print("File to read training classes: " + str(args.inputTrainingClasses)) | ||
| 126 | + print("File to read testing data: " + str(args.inputTestingData)) | ||
| 127 | + print("File to read testing classes: " + str(args.inputTestingClasses)) | ||
| 128 | + print("Path to place output model: " + str(args.outputModelPath)) | ||
| 129 | + print("File to place output model: " + str(args.outputModelFile)) | ||
| 130 | + print("Path to place evaluation report: " + str(args.outputReportPath)) | ||
| 131 | + print("File to place evaluation report: " + str(args.outputReportFile)) | ||
| 132 | + print("Classifier: " + str(args.classifier)) | ||
| 133 | + print("Save matrices: " + str(args.saveData)) | ||
| 134 | + print("Kernel: " + str(args.kernel)) | ||
| 135 | + print("Reduction: " + str(args.reduction)) | ||
| 136 | + print("Remove stop words: " + str(args.removeStopWords)) | ||
| 137 | + print("Initial ngram: " + str(args.ngrinitial)) | ||
| 138 | + print("Final ngram: " + str(args.ngrfinal)) | ||
| 139 | + print("Vectorizer: " + str(args.vectorizer)) | ||
| 140 | + | ||
| 141 | +    # Start time: the total elapsed seconds are printed by the last statement of the script | ||
| 142 | +    t0 = time() | ||
| 143 | + | ||
| 144 | +    if args.removeStopWords: | ||
| 145 | +        pf = stopwords.words('english') | ||
| 146 | +    else: | ||
| 147 | +        pf = None | ||
| 148 | + | ||
| 149 | +    y_train = [] | ||
| 150 | +    trainingData = [] | ||
| 151 | +    y_test = [] | ||
| 152 | +    testingData = [] | ||
| 153 | +    X_train = None | ||
| 154 | +    X_test = None | ||
| 155 | + | ||
| 156 | +    if args.saveData: | ||
| 157 | +        print("Reading training data and true classes...") | ||
| 158 | +        with open(os.path.join(args.inputPath, args.inputTrainingClasses), encoding='utf8', mode='r') \ | ||
| 159 | +                as iFile: | ||
| 160 | +            for line in iFile: | ||
| 161 | +                line = line.strip('\r\n') | ||
| 162 | +                y_train.append(line) | ||
| 163 | +        with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \ | ||
| 164 | +                as iFile: | ||
| 165 | +            for line in iFile: | ||
| 166 | +                line = line.strip('\r\n') | ||
| 167 | +                trainingData.append(line) | ||
| 168 | +        print("   Done!") | ||
| 169 | + | ||
| 170 | +        print("Reading testing data and true classes...") | ||
| 171 | +        with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \ | ||
| 172 | +                as iFile: | ||
| 173 | +            for line in iFile: | ||
| 174 | +                line = line.strip('\r\n') | ||
| 175 | +                y_test.append(line) | ||
| 176 | +        with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \ | ||
| 177 | +                as iFile: | ||
| 178 | +            for line in iFile: | ||
| 179 | +                line = line.strip('\r\n') | ||
| 180 | +                testingData.append(line) | ||
| 181 | +        print("   Done!") | ||
| 182 | + | ||
| 183 | +        # Create vectorizer: n-gram range comes from --ngrinitial/--ngrfinal; pf is the NLTK English stop-word list or None | ||
| 184 | +        print('Vectorization: {}'.format(args.vectorizer)) | ||
| 185 | +        if args.vectorizer == "b": | ||
| 186 | +            # Binary vectorizer: presence/absence of each n-gram (binary=True) | ||
| 187 | +            vectorizer = CountVectorizer(ngram_range=(args.ngrinitial, args.ngrfinal), binary=True, stop_words=pf) | ||
| 188 | +        elif args.vectorizer == "f": | ||
| 189 | +            # Frequency vectorizer: raw n-gram counts | ||
| 190 | +            vectorizer = CountVectorizer(ngram_range=(args.ngrinitial, args.ngrfinal), stop_words=pf) | ||
| 191 | +        else: | ||
| 192 | +            # TF-IDF vectorizer (the remaining --vectorizer choice, "t") | ||
| 193 | +            vectorizer = TfidfVectorizer(ngram_range=(args.ngrinitial, args.ngrfinal), stop_words=pf) | ||
| 194 | + | ||
| 195 | +        X_train = csr_matrix(vectorizer.fit_transform(trainingData), dtype='double') | ||
| 196 | +        X_test = csr_matrix(vectorizer.transform(testingData), dtype='double') | ||
| 197 | + | ||
| 198 | +        print("   Saving matrix and classes...") | ||
| 199 | +        joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) | ||
| 200 | +        joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) | ||
| 201 | +        joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | ||
| 202 | +        joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | ||
| 203 | +        print("   Done!") | ||
| 204 | +    else: | ||
| 205 | +        print("   Loading matrix and classes...") | ||
| 206 | +        X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb')) | ||
| 207 | +        y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb')) | ||
| 208 | +        X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb')) | ||
| 209 | +        y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb')) | ||
| 210 | +        print("   Done!") | ||
| 211 | + | ||
| 212 | +    print("   Number of training classes: {}".format(len(y_train))) | ||
| 213 | +    print("   Number of training class DOM: {}".format(y_train.count('DOM'))) | ||
| 214 | +    print("   Number of training class OTHER: {}".format(y_train.count('OTHER'))) | ||
| 215 | +    print("   Shape of training matrix: {}".format(X_train.shape)) | ||
| 216 | + | ||
| 217 | +    print("   Number of testing classes: {}".format(len(y_test))) | ||
| 218 | +    print("   Number of testing class DOM: {}".format(y_test.count('DOM'))) | ||
| 219 | +    print("   Number of testing class OTHER: {}".format(y_test.count('OTHER'))) | ||
| 220 | +    print("   Shape of testing matrix: {}".format(X_test.shape)) | ||
| 221 | + | ||
| 222 | + # Feature selection and dimensional reduction | ||
| 223 | + if args.reduction is not None: | ||
| 224 | + print('Performing dimensionality reduction or feature selection...', args.reduction) | ||
| 225 | + if args.reduction == 'SVD200': | ||
| 226 | + reduc = TruncatedSVD(n_components=200, random_state=42) | ||
| 227 | + X_train = reduc.fit_transform(X_train) | ||
| 228 | + if args.reduction == 'SVD300': | ||
| 229 | + reduc = TruncatedSVD(n_components=300, random_state=42) | ||
| 230 | + X_train = reduc.fit_transform(X_train) | ||
| 231 | + elif args.reduction == 'CHI250': | ||
| 232 | + reduc = SelectKBest(chi2, k=50) | ||
| 233 | + X_train = reduc.fit_transform(X_train, y_train) | ||
| 234 | + elif args.reduction == 'CHI2100': | ||
| 235 | + reduc = SelectKBest(chi2, k=100) | ||
| 236 | + X_train = reduc.fit_transform(X_train, y_train) | ||
| 237 | + print(" Done!") | ||
| 238 | + print(' New shape of training matrix: ', X_train.shape) | ||
| 239 | + | ||
| 240 | + jobs = -1 | ||
| 241 | + paramGrid = [] | ||
| 242 | + nIter = 20 | ||
| 243 | + crossV = 10 | ||
| 244 | + # New performance scorer | ||
| 245 | + myScorer = make_scorer(f1_score, average='weighted') | ||
| 246 | + print("Defining randomized grid search...") | ||
| 247 | + if args.classifier == 'SVM': | ||
| 248 | + # SVM | ||
| 249 | + classifier = SVC() | ||
| 250 | + if args.kernel == 'rbf': | ||
| 251 | + paramGrid = {'C': scipy.stats.expon(scale=100), | ||
| 252 | + 'gamma': scipy.stats.expon(scale=.1), | ||
| 253 | + 'kernel': ['rbf'], 'class_weight': ['balanced', None]} | ||
| 254 | + elif args.kernel == 'linear': | ||
| 255 | + paramGrid = {'C': scipy.stats.expon(scale=100), | ||
| 256 | + 'kernel': ['linear'], | ||
| 257 | + 'class_weight': ['balanced', None]} | ||
| 258 | + elif args.kernel == 'poly': | ||
| 259 | + paramGrid = {'C': scipy.stats.expon(scale=100), | ||
| 260 | + 'gamma': scipy.stats.expon(scale=.1), 'degree': [2, 3], | ||
| 261 | + 'kernel': ['poly'], 'class_weight': ['balanced', None]} | ||
| 262 | + myClassifier = model_selection.RandomizedSearchCV(classifier, | ||
| 263 | + paramGrid, n_iter=nIter, | ||
| 264 | + cv=crossV, n_jobs=jobs, verbose=3, scoring=myScorer) | ||
| 265 | + elif args.classifier == 'BernoulliNB': | ||
| 266 | + # BernoulliNB | ||
| 267 | + classifier = BernoulliNB() | ||
| 268 | + paramGrid = {'alpha': scipy.stats.expon(scale=1.0)} | ||
| 269 | + myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter, | ||
| 270 | + cv=crossV, n_jobs=jobs, verbose=3, scoring=myScorer) | ||
| 271 | + elif args.classifier == 'MultinomialNB': | ||
| 272 | + # MultinomialNB | ||
| 273 | + classifier = MultinomialNB() | ||
| 274 | + paramGrid = {'alpha': scipy.stats.expon(scale=1.0)} | ||
| 275 | + myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter, | ||
| 276 | + cv=crossV, n_jobs=jobs, verbose=3, scoring=myScorer) | ||
| 277 | + else: | ||
| 278 | + print("Bad classifier") | ||
| 279 | + exit() | ||
| 280 | + print(" Done!") | ||
| 281 | + | ||
| 282 | + print("Training...") | ||
| 283 | + myClassifier.fit(X_train, y_train) | ||
| 284 | + print(" Done!") | ||
| 285 | + | ||
| 286 | + print("Getting best model and hyperparameters") | ||
| 287 | + print('Best score {}: {}\n'.format(myScorer, myClassifier.best_score_)) | ||
| 288 | + print('Best parameters:\n') | ||
| 289 | + best_parameters = myClassifier.best_estimator_.get_params() | ||
| 290 | + for param in sorted(best_parameters.keys()): | ||
| 291 | + print("\t%s: %r\n" % (param, best_parameters[param])) | ||
| 292 | + theBestClassifier = myClassifier.best_estimator_ | ||
| 293 | + print(str(theBestClassifier) + '\n') | ||
| 294 | + print("FALTA ESCRIBIR EL MEJOR SCORE (\"myClassifier.best_score_\") A UN ARCHIVO") | ||
| 295 | + | ||
| 296 | + print("Testing (prediction in new data)...") | ||
| 297 | + if args.reduction is not None: | ||
| 298 | + X_test = reduc.transform(X_test) | ||
| 299 | + y_pred = myClassifier.predict(X_test) | ||
| 300 | + best_parameters = myClassifier.best_estimator_.get_params() | ||
| 301 | + print(" Done!") | ||
| 302 | + | ||
| 303 | + print("Saving report...") | ||
| 304 | + with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile: | ||
| 305 | + oFile.write('********** EVALUATION REPORT **********\n') | ||
| 306 | + oFile.write('Reduction: {}\n'.format(args.reduction)) | ||
| 307 | + oFile.write('Classifier: {}\n'.format(args.classifier)) | ||
| 308 | + oFile.write('Kernel: {}\n'.format(args.kernel)) | ||
| 309 | + oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred))) | ||
| 310 | + oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted'))) | ||
| 311 | + oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted'))) | ||
| 312 | + oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted'))) | ||
| 313 | + oFile.write('Confusion matrix: \n') | ||
| 314 | + oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n') | ||
| 315 | + oFile.write('Classification report: \n') | ||
| 316 | + oFile.write(classification_report(y_test, y_pred) + '\n') | ||
| 317 | + oFile.write('Best parameters: \n') | ||
| 318 | + for param in sorted(best_parameters.keys()): | ||
| 319 | + oFile.write("\t%s: %r\n" % (param, best_parameters[param])) | ||
| 320 | + | ||
| 321 | + print(" Done!") | ||
| 322 | + | ||
| 323 | + print("Training and testing done in: %fs" % (time() - t0)) |
-
Please register or login to post a comment