Carlos-Francisco Méndez-Cruz

Training, cross-validation and testing dataset

# -*- encoding: utf-8 -*-

import os
from time import time
import argparse
import scipy.stats
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report, make_scorer
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

__author__ = 'CMendezC'

# Goal: training, cross-validation, and testing of transcription factor structural domain sentences

# Parameters:
# 1) --inputPath Path to read input files.
# 2) --inputTrainingData File to read training data.
# 3) --inputTrainingClasses File to read training classes.
# 4) --inputTestingData File to read testing data.
# 5) --inputTestingClasses File to read testing classes.
# 6) --outputModelPath Path to place output model.
# 7) --outputModelFile File to place output model.
# 8) --outputReportPath Path to place evaluation report.
# 9) --outputReportFile File to place evaluation report.
# 10) --classifier Classifier: BernoulliNB, MultinomialNB, SVM.
# 11) --saveData Save matrices.
# 12) --kernel SVM kernel: linear, rbf, poly.
# 13) --reduction Feature selection or dimensionality reduction: SVD200, SVD300, CHI250, CHI2100.
# 14) --removeStopWords Remove stop words.
# 15) --ngrinitial Initial n-gram size.
# 16) --ngrfinal Final n-gram size.
# 17) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.


# Output:
# 1) Classification model and evaluation report.
# Execution:

# source activate python3
# python training-crossvalidation-testing-dom.py
# --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset
# --inputTrainingData trainData.txt
# --inputTrainingClasses trainClasses.txt
# --inputTestingData testData.txt
# --inputTestingClasses testClasses.txt
# --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/models
# --outputModelFile SVM-lineal-model.mod
# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/reports
# --outputReportFile SVM-linear.txt
# --classifier SVM
# --saveData
# --kernel linear
# --reduction SVD200
# --removeStopWords
# --vectorizer b
# --ngrinitial 2
# --ngrfinal 2

# Equivalent one-line command:
# python training-crossvalidation-testing-dom.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset --inputTrainingData trainData.txt --inputTrainingClasses trainClasses.txt --inputTestingData testData.txt --inputTestingClasses testClasses.txt --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/models --outputModelFile SVM-lineal-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/structural-domain-dataset/reports --outputReportFile SVM-linear.txt --classifier SVM --kernel linear --saveData --vectorizer b --ngrinitial 2 --ngrfinal 2 --removeStopWords --reduction SVD200
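
# Input format assumed by the reading loops below: trainData.txt and
# testData.txt contain one sentence per line; trainClasses.txt and
# testClasses.txt contain the corresponding class label (DOM or OTHER),
# one per line, aligned with the data files by line number.
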
###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = argparse.ArgumentParser(description='Training, cross-validation, and testing on the structural domain dataset.')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to read input files", metavar="PATH")
    parser.add_argument("--inputTrainingData", dest="inputTrainingData",
                        help="File to read training data", metavar="FILE")
    parser.add_argument("--inputTrainingClasses", dest="inputTrainingClasses",
                        help="File to read training classes", metavar="FILE")
    parser.add_argument("--inputTestingData", dest="inputTestingData",
                        help="File to read testing data", metavar="FILE")
    parser.add_argument("--inputTestingClasses", dest="inputTestingClasses",
                        help="File to read testing classes", metavar="FILE")
    parser.add_argument("--outputModelPath", dest="outputModelPath",
                        help="Path to place output model", metavar="PATH")
    parser.add_argument("--outputModelFile", dest="outputModelFile",
                        help="File to place output model", metavar="FILE")
    parser.add_argument("--outputReportPath", dest="outputReportPath",
                        help="Path to place evaluation report", metavar="PATH")
    parser.add_argument("--outputReportFile", dest="outputReportFile",
                        help="File to place evaluation report", metavar="FILE")
    parser.add_argument("--classifier", dest="classifier",
                        help="Classifier", metavar="NAME",
                        choices=('BernoulliNB', 'MultinomialNB', 'SVM'), default='SVM')
    parser.add_argument("--saveData", dest="saveData", action='store_true',
                        help="Save matrices")
    parser.add_argument("--kernel", dest="kernel",
                        help="SVM kernel", metavar="NAME",
                        choices=('linear', 'rbf', 'poly'), default='linear')
    parser.add_argument("--reduction", dest="reduction",
                        help="Feature selection or dimensionality reduction", metavar="NAME",
                        choices=('SVD200', 'SVD300', 'CHI250', 'CHI2100'), default=None)
    parser.add_argument("--removeStopWords", default=False,
                        action="store_true", dest="removeStopWords",
                        help="Remove stop words")
    parser.add_argument("--ngrinitial", type=int,
                        dest="ngrinitial", default=1,
                        help="Initial n-gram", metavar="INTEGER")
    parser.add_argument("--ngrfinal", type=int,
                        dest="ngrfinal", default=1,
                        help="Final n-gram", metavar="INTEGER")
    parser.add_argument("--vectorizer", dest="vectorizer",
                        help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
                        choices=('b', 'f', 't'), default='b')

    args = parser.parse_args()

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("File to read training data: " + str(args.inputTrainingData))
    print("File to read training classes: " + str(args.inputTrainingClasses))
    print("File to read testing data: " + str(args.inputTestingData))
    print("File to read testing classes: " + str(args.inputTestingClasses))
    print("Path to place output model: " + str(args.outputModelPath))
    print("File to place output model: " + str(args.outputModelFile))
    print("Path to place evaluation report: " + str(args.outputReportPath))
    print("File to place evaluation report: " + str(args.outputReportFile))
    print("Classifier: " + str(args.classifier))
    print("Save matrices: " + str(args.saveData))
    print("Kernel: " + str(args.kernel))
    print("Reduction: " + str(args.reduction))
    print("Remove stop words: " + str(args.removeStopWords))
    print("Initial ngram: " + str(args.ngrinitial))
    print("Final ngram: " + str(args.ngrfinal))
    print("Vectorizer: " + str(args.vectorizer))

    # Start time
    t0 = time()

    if args.removeStopWords:
        pf = stopwords.words('english')
    else:
        pf = None
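    # Note: stopwords.words('english') requires the NLTK stop word corpus to be
    # installed (nltk.download('stopwords')); pf is passed to the vectorizer below.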

    y_train = []
    trainingData = []
    y_test = []
    testingData = []
    X_train = None
    X_test = None

    if args.saveData:
        print("Reading training data and true classes...")
        with open(os.path.join(args.inputPath, args.inputTrainingClasses), encoding='utf8', mode='r') \
                as iFile:
            for line in iFile:
                line = line.strip('\r\n')
                y_train.append(line)
        with open(os.path.join(args.inputPath, args.inputTrainingData), encoding='utf8', mode='r') \
                as iFile:
            for line in iFile:
                line = line.strip('\r\n')
                trainingData.append(line)
        print(" Done!")

        print("Reading testing data and true classes...")
        with open(os.path.join(args.inputPath, args.inputTestingClasses), encoding='utf8', mode='r') \
                as iFile:
            for line in iFile:
                line = line.strip('\r\n')
                y_test.append(line)
        with open(os.path.join(args.inputPath, args.inputTestingData), encoding='utf8', mode='r') \
                as iFile:
            for line in iFile:
                line = line.strip('\r\n')
                testingData.append(line)
        print(" Done!")

        # Create vectorizer
        print('Vectorization: {}'.format(args.vectorizer))
        if args.vectorizer == "b":
            # Binary vectorizer
            vectorizer = CountVectorizer(ngram_range=(args.ngrinitial, args.ngrfinal), binary=True, stop_words=pf)
        elif args.vectorizer == "f":
            # Frequency vectorizer
            vectorizer = CountVectorizer(ngram_range=(args.ngrinitial, args.ngrfinal), stop_words=pf)
        else:
            # TF-IDF vectorizer
            vectorizer = TfidfVectorizer(ngram_range=(args.ngrinitial, args.ngrfinal), stop_words=pf)
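        # Illustration (comment only, not executed): with ngram_range=(1, 1) and
        # the toy corpus ["domain binds dna", "domain domain binds"], the sorted
        # vocabulary is ['binds', 'dna', 'domain'], so the second text becomes
        # [1, 0, 1] under the binary vectorizer (presence/absence), [1, 0, 2]
        # under the frequency vectorizer (raw counts), and the tf-idf vectorizer
        # rescales those counts by inverse document frequency.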

        X_train = csr_matrix(vectorizer.fit_transform(trainingData), dtype='double')
        X_test = csr_matrix(vectorizer.transform(testingData), dtype='double')
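        # The vectorizer is fit on the training texts only and merely applied to
        # the test texts, so the vocabulary is learned without seeing test data.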

        print(" Saving matrix and classes...")
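        # The matrices and label lists are cached as .jlb files so that later
        # runs can skip reading and vectorizing by omitting --saveData.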
        joblib.dump(X_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
        joblib.dump(y_train, os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
        joblib.dump(X_test, os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
        joblib.dump(y_test, os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
        print(" Done!")
    else:
        print(" Loading matrix and classes...")
        X_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.jlb'))
        y_train = joblib.load(os.path.join(args.outputModelPath, args.inputTrainingData + '.class.jlb'))
        X_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingData + '.jlb'))
        y_test = joblib.load(os.path.join(args.outputModelPath, args.inputTestingClasses + '.class.jlb'))
        print(" Done!")

    print(" Number of training classes: {}".format(len(y_train)))
    print(" Number of training class DOM: {}".format(y_train.count('DOM')))
    print(" Number of training class OTHER: {}".format(y_train.count('OTHER')))
    print(" Shape of training matrix: {}".format(X_train.shape))

    print(" Number of testing classes: {}".format(len(y_test)))
    print(" Number of testing class DOM: {}".format(y_test.count('DOM')))
    print(" Number of testing class OTHER: {}".format(y_test.count('OTHER')))
    print(" Shape of testing matrix: {}".format(X_test.shape))

    # Feature selection and dimensionality reduction
    if args.reduction is not None:
        print('Performing dimensionality reduction or feature selection...', args.reduction)
        if args.reduction == 'SVD200':
            reduc = TruncatedSVD(n_components=200, random_state=42)
            X_train = reduc.fit_transform(X_train)
        elif args.reduction == 'SVD300':
            reduc = TruncatedSVD(n_components=300, random_state=42)
            X_train = reduc.fit_transform(X_train)
        elif args.reduction == 'CHI250':
            reduc = SelectKBest(chi2, k=50)
            X_train = reduc.fit_transform(X_train, y_train)
        elif args.reduction == 'CHI2100':
            reduc = SelectKBest(chi2, k=100)
            X_train = reduc.fit_transform(X_train, y_train)
        print(" Done!")
        print(' New shape of training matrix: ', X_train.shape)
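        # TruncatedSVD projects the sparse matrix onto 200/300 dense components;
        # SelectKBest(chi2) keeps the 50/100 features most dependent on the class
        # labels. The fitted reduc is reused further below to transform X_test.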

    jobs = -1
    paramGrid = []
    nIter = 20
    crossV = 10
    # New performance scorer
    myScorer = make_scorer(f1_score, average='weighted')
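    # average='weighted' averages the per-class F1 scores weighted by the number
    # of true instances of each class, so the DOM/OTHER imbalance is reflected.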
    print("Defining randomized grid search...")
    if args.classifier == 'SVM':
        # SVM
        classifier = SVC()
        if args.kernel == 'rbf':
            paramGrid = {'C': scipy.stats.expon(scale=100),
                         'gamma': scipy.stats.expon(scale=.1),
                         'kernel': ['rbf'], 'class_weight': ['balanced', None]}
        elif args.kernel == 'linear':
            paramGrid = {'C': scipy.stats.expon(scale=100),
                         'kernel': ['linear'],
                         'class_weight': ['balanced', None]}
        elif args.kernel == 'poly':
            paramGrid = {'C': scipy.stats.expon(scale=100),
                         'gamma': scipy.stats.expon(scale=.1), 'degree': [2, 3],
                         'kernel': ['poly'], 'class_weight': ['balanced', None]}
        myClassifier = model_selection.RandomizedSearchCV(classifier,
                                                          paramGrid, n_iter=nIter,
                                                          cv=crossV, n_jobs=jobs, verbose=3, scoring=myScorer)
    elif args.classifier == 'BernoulliNB':
        # BernoulliNB
        classifier = BernoulliNB()
        paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
        myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter,
                                                          cv=crossV, n_jobs=jobs, verbose=3, scoring=myScorer)
    elif args.classifier == 'MultinomialNB':
        # MultinomialNB
        classifier = MultinomialNB()
        paramGrid = {'alpha': scipy.stats.expon(scale=1.0)}
        myClassifier = model_selection.RandomizedSearchCV(classifier, paramGrid, n_iter=nIter,
                                                          cv=crossV, n_jobs=jobs, verbose=3, scoring=myScorer)
    else:
        print("Bad classifier")
        exit()
    print(" Done!")
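    # RandomizedSearchCV samples nIter=20 hyperparameter settings from the
    # distributions above (scipy.stats.expon draws for C, gamma, and alpha),
    # scores each with 10-fold cross-validation, and refits the best setting
    # on the whole training set when fit() is called.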

    print("Training...")
    myClassifier.fit(X_train, y_train)
    print(" Done!")

    print("Getting best model and hyperparameters")
    print('Best score {}: {}\n'.format(myScorer, myClassifier.best_score_))
    print('Best parameters:\n')
    best_parameters = myClassifier.best_estimator_.get_params()
    for param in sorted(best_parameters.keys()):
        print("\t%s: %r\n" % (param, best_parameters[param]))
    theBestClassifier = myClassifier.best_estimator_
    print(str(theBestClassifier) + '\n')
    # TODO: write the best score ("myClassifier.best_score_") to a file
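    # A minimal sketch for that TODO (hypothetical file name bestScore.txt),
    # left commented out so the script's behavior is unchanged:
    # with open(os.path.join(args.outputReportPath, 'bestScore.txt'), mode='w', encoding='utf8') as sFile:
    #     sFile.write('Best score: {}\n'.format(myClassifier.best_score_))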

    print("Testing (prediction on new data)...")
    if args.reduction is not None:
        X_test = reduc.transform(X_test)
    y_pred = myClassifier.predict(X_test)
    best_parameters = myClassifier.best_estimator_.get_params()
    print(" Done!")

    print("Saving report...")
    with open(os.path.join(args.outputReportPath, args.outputReportFile), mode='w', encoding='utf8') as oFile:
        oFile.write('********** EVALUATION REPORT **********\n')
        oFile.write('Reduction: {}\n'.format(args.reduction))
        oFile.write('Classifier: {}\n'.format(args.classifier))
        oFile.write('Kernel: {}\n'.format(args.kernel))
        oFile.write('Accuracy: {}\n'.format(accuracy_score(y_test, y_pred)))
        oFile.write('Precision: {}\n'.format(precision_score(y_test, y_pred, average='weighted')))
        oFile.write('Recall: {}\n'.format(recall_score(y_test, y_pred, average='weighted')))
        oFile.write('F-score: {}\n'.format(f1_score(y_test, y_pred, average='weighted')))
        oFile.write('Confusion matrix: \n')
        oFile.write(str(confusion_matrix(y_test, y_pred)) + '\n')
        oFile.write('Classification report: \n')
        oFile.write(classification_report(y_test, y_pred) + '\n')
        oFile.write('Best parameters: \n')
        for param in sorted(best_parameters.keys()):
            oFile.write("\t%s: %r\n" % (param, best_parameters[param]))

    print(" Done!")

    print("Training and testing done in: %fs" % (time() - t0))