Ignacio Arroyo

Grid search model selection for classifying E. coli abstracts as useful or not useful

#from pdb import set_trace as st
from sklearn.model_selection import train_test_split as splitt
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
import pandas as pd
from numpy import mean, std
#Classifier = KNeighborsClassifier # 0.6464
#Classifier = NearestCentroid # 0.5054
#Classifier = RandomForestClassifier # 0.49
#Classifier = LinearSVC # 0.5402
#Classifier = SGDClassifier # 0.664
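
# Helper that runs a separate GridSearchCV for each candidate estimator and keeps
# the fitted searches so their cross-validated scores can be compared afterwards.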
class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs
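
    # Summarise the per-fold CV scores of every parameter setting into a single
    # DataFrame, one row per (estimator, parameter combination).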
    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': mean(scores),
                'std_score': std(scores),
            }
            return pd.Series(dict(list(params.items()) + list(d.items())))
        rows = []
        for k in self.keys:
            gs = self.grid_searches[k]
            # One row per parameter setting, built from the per-fold test scores
            # stored in cv_results_.
            for i, params in enumerate(gs.cv_results_['params']):
                scores = [gs.cv_results_['split%d_test_score' % s][i]
                          for s in range(gs.n_splits_)]
                rows.append(row(k, scores, params))
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        return df[columns]
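
# Parse a plain-text PubMed export. Blank lines delimit blocks, a ' doi: ' line
# marks where the title block starts, and a 'DOI: ' line closes a record, at which
# point the PMID, title, abstract body and topic label are stored in one dict.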
def get_abstracts(file_name, label):
    f = open(file_name)
    extract = {}
    docs = []
    empties = []
    lines = f.readlines()
    copyright = False
    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue
        elif ' doi: ' in ln:
            # The title begins on the first line after the next blank line.
            for j in range(i, i + 10):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue
        elif 'Copyright ' in ln:
            copyright = True
        elif 'DOI: ' in ln:
            if 'PMCID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
            elif 'PMCID: ' not in lines[i + 1] and 'PMID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 1].strip().split()[1])
            # The abstract body is the last blank-delimited block before this line
            # (one block earlier when a copyright notice follows the abstract).
            if copyright:
                get = slice(empties[-3], empties[-2])
                copyright = False
            else:
                get = slice(empties[-2], empties[-1])
            extract['body'] = " ".join(lines[get]).replace("\n", ' ').replace("  ", ' ')
            title = []
            for j in range(title_idx, title_idx + 5):
                if lines[j].strip():
                    title.append(lines[j])
                else:
                    break
            extract['title'] = " ".join(title).replace("\n", ' ').replace("  ", ' ')
            extract['topic'] = label
            docs.append(extract)
            empties = []
            extract = {}
    f.close()
    return docs
filename = "../data/ecoli_abstracts/not_useful_abstracts.txt"
labels = ['useless', 'useful']
abstracts = get_abstracts(file_name=filename, label=labels[0])
filename = "../data/ecoli_abstracts/useful_abstracts.txt"
abstracts += get_abstracts(file_name=filename, label=labels[1])

X = [x['body'] for x in abstracts]
y = [1 if x['topic'] == 'useful' else 0 for x in abstracts]
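
# Candidate estimators and the hyperparameter grid searched for each of them.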
models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0]},
    'SVC': [
        #{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]},
        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400], 'degree': [2, 3, 4, 5, 6]},
        {'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
    ]
}
clf = EstimatorSelectionHelper(models1, params1)
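
# Turn each abstract body into a TF-IDF vector (binary=True uses term presence
# instead of raw counts before applying the idf weighting).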
vectorizer = TfidfVectorizer(binary=True)
#vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3))
#vectorizer = HashingVectorizer(non_negative=True)
print(vectorizer)
#svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
X = vectorizer.fit_transform(X)
#X = svd.fit_transform(X)
#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
#from sklearn.feature_selection import chi2, SelectKBest
#ch2 = SelectKBest(chi2, k=200)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)
#clf = MultinomialNB(alpha=.01)
#clf = Classifier(n_jobs=-1, n_iter=100)
#st()
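# Run every grid search (3-fold CV by default) with F1 scoring on all available cores.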
clf.fit(X, y, scoring='f1', n_jobs=-1)
#pred = clf.predict(X_test)
#print(metrics.f1_score(y_test, pred, average='macro'))
print(clf.score_summary(sort_by='min_score'))
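
A minimal sketch of the held-out evaluation hinted at by the commented-out lines above, assuming the same X, y and imports; the SVC settings below are placeholders taken from the grid, to be replaced by the best configuration reported in the summary.

X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
best = SVC(kernel='rbf', C=100, gamma=0.001)  # placeholder values from the grid above, not the selected best
best.fit(X_train, y_train)                    # refit the chosen configuration on the training split
pred = best.predict(X_test)
print(metrics.f1_score(y_test, pred, average='macro'))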