Ignacio Arroyo

grid search classification

#from pdb import set_trace as st
from sklearn.model_selection import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.model_selection import GridSearchCV
import pandas as pd
from numpy import mean, std

#Classifier = KNeighborsClassifier     # 0.6464
#Classifier = NearestCentroid          # 0.5054
#Classifier = RandomForestClassifier   # 0.49
#Classifier = LinearSVC                # 0.5402
#Classifier = SGDClassifier            # 0.664

class EstimatorSelectionHelper:
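    """Run GridSearchCV over several estimators, each with its own
    parameter grid, and summarize the cross-validation scores of every
    (estimator, parameter setting) pair in a single pandas DataFrame."""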
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
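        """Run one GridSearchCV per estimator and keep the fitted search
        objects. refit=False skips the final refit on the full data,
        since only the cross-validation scores are used downstream."""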
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
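        """Build a DataFrame with one row per (estimator, parameter
        setting), reporting min/mean/max/std of the per-fold test
        scores, sorted by `sort_by`."""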
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': mean(scores),
                'std_score': std(scores),
            }
            return pd.Series(dict(list(params.items()) + list(d.items())))

        # grid_scores_ was removed from sklearn; read the per-fold test
        # scores out of cv_results_ instead
        rows = []
        for k in self.keys:
            gs = self.grid_searches[k]
            results = gs.cv_results_
            for i, params in enumerate(results['params']):
                scores = [results['split%d_test_score' % s][i]
                          for s in range(gs.n_splits_)]
                rows.append(row(k, scores, params))
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]


def get_abstracts(file_name, label):
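    """Parse a plain-text dump of PubMed-style abstracts: records are
    separated by blank lines, a ' doi: ' citation line precedes the
    title, the abstract body sits between blank lines before the
    'DOI:'/'PMID:' trailer, and an optional copyright notice may follow
    the body. Returns a list of dicts with 'title', 'body', 'topic'
    (the given label) and, when present, 'pmid'."""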
    extract = {}
    docs = []
    empties = []
    copyright = False

    with open(file_name) as f:
        lines = f.readlines()

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue
        elif ' doi: ' in ln:
            # the title starts after the first blank line following the citation
            for j in range(i, i + 10):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue

        elif 'Copyright ' in ln:
            copyright = True

        elif 'DOI: ' in ln:
            if 'PMCID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
            elif 'PMCID: ' not in lines[i + 1] and 'PMID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 1].strip().split()[1])

            # the body is the block between the last pair of blank lines before
            # the trailer; skip one block back when a copyright notice intervenes
            if copyright:
                get = slice(empties[-3], empties[-2])
                copyright = False
            else:
                get = slice(empties[-2], empties[-1])

            extract['body'] = " ".join(lines[get]).replace("\n", ' ').replace("  ", ' ')
            title = []
            for j in range(title_idx, title_idx + 5):
                if lines[j].strip():
                    title.append(lines[j])
                else:
                    break
            extract['title'] = " ".join(title).replace("\n", ' ').replace("  ", ' ')
            extract['topic'] = label
            docs.append(extract)
            empties = []
            extract = {}

    return docs


filename = "../data/ecoli_abstracts/not_useful_abstracts.txt"
labels = ['useless', 'useful']

abstracts = get_abstracts(file_name=filename, label=labels[0])

filename = "../data/ecoli_abstracts/useful_abstracts.txt"

abstracts += get_abstracts(file_name=filename, label=labels[1])

X = [x['body'] for x in abstracts]
y = [1 if x['topic'] == 'useful' else 0 for x in abstracts]

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0]},
    'SVC': [
        #{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]},
        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400], 'degree': [2, 3, 4, 5, 6]},
        {'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
    ]
}

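# Note: GridSearchCV accepts a list of dicts as a parameter grid; each dict is
# expanded into its own exhaustive grid, so kernel-specific parameters here
# (gamma vs. degree) are only combined with their matching kernel.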
clf = EstimatorSelectionHelper(models1, params1)

vectorizer = TfidfVectorizer(binary=True)
#vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3))
#vectorizer = HashingVectorizer(alternate_sign=False)
print(vectorizer)
#svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
X = vectorizer.fit_transform(X)
#X = svd.fit_transform(X)
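# X is now a sparse document-term TF-IDF matrix; the optional TruncatedSVD
# step above would reduce it to 200 dense latent components (i.e. LSA).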

#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)

#from sklearn.feature_selection import chi2, SelectKBest
#ch2 = SelectKBest(chi2, k=200)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)

#clf = MultinomialNB(alpha=.01)
#clf = Classifier(n_jobs=-1, n_iter=100)
#st()
clf.fit(X, y, scoring='f1', n_jobs=-1)

#pred = clf.predict(X_test)
#print(metrics.f1_score(y_test, pred, average='macro'))
print(clf.score_summary(sort_by='min_score'))
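# Sorting by min_score ranks configurations by their worst cross-validation
# fold, favoring settings that are robust across folds rather than merely
# best on average.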