Showing
1 changed file
with
175 additions
and
0 deletions
get_abstracts.py
0 → 100644
| 1 | +#from pdb import set_trace as st | ||
| 2 | +from sklearn.cross_validation import train_test_split as splitt | ||
| 3 | +from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer | ||
| 4 | +from sklearn.decomposition import TruncatedSVD | ||
| 5 | +from sklearn.naive_bayes import MultinomialNB | ||
| 6 | +from sklearn.linear_model import SGDClassifier | ||
| 7 | +from sklearn.neighbors import KNeighborsClassifier | ||
| 8 | +from sklearn.neighbors import NearestCentroid | ||
| 9 | +from sklearn.ensemble import RandomForestClassifier | ||
| 10 | +from sklearn.svm import LinearSVC | ||
| 11 | +from sklearn.svm import SVC | ||
| 12 | +from sklearn import metrics | ||
| 13 | +from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, | ||
| 14 | + AdaBoostClassifier, GradientBoostingClassifier) | ||
| 15 | +from sklearn.grid_search import GridSearchCV | ||
| 16 | +import pandas as pd | ||
| 17 | +from numpy import mean, std | ||
| 18 | + | ||
| 19 | +#Classifier = KNeighborsClassifier # 0.6464 | ||
| 20 | +#Classifier = NearestCentroid # 0.5054 | ||
| 21 | +#Classifier = RandomForestClassifier # 0.49 | ||
| 22 | +#Classifier = LinearSVC # 0.5402 | ||
| 23 | +#Classifier = SGDClassifier # 0.664 | ||
| 24 | + | ||
class EstimatorSelectionHelper(object):
    """Run one GridSearchCV per estimator and tabulate the per-fold scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator instance.
    params : dict
        Maps the same names to the parameter grid passed to GridSearchCV.

    Raises
    ------
    ValueError
        If a key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = sorted(set(models) - set(params), key=str)
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        # Snapshot the names: a live dict view would silently change if the
        # caller mutated `models` after construction.
        self.keys = list(models)
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every registered estimator.

        Each fitted search object is stored in ``self.grid_searches`` under
        the estimator's name. The keyword arguments are forwarded unchanged
        to GridSearchCV.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            gs = GridSearchCV(self.models[key], self.params[key], cv=cv,
                              n_jobs=n_jobs, verbose=verbose,
                              scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return a DataFrame with one row per (estimator, parameter setting).

        Each row holds the estimator name, the min/mean/max/std of the
        per-fold validation scores and the parameter values that were tried.
        Rows are sorted by ``sort_by`` in descending order.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        # NOTE(review): `grid_scores_` looks like the legacy scikit-learn
        # results attribute; confirm it exists in the installed version.
        def row(key, scores, params):
            stats = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': mean(scores),
                'std_score': std(scores),
            }
            # Parameters first, statistics last, so a statistic wins if a
            # parameter happens to share its name.
            return pd.Series(dict(list(params.items()) + list(stats.items())))

        rows = [row(k, gsc.cv_validation_scores, gsc.parameters)
                for k in self.keys if k in self.grid_searches
                for gsc in self.grid_searches[k].grid_scores_]
        if not rows:
            raise ValueError("No grid search results to summarise; call fit() first.")

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        # Fixed statistics columns first, then whatever parameters were tried.
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]
| 65 | + | ||
| 66 | + | ||
def _flatten_lines(chunk):
    """Join raw file lines with spaces and turn every newline into a space."""
    # NOTE(review): the previous code also called .replace(" ", ' '), which
    # swaps a space for a space and therefore changed nothing. It presumably
    # meant to collapse the double spaces left by the join; confirm before
    # changing the output format.
    return " ".join(chunk).replace("\n", ' ')


def get_abstracts(file_name, label):
    """Extract abstract records from a plain-text citation dump.

    The file is scanned line by line:

    * a line containing ``' doi: '`` is treated as a record header; the title
      is taken to start right after the first blank line found within the
      next ten lines;
    * a line containing ``'Copyright '`` marks that a copyright paragraph
      sits between the abstract and the record footer;
    * a line containing ``'DOI: '`` closes the record. The abstract is the
      paragraph between the last two blank lines seen so far, or one
      paragraph earlier when a copyright line was seen.

    Parameters
    ----------
    file_name : str
        Path of the text file to read.
    label : object
        Value stored under ``'topic'`` in every returned record.

    Returns
    -------
    list of dict
        One dict per record with keys ``'body'``, ``'title'``, ``'topic'``
        and, when a ``PMID:`` line follows the footer, ``'pmid'`` (int).
        The title is an empty string when no header was seen for the record.

    Raises
    ------
    ValueError
        If a record footer is reached before enough blank lines were seen
        to delimit the abstract.
    """
    # Read everything up front (the parser looks ahead) and close the handle.
    with open(file_name) as f:
        lines = f.readlines()
    n_lines = len(lines)

    def line_at(idx):
        # Look-ahead that tolerates running past the end of the file.
        return lines[idx] if idx < n_lines else ''

    docs = []
    extract = {}
    empties = []          # indices of blank lines seen in the current record
    saw_copyright = False
    title_idx = None      # index of the first title line of the current record

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)

        elif ' doi: ' in ln:
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break

        elif 'Copyright ' in ln:
            saw_copyright = True

        elif 'DOI: ' in ln:
            # The PMID line follows the footer, after an optional PMCID line.
            following = line_at(i + 1)
            pmid_line = line_at(i + 2) if 'PMCID: ' in following else following
            if 'PMID: ' in pmid_line:
                extract['pmid'] = int(pmid_line.strip().split()[1])

            # With a copyright paragraph the abstract is one paragraph
            # further back.
            needed = 3 if saw_copyright else 2
            if len(empties) < needed:
                raise ValueError(
                    "%s, line %d: expected at least %d blank lines before the "
                    "record footer, found %d" % (file_name, i + 1, needed, len(empties)))
            start, stop = empties[-needed], empties[-needed + 1]
            saw_copyright = False
            extract['body'] = _flatten_lines(lines[start:stop])

            title = []
            if title_idx is not None:
                for j in range(title_idx, min(title_idx + 5, n_lines)):
                    if not lines[j].strip():
                        break
                    title.append(lines[j])
            extract['title'] = _flatten_lines(title)
            extract['topic'] = label
            docs.append(extract)

            # Reset per-record state so nothing leaks into the next record.
            empties = []
            extract = {}
            title_idx = None

    return docs
| 115 | + | ||
| 116 | + | ||
# --- Corpus -----------------------------------------------------------------
# Two plain-text dumps, one per class; every record is tagged with its label.
labels = ['useless', 'useful']

filename = "../data/ecoli_abstracts/not_useful_abstracts.txt"
abstracs = get_abstracts(file_name=filename, label=labels[0])

filename = "../data/ecoli_abstracts/useful_abstracts.txt"
abstracs.extend(get_abstracts(file_name=filename, label=labels[1]))

# Raw abstract text and binary target (1 = useful, 0 = useless).
X = [doc['body'] for doc in abstracs]
y = [int(doc['topic'] == 'useful') for doc in abstracs]

# --- Candidate estimators and their parameter grids -------------------------
# Each estimator is registered under its own class name.
models1 = {
    estimator_cls.__name__: estimator_cls()
    for estimator_cls in (
        ExtraTreesClassifier,
        RandomForestClassifier,
        AdaBoostClassifier,
        GradientBoostingClassifier,
        SVC,
    )
}

params1 = {
    'ExtraTreesClassifier': dict(n_estimators=[16, 32]),
    'RandomForestClassifier': dict(n_estimators=[16, 32]),
    'AdaBoostClassifier': dict(n_estimators=[16, 32]),
    'GradientBoostingClassifier': dict(n_estimators=[16, 32],
                                       learning_rate=[0.8, 1.0]),
    'SVC': [
        # Disabled grid:
        # {'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]},
        dict(kernel=['rbf'], C=[1, 10, 100, 150, 200, 300, 400],
             gamma=[0.001, 0.0001]),
        dict(kernel=['poly'], C=[1, 10, 100, 150, 200, 300, 400],
             degree=[2, 3, 4, 5, 6]),
        dict(kernel=['sigmoid'], C=[1, 10, 100, 150, 200, 300, 400],
             gamma=[0.001, 0.0001]),
    ],
}

clf = EstimatorSelectionHelper(models1, params1)

# --- Features ---------------------------------------------------------------
# Disabled alternatives, kept for reference:
#   TfidfVectorizer(binary=True, ngram_range=(1, 3))
#   HashingVectorizer(non_negative=True)
#   TruncatedSVD(n_components=200, random_state=42, n_iter=20) applied to X
#   chi2 SelectKBest(k=200) fitted on a 70/30 train/test split
#       (splitt(X, y, test_size=0.3, random_state=42))
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)
X = vectorizer.fit_transform(X)

# --- Model selection --------------------------------------------------------
# Disabled single-classifier runs, kept for reference:
#   clf = MultinomialNB(alpha=.01)
#   clf = Classifier(n_jobs=-1, n_iter=100)
#   pred = clf.predict(X_test)
#   print(metrics.f1_score(y_test, pred, average='macro'))
clf.fit(X, y, scoring='f1', n_jobs=-1)

print(clf.score_summary(sort_by='min_score'))
-
Please register or login to post a comment