added pretrained models and ready to predict unseen abstracts
Showing 10 changed files with 406 additions and 34 deletions.
README.md

@@ -1,3 +1,45 @@
 # This paper talks about (and reports) experimental data
 
-Automatic discrimination of useless papers via machine learning of abstracts
\ No newline at end of file
+Automatic discrimination of useless papers via machine learning of abstracts.
+
+The main method follows this pipeline:
+
+### Training mode
+- Parse abstracts from two input files (classA and classB; see the file format in the `data/` directory)
+- Transform abstracts into their TFIDF sparse representations
+- Train Support Vector Machines with different parameters by using GridSearch
+- Select the best estimator and save it at `model/svm_model.pkl` (default)
+- Save the TFIDF transformation to keep the training vocabulary (stored at `model/tfidf_model.pkl`)
+
+### Prediction mode
+- Parse abstracts from a single input file
+- Transform abstracts into their TFIDF sparse representations
+- Predict useless/useful papers from their abstracts using the pretrained Support Vector Machine
+
+# Usage
+
+To filter unknown abstracts, run
+
+```bash
+$ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt
+```
+The predictions are stored by default at `filter_output/`, unless a different directory is specified via the `--out` option. The default files containing the predictions are
+
+- filter_output/useful.out
+- filter_output/useless.out
+
+The format of each file is:
+
+```
+<PMID> \t <text of the abstract>
+...
+<PMID> \t <text of the abstract>
+```
+
+To train a new model, set the list of parameters at `model_params.conf` and then run
+
+```bash
+$ python filter_abstracts.py --classA data/ecoli_abstracts/not_useful_abstracts.txt --classB data/ecoli_abstracts/useful_abstracts.txt
+```
+
+where `--classA` and `--classB` are used to specify the input training files. In this example `data/ecoli_abstracts/useful_abstracts.txt` is the training file containing abstracts of papers reporting experimental data (the desired or useful class for us).
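
As an aside, the `<PMID> \t <abstract>` output format above is easy to consume downstream. Here is a minimal sketch (a hypothetical helper, not part of this repository) that reads one prediction file into (PMID, abstract) pairs:

```python
# Minimal sketch: parse a filter output file in the "<PMID> \t <abstract>" format.
def read_predictions(path):
    """Return a list of (pmid, abstract) tuples from a prediction file."""
    records = []
    with open(path) as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            pmid, _, body = line.partition("\t")
            records.append((int(pmid), body))
    return records

useful = read_predictions("filter_output/useful.out")
print(len(useful), "abstracts predicted as useful")
```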
@@ -28,6 +28,7 @@ class EstimatorSelectionHelper:
         self.params = params
         self.keys = models.keys()
         self.grid_searches = {}
+        self.best_estimator = {}
 
     def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
         for key in self.keys:
@@ -40,24 +41,25 @@ class EstimatorSelectionHelper:
             self.grid_searches[key] = gs
 
     def score_summary(self, sort_by='mean_score'):
-        def row(key, scores, params):
+        def row(key, scores, params, model):
             d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': mean(scores),
                 'std_score': std(scores),
+                'model': model
            }
            return pd.Series(dict(list(params.items()) + list(d.items())))

-        rows = [row(k, gsc.cv_validation_scores, gsc.parameters)
+        rows = [row(k, gsc.cv_validation_scores, gsc.parameters, m)
                 for k in self.keys
-                for gsc in self.grid_searches[k].grid_scores_]
+                for gsc, m in zip(self.grid_searches[k].grid_scores_, self.grid_searches[k].best_estimator_)]
         df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
 
         columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
-        columns = columns + [c for c in df.columns if c not in columns]
-
+        columns = columns + [c for c in df.columns if (c not in columns and c != 'model')]
+        self.best_estimator_ = df['model'][0]
         return df[columns]
 
 
@@ -169,9 +171,9 @@ X = vectorizer.fit_transform(X)
 #st()
 clf.fit(X, y, scoring='f1', n_jobs=-1)
 
-joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-joblib.dump(clf.best_estimator_, 'model/tifidf_model.pkl')
-
 #pred = clf.predict(X_test)
 #print(metrics.f1_score(y_test, pred, average='macro'))
 print(clf.score_summary(sort_by='min_score'))
+
+joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+joblib.dump(vectorizer, 'model/tifidf_model.pkl')
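
One caveat on the hunk above: in scikit-learn, `best_estimator_` on a grid search is a single refit estimator (and it is only set when `refit=True`), not a per-candidate sequence, so zipping it against `grid_scores_` is fragile. A minimal sketch of an alternative selection step, assuming `refit=True` and using the modern `sklearn.model_selection` import; `models` and `params` are the same dicts the helper class takes (the function name is illustrative, not the repository's code):

```python
# Sketch: pick the single best estimator across several grid searches,
# relying on refit=True so each GridSearchCV exposes best_estimator_.
from sklearn.model_selection import GridSearchCV

def fit_and_select(models, params, X, y, cv=3, scoring='f1'):
    best_score, best_est = float('-inf'), None
    for key, model in models.items():
        gs = GridSearchCV(model, params[key], cv=cv, scoring=scoring, refit=True)
        gs.fit(X, y)
        print("%s: best %s = %.4f" % (key, scoring, gs.best_score_))
        if gs.best_score_ > best_score:
            best_score, best_est = gs.best_score_, gs.best_estimator_
    return best_est, best_score
```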
deprecated/classify_abstracts.py
0 → 100644
+#from pdb import set_trace as st
+from sklearn.cross_validation import train_test_split as splitt
+from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.linear_model import SGDClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.neighbors import NearestCentroid
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import LinearSVC
+from sklearn.svm import SVC
+from sklearn import metrics
+from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
+                              AdaBoostClassifier, GradientBoostingClassifier)
+from sklearn.grid_search import GridSearchCV
+from sklearn.externals import joblib
+import pandas as pd
+from numpy import mean, std
+
+
+class EstimatorSelectionHelper:
+    "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/"
+    def __init__(self, models, params):
+        if not set(models.keys()).issubset(set(params.keys())):
+            missing_params = list(set(models.keys()) - set(params.keys()))
+            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
+        self.models = models
+        self.params = params
+        self.keys = models.keys()
+        self.grid_searches = {}
+        self.best_estimator = {}
+
+    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
+        for key in self.keys:
+            print("Running GridSearchCV for %s." % key)
+            model = self.models[key]
+            params = self.params[key]
+            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
+                              verbose=verbose, scoring=scoring, refit=refit)
+            gs.fit(X,y)
+            self.grid_searches[key] = gs
+
+    def score_summary(self, sort_by='mean_score'):
+        def row(key, scores, params, model):
+            d = {
+                'estimator': key,
+                'min_score': min(scores),
+                'max_score': max(scores),
+                'mean_score': mean(scores),
+                'std_score': std(scores),
+                'model': model
+            }
+            return pd.Series(dict(list(params.items()) + list(d.items())))
+
+        rows = [row(k, gsc.cv_validation_scores, gsc.parameters, m)
+                for k in self.keys
+                for gsc, m in zip(self.grid_searches[k].grid_scores_, self.grid_searches[k].best_estimator_)]
+        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
+
+        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
+        columns = columns + [c for c in df.columns if (c not in columns and c != 'model')]
+        self.best_estimator_ = df['model'][0]
+        return df[columns]
+
+
+def get_abstracts(file_name, label):
+    f = open(file_name)
+    extract = {}
+    docs = []
+    empties = []
+    lines = f.readlines()
+    cpright = False
+
+    for i, ln in enumerate(lines):
+        if not ln.strip():
+            empties.append(i)
+            continue
+        elif ' doi: ' in ln:
+            for j in range(i, i + 10):
+                if not lines[j].strip():
+                    title_idx = j + 1
+                    break
+            continue
+
+        elif 'cpright ' in ln:
+            cpright = True
+
+        elif 'DOI: ' in ln:
+            if 'PMCID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 2].strip().split()[1])
+            elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 1].strip().split()[1])
+
+            if cpright:
+                get = slice(empties[-3], empties[-2])
+                cpright = False
+            else:
+                get = slice(empties[-2], empties[-1])
+
+            extract['body'] = " ".join(lines[get]).replace("\n", ' ').replace("  ", ' ')
+            title = []
+            for j in range(title_idx, title_idx + 5):
+                if lines[j].strip():
+                    title.append(lines[j])
+                else:
+                    break
+            extract['title'] = " ".join(title).replace("\n", ' ').replace("  ", ' ')
+            extract['topic'] = label
+            docs.append(extract)
+            empties = []
+            extract = {}
+
+    return docs
+
+
+filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
+labels = ['useless', 'useful']
+
+abstracs = get_abstracts(file_name=filename, label=labels[0])
+
+filename = "data/ecoli_abstracts/useful_abstracts.txt"
+
+abstracs += get_abstracts(file_name=filename, label=labels[1])
+
+X = [x['body'] for x in abstracs]
+y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]
+
+models1 = {
+    'ExtraTreesClassifier': ExtraTreesClassifier(),
+    'RandomForestClassifier': RandomForestClassifier(),
+    'AdaBoostClassifier': AdaBoostClassifier(),
+    'GradientBoostingClassifier': GradientBoostingClassifier(),
+    'SVC': SVC()
+}
+
+params1 = {
+    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
+    'RandomForestClassifier': {'n_estimators': [16, 32]},
+    'AdaBoostClassifier': {'n_estimators': [16, 32]},
+    'GradientBoostingClassifier': {'n_estimators': [16, 32],
+                                   'learning_rate': [0.8, 1.0]},
+    'SVC': [
+        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
+         'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
+        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
+         'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
+         'coef0': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
+    ]
+}
+
+clf = EstimatorSelectionHelper(models1, params1)
+
+vectorizer = TfidfVectorizer(binary=True)
+                             #ngram_range=(1, 3)
+                             #)
+#vectorizer = HashingVectorizer(non_negative=True)
+print(vectorizer)
+#svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
+X = vectorizer.fit_transform(X)
+#X = svd.fit_transform(X)
+
+#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
+
+#from sklearn.feature_selection import chi2, SelectKBest
+#ch2 = SelectKBest(chi2, k=200)
+#X_train = ch2.fit_transform(X_train, y_train)
+#X_test = ch2.transform(X_test)
+
+#clf = MultinomialNB(alpha=.01)
+#clf = Classifier(n_jobs=-1, n_iter=100)
+#st()
+clf.fit(X, y, scoring='f1', n_jobs=-1)
+
+#pred = clf.predict(X_test)
+#print(metrics.f1_score(y_test, pred, average='macro'))
+print(clf.score_summary(sort_by='min_score'))
+
+joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+joblib.dump(vectorizer, 'model/tifidf_model.pkl')
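
Note that this deprecated script dumps the vectorizer to `model/tifidf_model.pkl` (transposed letters), while the prediction branch of `filter_abstracts.py` loads `model/tfidf_model.pkl`, so one of the two names has to be adjusted before the pair can round-trip. Assuming the corrected `tfidf_model.pkl` name, reusing the saved pair looks roughly like this sketch (illustrative, not code from the repository):

```python
# Sketch: load the persisted TFIDF vectorizer + classifier and score new text.
from sklearn.externals import joblib

vectorizer = joblib.load('model/tfidf_model.pkl')  # carries the training vocabulary
clf = joblib.load('model/svm_model.pkl')

new_abstracts = ["We report expression measurements for E. coli under heat stress."]
X_new = vectorizer.transform(new_abstracts)  # transform only; never refit here
print(clf.predict(X_new))                    # array([1]) would mean 'useful'
```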
deprecated/report.txt
0 → 100644
+TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
+        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
+        lowercase=True, max_df=1.0, max_features=None, min_df=1,
+        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
+        stop_words=None, strip_accents=None, sublinear_tf=False,
+        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
+        vocabulary=None)
+Running GridSearchCV for GradientBoostingClassifier.
+Fitting 3 folds for each of 4 candidates, totalling 12 fits
+Running GridSearchCV for AdaBoostClassifier.
+Fitting 3 folds for each of 2 candidates, totalling 6 fits
+Running GridSearchCV for ExtraTreesClassifier.
+Fitting 3 folds for each of 2 candidates, totalling 6 fits
+Running GridSearchCV for SVC.
+Fitting 3 folds for each of 63 candidates, totalling 189 fits
+Running GridSearchCV for RandomForestClassifier.
+Fitting 3 folds for each of 2 candidates, totalling 6 fits
+                     estimator min_score mean_score max_score  std_score \
+36                         SVC   0.69697   0.702911  0.705882 0.00420147
+66                         SVC   0.69697   0.702911  0.705882 0.00420147
+35                         SVC   0.69697   0.702911  0.705882 0.00420147
+37                         SVC   0.69697   0.702911  0.705882 0.00420147
+38                         SVC   0.69697   0.702911  0.705882 0.00420147
+39                         SVC   0.69697   0.702911  0.705882 0.00420147
+40                         SVC   0.69697   0.702911  0.705882 0.00420147
+41                         SVC   0.69697   0.702911  0.705882 0.00420147
+42                         SVC   0.69697   0.702911  0.705882 0.00420147
+43                         SVC   0.69697   0.702911  0.705882 0.00420147
+44                         SVC   0.69697   0.702911  0.705882 0.00420147
+45                         SVC   0.69697   0.702911  0.705882 0.00420147
+46                         SVC   0.69697   0.702911  0.705882 0.00420147
+47                         SVC   0.69697   0.702911  0.705882 0.00420147
+48                         SVC   0.69697   0.702911  0.705882 0.00420147
+49                         SVC   0.69697   0.702911  0.705882 0.00420147
+50                         SVC   0.69697   0.702911  0.705882 0.00420147
+51                         SVC   0.69697   0.702911  0.705882 0.00420147
+52                         SVC   0.69697   0.702911  0.705882 0.00420147
+53                         SVC   0.69697   0.702911  0.705882 0.00420147
+54                         SVC   0.69697   0.702911  0.705882 0.00420147
+55                         SVC   0.69697   0.702911  0.705882 0.00420147
+56                         SVC   0.69697   0.702911  0.705882 0.00420147
+57                         SVC   0.69697   0.702911  0.705882 0.00420147
+58                         SVC   0.69697   0.702911  0.705882 0.00420147
+59                         SVC   0.69697   0.702911  0.705882 0.00420147
+60                         SVC   0.69697   0.702911  0.705882 0.00420147
+61                         SVC   0.69697   0.702911  0.705882 0.00420147
+62                         SVC   0.69697   0.702911  0.705882 0.00420147
+63                         SVC   0.69697   0.702911  0.705882 0.00420147
+..                         ...       ...        ...       ...        ...
+12                         SVC   0.69697   0.702911  0.705882 0.00420147
+13                         SVC   0.69697   0.702911  0.705882 0.00420147
+14                         SVC   0.69697   0.702911  0.705882 0.00420147
+15                         SVC   0.69697   0.702911  0.705882 0.00420147
+16                         SVC   0.69697   0.702911  0.705882 0.00420147
+17                         SVC   0.69697   0.702911  0.705882 0.00420147
+26                         SVC   0.69697   0.702911  0.705882 0.00420147
+25                         SVC   0.69697   0.702911  0.705882 0.00420147
+30                         SVC   0.69697   0.702911  0.705882 0.00420147
+29                         SVC   0.69697   0.702911  0.705882 0.00420147
+28                         SVC   0.69697   0.702911  0.705882 0.00420147
+27                         SVC   0.69697   0.702911  0.705882 0.00420147
+19                         SVC   0.69697   0.702911  0.705882 0.00420147
+65                         SVC   0.69697   0.702911  0.705882 0.00420147
+24                         SVC   0.69697   0.702911  0.705882 0.00420147
+23                         SVC   0.69697   0.702911  0.705882 0.00420147
+22                         SVC   0.69697   0.702911  0.705882 0.00420147
+21                         SVC   0.69697   0.702911  0.705882 0.00420147
+18                         SVC  0.686567   0.693502   0.69697  0.0049038
+20                         SVC  0.676923   0.691047  0.707692  0.0126874
+7         ExtraTreesClassifier  0.619048   0.662524  0.688525  0.0309388
+6         ExtraTreesClassifier  0.588235   0.611627  0.655738  0.0312098
+1   GradientBoostingClassifier  0.577778   0.595982  0.610169  0.0135256
+0   GradientBoostingClassifier       0.5   0.549894  0.596491  0.0394613
+71      RandomForestClassifier  0.470588   0.557789     0.625  0.0646035
+3   GradientBoostingClassifier  0.454545   0.548927  0.596491  0.0667386
+2   GradientBoostingClassifier  0.439024   0.588593  0.701754   0.110305
+5           AdaBoostClassifier  0.411765   0.489657  0.618182  0.0915596
+4           AdaBoostClassifier       0.4    0.54013  0.655172   0.105673
+72      RandomForestClassifier  0.380952   0.504177  0.631579    0.10236
+
+      C degree   gamma   kernel learning_rate n_estimators
+36  100      6     NaN     poly           NaN          NaN
+66  200    NaN  0.0001  sigmoid           NaN          NaN
+35  100      5     NaN     poly           NaN          NaN
+37  150      2     NaN     poly           NaN          NaN
+38  150      3     NaN     poly           NaN          NaN
+39  150      4     NaN     poly           NaN          NaN
+40  150      5     NaN     poly           NaN          NaN
+41  150      6     NaN     poly           NaN          NaN
+42  200      2     NaN     poly           NaN          NaN
+43  200      3     NaN     poly           NaN          NaN
+44  200      4     NaN     poly           NaN          NaN
+45  200      5     NaN     poly           NaN          NaN
+46  200      6     NaN     poly           NaN          NaN
+47  300      2     NaN     poly           NaN          NaN
+48  300      3     NaN     poly           NaN          NaN
+49  300      4     NaN     poly           NaN          NaN
+50  300      5     NaN     poly           NaN          NaN
+51  300      6     NaN     poly           NaN          NaN
+52  400      2     NaN     poly           NaN          NaN
+53  400      3     NaN     poly           NaN          NaN
+54  400      4     NaN     poly           NaN          NaN
+55  400      5     NaN     poly           NaN          NaN
+56  400      6     NaN     poly           NaN          NaN
+57    1    NaN   0.001  sigmoid           NaN          NaN
+58    1    NaN  0.0001  sigmoid           NaN          NaN
+59   10    NaN   0.001  sigmoid           NaN          NaN
+60   10    NaN  0.0001  sigmoid           NaN          NaN
+61  100    NaN   0.001  sigmoid           NaN          NaN
+62  100    NaN  0.0001  sigmoid           NaN          NaN
+63  150    NaN   0.001  sigmoid           NaN          NaN
+..  ...    ...     ...      ...           ...          ...
+12  100    NaN   0.001      rbf           NaN          NaN
+13  100    NaN  0.0001      rbf           NaN          NaN
+14  150    NaN   0.001      rbf           NaN          NaN
+15  150    NaN  0.0001      rbf           NaN          NaN
+16  200    NaN   0.001      rbf           NaN          NaN
+17  200    NaN  0.0001      rbf           NaN          NaN
+26    1      6     NaN     poly           NaN          NaN
+25    1      5     NaN     poly           NaN          NaN
+30   10      5     NaN     poly           NaN          NaN
+29   10      4     NaN     poly           NaN          NaN
+28   10      3     NaN     poly           NaN          NaN
+27   10      2     NaN     poly           NaN          NaN
+19  300    NaN  0.0001      rbf           NaN          NaN
+65  200    NaN   0.001  sigmoid           NaN          NaN
+24    1      4     NaN     poly           NaN          NaN
+23    1      3     NaN     poly           NaN          NaN
+22    1      2     NaN     poly           NaN          NaN
+21  400    NaN  0.0001      rbf           NaN          NaN
+18  300    NaN   0.001      rbf           NaN          NaN
+20  400    NaN   0.001      rbf           NaN          NaN
+7   NaN    NaN     NaN      NaN           NaN           32
+6   NaN    NaN     NaN      NaN           NaN           16
+1   NaN    NaN     NaN      NaN           0.8           32
+0   NaN    NaN     NaN      NaN           0.8           16
+71  NaN    NaN     NaN      NaN           NaN           16
+3   NaN    NaN     NaN      NaN             1           32
+2   NaN    NaN     NaN      NaN             1           16
+5   NaN    NaN     NaN      NaN           NaN           32
+4   NaN    NaN     NaN      NaN           NaN           16
+72  NaN    NaN     NaN      NaN           NaN           32
+
+[73 rows x 11 columns]
filter_abstracts.py

@@ -1,4 +1,4 @@
-from pdb import set_trace as st
+#from pdb import set_trace as st
 from sklearn.cross_validation import train_test_split as splitt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import RandomizedSearchCV
@@ -8,10 +8,12 @@ from sklearn.svm import SVC
 import numpy as np
 import argparse
 import csv
+import os
 from sklearn.externals import joblib
 from time import time
 from scipy.stats import randint as sp_randint
 from scipy.stats import expon
+from sklearn.preprocessing import label_binarize
 
 
 def get_abstracts(file_name, label):
@@ -33,7 +35,7 @@ def get_abstracts(file_name, label):
                     break
             continue
 
-        elif 'Copyright ' in ln:
+        elif 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
             copyright = True
 
         elif 'DOI: ' in ln:
@@ -82,7 +84,7 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
 
 args = parser.parse_args()
 
-labels = {'useless': 0, 'useful': 1}
+labels = {0: 'useless', 1: 'useful'}
 vectorizer = TfidfVectorizer(binary=True)
 print(vectorizer)
 
@@ -103,11 +105,14 @@ if args.classA and args.classA and not args.input:
             except ValueError:
                 model_params[n].append(d[k])
 
-    abstracs = get_abstracts(file_name=args.classA, label=labels['useless'])
-    abstracs += get_abstracts(file_name=args.classB, label=labels['useful'])
+    model_params = {k: list(set(model_params[k])) for k in model_params}
+    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
+    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
 
-    X = vectorizer.fit_transform([x['body'] for x in abstracs])
-    y = [x['topic'] for x in abstracs]
+    tfidf_model = vectorizer.fit([x['body'] for x in abstracs])
+    X = vectorizer.transform([x['body'] for x in abstracs])
+    #y = [x['topic'] for x in abstracs]
+    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]
 
     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
 
@@ -124,27 +129,29 @@ if args.classA and args.classA and not args.input:
           " parameter settings." % ((time() - start), n_iter_search))
 
     print(clf.best_estimator_)
-    print(clf)
+    print()
     print(clf.best_score_)
     #print(metrics.f1_score(clf.predict(X_test), y_test))
 
     #joblib.dump(clf, 'model/svm_model.pkl')
     joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-    joblib.dump(vectorizer, 'model/tifidf_model.pkl')
+    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
+
 else:
 
     clf = joblib.load(args.svcmodel)
     vectorizer = joblib.load('model/tfidf_model.pkl')
-    #filename=args.input #"data/ecoli_abstracts/not_useful_abstracts.txt"
     abstracs = get_abstracts(file_name=args.input, label='unknown')
-    X = vectorizer.fit_transform([x['body'] for x in abstracs])
+    X = vectorizer.transform([x['body'] for x in abstracs])
    classes = clf.predict(X)

-    with open(args.output + "/" + labels[0] + ".out", 'w') as f0, \
-         open(args.output + "/" + labels[1] + ".out", 'w') as f1:
+    if not os.path.exists(args.out):
+        os.makedirs(args.out)
+    # Writing predictions to output files
+    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
+         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
         for c, a in zip(classes, abstracs):
             if c == 0:
                 f0.write("%d\t%s\n" % (a['pmid'], a['body']))
             elif c == 1:
                 f1.write("%d\t%s\n" % (a['pmid'], a['body']))
-#clf.fit(X, y, scoring='f1', n_jobs=-1)
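
The `fit_transform` → `transform` change in the prediction branch is the important fix here: at prediction time the vectorizer must reuse the vocabulary learned during training, otherwise the feature columns no longer match what the SVM was trained on. A self-contained sketch of the pattern (toy documents, not the repository's data):

```python
# Sketch: transform() reuses the fitted vocabulary; fit_transform() would rebuild it.
from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["membrane protein expression data", "review of previous models"]
new_docs = ["we measured expression of a membrane protein"]

vectorizer = TfidfVectorizer(binary=True)
X_train = vectorizer.fit_transform(train_docs)  # learns the vocabulary
X_new = vectorizer.transform(new_docs)          # same columns as X_train

assert X_train.shape[1] == X_new.shape[1]       # feature spaces line up
# Calling fit_transform(new_docs) instead would derive a fresh vocabulary,
# producing a matrix the pretrained SVM cannot meaningfully score.
```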
filter_output/useful.out
0 → 100644

filter_output/useless.out
0 → 100644

model/tfidf_model.pkl
0 → 100644
model_params.conf

@@ -1,13 +1,11 @@
 kernel,degree,coef0,C,gamma
-poly,3,0.2,300,0
-poly,11,0.9,150,0
-rbf,0,0.5,100,0.0001
 linear,1,0.5,100,0.0
-linear,1,1.5,100,0.0
-linear,1,2.5,100,0.0
-linear,1,3.5,100,0.0
-linear,1,4.5,100,0.0
-linear,1,1.5,150,0.0
-linear,1,2.5,200,0.0
-linear,1,3.5,300,0.0
-linear,1,4.5,400,0.0
\ No newline at end of file
+linear,1,0.5,10,0.0
+linear,1,0.5,50,0.0
+linear,1,0.5,100,0.0
+linear,1,0.5,5,0.0
+linear,1,0.5,150,0.0
+linear,1,0.5,200,0.0
+linear,1,0.5,300,0.0
+linear,1,0.5,400,0.0
+poly,3,0.0,100,0.0
\ No newline at end of file
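
For reference, a CSV like this maps naturally onto the `model_params` dict built in the training branch of `filter_abstracts.py`. A minimal parsing sketch (illustrative; the script's own loop differs in detail):

```python
# Sketch: read model_params.conf (header: kernel,degree,coef0,C,gamma) into
# {column: [unique values]} for a scikit-learn parameter search.
import csv

model_params = {}
with open('model_params.conf') as f:
    for d in csv.DictReader(f):
        for name, value in d.items():
            try:
                value = float(value)   # numeric columns: degree, coef0, C, gamma
            except ValueError:
                pass                   # string column: kernel
            model_params.setdefault(name, []).append(value)

# Deduplicate, mirroring the list(set(...)) step added in the diff above.
model_params = {k: list(set(v)) for k, v in model_params.items()}
print(model_params)
```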