iarroyof

Added pretrained models; now ready to predict unseen abstracts

# This paper talks about (and reports) experimental data
Automatic discrimination of useless papers via machine learning of abstracts.

The main method follows this pipeline:
### Training mode
- Parse abstracts from two input files (classA and classB; see the file format in the `data/` directory)
- Transform abstracts into their TFIDF sparse representations
- Train Support Vector Machines with different hyperparameters via grid search
- Select the best estimator and save it to `model/svm_model.pkl` (default)
- Save the TFIDF transformation to preserve the training vocabulary (stored at `model/tfidf_model.pkl`); see the sketch after this list
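
A minimal sketch of this training pipeline, using standard scikit-learn calls rather than the repository's exact code (the full script appears further below); the texts and labels here are stand-ins:

```python
# Minimal sketch of the training pipeline; texts/labels are stand-ins.
import os
import joblib  # the repo's older scikit-learn uses sklearn.externals.joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

texts = ["we report rna-seq measurements of k-12 strains",
         "a survey of modelling approaches in systems biology"] * 3
y = [1, 0] * 3  # 1 = useful (reports data), 0 = useless

vectorizer = TfidfVectorizer(binary=True)
X = vectorizer.fit_transform(texts)            # TFIDF sparse representations

grid = {'C': [1, 10, 100], 'kernel': ['rbf', 'poly']}
gs = GridSearchCV(SVC(), grid, scoring='f1', cv=2)
gs.fit(X, y)                                   # grid search over SVM params

os.makedirs('model', exist_ok=True)
joblib.dump(gs.best_estimator_, 'model/svm_model.pkl')
joblib.dump(vectorizer, 'model/tfidf_model.pkl')  # keeps training vocabulary
```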
### Prediction mode
- Parse abstracts from a single input file
- Transform abstracts into their TFIDF sparse representations
- Predict useless/useful papers from their abstracts using the pretrained Support Vector Machine (sketched below)
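
And a minimal sketch of the prediction path, assuming the pretrained models saved during training and abstracts already parsed from the input file:

```python
# Minimal sketch of prediction; assumes the models dumped during training.
import joblib

clf = joblib.load('model/svm_model.pkl')           # pretrained SVM
vectorizer = joblib.load('model/tfidf_model.pkl')  # fitted TFIDF vocabulary

abstract_texts = ["we measured expression of acid-stress genes in e. coli",
                  "this review summarises recent regulatory network models"]
X = vectorizer.transform(abstract_texts)  # transform only; never refit here
print(clf.predict(X))                     # array of 0 (useless) / 1 (useful)
```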
# Usage
For filtering unknown abstracts, run
```bash
$ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt
```
The predictions are stored by default in `filter_output/`, unless a different directory is specified via the `--out` option. The default files containing the predictions are
- filter_output/useful.out
- filter_output/useless.out
The format of each file is:
```
<PMID> \t <text of the abstract>
...
<PMID> \t <text of the abstract>
```
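As an illustration, a prediction file can be read back like this (hypothetical snippet, not part of the repository):

```python
# Hypothetical reader for the tab-separated prediction files.
with open('filter_output/useful.out') as f:
    for line in f:
        pmid, abstract = line.rstrip('\n').split('\t', 1)
        print(pmid, abstract[:60])
```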
To train a new model, set the parameter lists in `model_params.conf` and then run
```bash
$ python filter_abstracts.py --classA data/ecoli_abstracts/not_useful_abstracts.txt --classB data/ecoli_abstracts/useful_abstracts.txt
```
where `--classA` and `--classB` specify the input training files. In this example, `data/ecoli_abstracts/useful_abstracts.txt` is the training file containing abstracts of papers that report experimental data (the desired, or useful, class).
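
`model_params.conf` is a small CSV: the header row names SVC hyperparameters and each following row supplies candidate values (the copy shipped with this commit appears at the end of this page). For example:

```
kernel,degree,coef0,C,gamma
poly,3,0.2,300,0
rbf,0,0.5,100,0.0001
linear,1,0.5,100,0.0
```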
```diff
@@ -28,6 +28,7 @@ class EstimatorSelectionHelper:
         self.params = params
         self.keys = models.keys()
         self.grid_searches = {}
+        self.best_estimator = {}

     def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
         for key in self.keys:
@@ -40,24 +41,25 @@ class EstimatorSelectionHelper:
             self.grid_searches[key] = gs

     def score_summary(self, sort_by='mean_score'):
-        def row(key, scores, params):
+        def row(key, scores, params, model):
             d = {
                  'estimator': key,
                  'min_score': min(scores),
                  'max_score': max(scores),
                  'mean_score': mean(scores),
                  'std_score': std(scores),
+                 'model': model
             }
             return pd.Series(dict(list(params.items()) + list(d.items())))

-        rows = [row(k, gsc.cv_validation_scores, gsc.parameters)
+        rows = [row(k, gsc.cv_validation_scores, gsc.parameters, m)
                 for k in self.keys
-                for gsc in self.grid_searches[k].grid_scores_]
+                for gsc, m in zip(self.grid_searches[k].grid_scores_, self.grid_searches[k].best_estimator_)]
         df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

         columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
-        columns = columns + [c for c in df.columns if c not in columns]
+        columns = columns + [c for c in df.columns if (c not in columns and c != 'model')]
+        self.best_estimator_ = df['model'][0]

         return df[columns]
@@ -169,9 +171,9 @@ X = vectorizer.fit_transform(X)
 #st()
 clf.fit(X, y, scoring='f1', n_jobs=-1)
-joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-joblib.dump(clf.best_estimator_, 'model/tifidf_model.pkl')
 #pred = clf.predict(X_test)
 #print(metrics.f1_score(y_test, pred, average='macro'))
 print(clf.score_summary(sort_by='min_score'))
+joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+joblib.dump(vectorizer, 'model/tifidf_model.pkl')
```
The complete training script:

```python
#from pdb import set_trace as st
from sklearn.cross_validation import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.base import clone
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
# NOTE: sklearn.cross_validation, sklearn.grid_search and the grid_scores_
# attribute used below belong to scikit-learn < 0.20.
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib
import pandas as pd
from numpy import mean, std


class EstimatorSelectionHelper:
    """Run GridSearchCV over several model families and summarize the scores.
    Adapted from:
    http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/
    """

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s"
                             % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
        self.best_estimator_ = None

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params, model):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': mean(scores),
                'std_score': std(scores),
                'model': model,
            }
            return pd.Series(dict(list(params.items()) + list(d.items())))

        # One row per grid point; each row carries an unfitted clone of the
        # estimator configured with that point's parameters, so the winning
        # configuration can be recovered after sorting.
        rows = [row(k, gsc.cv_validation_scores, gsc.parameters,
                    clone(self.models[k]).set_params(**gsc.parameters))
                for k in self.keys
                for gsc in self.grid_searches[k].grid_scores_]
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns
                             if (c not in columns and c != 'model')]
        # Positional indexing: sorting shuffles the integer index.
        self.best_estimator_ = df['model'].iloc[0]

        return df[columns]


def get_abstracts(file_name, label):
    """Parse a PubMed-style text dump into records with pmid, title and body."""
    with open(file_name) as f:
        lines = f.readlines()
    extract = {}
    docs = []
    empties = []
    cpright = False
    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue
        elif ' doi: ' in ln:
            # The title block begins after the first blank line below the citation.
            for j in range(i, i + 10):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue
        elif 'cpright ' in ln:
            cpright = True
        elif 'DOI: ' in ln:
            if 'PMCID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
            elif 'PMCID: ' not in lines[i + 1] and 'PMID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 1].strip().split()[1])
            # The body sits between blank lines; skip one extra block when a
            # copyright notice precedes the DOI line.
            if cpright:
                get = slice(empties[-3], empties[-2])
                cpright = False
            else:
                get = slice(empties[-2], empties[-1])
            extract['body'] = " ".join(lines[get]).replace("\n", ' ').replace("  ", ' ')
            title = []
            for j in range(title_idx, title_idx + 5):
                if lines[j].strip():
                    title.append(lines[j])
                else:
                    break
            extract['title'] = " ".join(title).replace("\n", ' ').replace("  ", ' ')
            extract['topic'] = label
            docs.append(extract)
            empties = []
            extract = {}
    return docs


filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
labels = ['useless', 'useful']

abstracs = get_abstracts(file_name=filename, label=labels[0])
filename = "data/ecoli_abstracts/useful_abstracts.txt"
abstracs += get_abstracts(file_name=filename, label=labels[1])

X = [x['body'] for x in abstracs]
y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {'n_estimators': [16, 32],
                                   'learning_rate': [0.8, 1.0]},
    'SVC': [
        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
         'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
         'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
         'coef0': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
    ]
}

clf = EstimatorSelectionHelper(models1, params1)

vectorizer = TfidfVectorizer(binary=True)
#vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3))
#vectorizer = HashingVectorizer(non_negative=True)
print(vectorizer)

#svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
X = vectorizer.fit_transform(X)
#X = svd.fit_transform(X)
#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
#from sklearn.feature_selection import chi2, SelectKBest
#ch2 = SelectKBest(chi2, k=200)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)
#clf = MultinomialNB(alpha=.01)
#clf = Classifier(n_jobs=-1, n_iter=100)
#st()
clf.fit(X, y, scoring='f1', n_jobs=-1)

#pred = clf.predict(X_test)
#print(metrics.f1_score(y_test, pred, average='macro'))
print(clf.score_summary(sort_by='min_score'))

# The selected configuration is unfitted (refit=False above), so train it on
# the full data before serializing it alongside the fitted vectorizer.
best = clf.best_estimator_
best.fit(X, y)
joblib.dump(best, 'model/svm_model.pkl')
joblib.dump(vectorizer, 'model/tfidf_model.pkl')
```

Example output from a training run:

```
TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
stop_words=None, strip_accents=None, sublinear_tf=False,
token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
vocabulary=None)
Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for SVC.
Fitting 3 folds for each of 63 candidates, totalling 189 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
estimator min_score mean_score max_score std_score \
36 SVC 0.69697 0.702911 0.705882 0.00420147
66 SVC 0.69697 0.702911 0.705882 0.00420147
35 SVC 0.69697 0.702911 0.705882 0.00420147
37 SVC 0.69697 0.702911 0.705882 0.00420147
38 SVC 0.69697 0.702911 0.705882 0.00420147
39 SVC 0.69697 0.702911 0.705882 0.00420147
40 SVC 0.69697 0.702911 0.705882 0.00420147
41 SVC 0.69697 0.702911 0.705882 0.00420147
42 SVC 0.69697 0.702911 0.705882 0.00420147
43 SVC 0.69697 0.702911 0.705882 0.00420147
44 SVC 0.69697 0.702911 0.705882 0.00420147
45 SVC 0.69697 0.702911 0.705882 0.00420147
46 SVC 0.69697 0.702911 0.705882 0.00420147
47 SVC 0.69697 0.702911 0.705882 0.00420147
48 SVC 0.69697 0.702911 0.705882 0.00420147
49 SVC 0.69697 0.702911 0.705882 0.00420147
50 SVC 0.69697 0.702911 0.705882 0.00420147
51 SVC 0.69697 0.702911 0.705882 0.00420147
52 SVC 0.69697 0.702911 0.705882 0.00420147
53 SVC 0.69697 0.702911 0.705882 0.00420147
54 SVC 0.69697 0.702911 0.705882 0.00420147
55 SVC 0.69697 0.702911 0.705882 0.00420147
56 SVC 0.69697 0.702911 0.705882 0.00420147
57 SVC 0.69697 0.702911 0.705882 0.00420147
58 SVC 0.69697 0.702911 0.705882 0.00420147
59 SVC 0.69697 0.702911 0.705882 0.00420147
60 SVC 0.69697 0.702911 0.705882 0.00420147
61 SVC 0.69697 0.702911 0.705882 0.00420147
62 SVC 0.69697 0.702911 0.705882 0.00420147
63 SVC 0.69697 0.702911 0.705882 0.00420147
.. ... ... ... ... ...
12 SVC 0.69697 0.702911 0.705882 0.00420147
13 SVC 0.69697 0.702911 0.705882 0.00420147
14 SVC 0.69697 0.702911 0.705882 0.00420147
15 SVC 0.69697 0.702911 0.705882 0.00420147
16 SVC 0.69697 0.702911 0.705882 0.00420147
17 SVC 0.69697 0.702911 0.705882 0.00420147
26 SVC 0.69697 0.702911 0.705882 0.00420147
25 SVC 0.69697 0.702911 0.705882 0.00420147
30 SVC 0.69697 0.702911 0.705882 0.00420147
29 SVC 0.69697 0.702911 0.705882 0.00420147
28 SVC 0.69697 0.702911 0.705882 0.00420147
27 SVC 0.69697 0.702911 0.705882 0.00420147
19 SVC 0.69697 0.702911 0.705882 0.00420147
65 SVC 0.69697 0.702911 0.705882 0.00420147
24 SVC 0.69697 0.702911 0.705882 0.00420147
23 SVC 0.69697 0.702911 0.705882 0.00420147
22 SVC 0.69697 0.702911 0.705882 0.00420147
21 SVC 0.69697 0.702911 0.705882 0.00420147
18 SVC 0.686567 0.693502 0.69697 0.0049038
20 SVC 0.676923 0.691047 0.707692 0.0126874
7 ExtraTreesClassifier 0.619048 0.662524 0.688525 0.0309388
6 ExtraTreesClassifier 0.588235 0.611627 0.655738 0.0312098
1 GradientBoostingClassifier 0.577778 0.595982 0.610169 0.0135256
0 GradientBoostingClassifier 0.5 0.549894 0.596491 0.0394613
71 RandomForestClassifier 0.470588 0.557789 0.625 0.0646035
3 GradientBoostingClassifier 0.454545 0.548927 0.596491 0.0667386
2 GradientBoostingClassifier 0.439024 0.588593 0.701754 0.110305
5 AdaBoostClassifier 0.411765 0.489657 0.618182 0.0915596
4 AdaBoostClassifier 0.4 0.54013 0.655172 0.105673
72 RandomForestClassifier 0.380952 0.504177 0.631579 0.10236
C degree gamma kernel learning_rate n_estimators
36 100 6 NaN poly NaN NaN
66 200 NaN 0.0001 sigmoid NaN NaN
35 100 5 NaN poly NaN NaN
37 150 2 NaN poly NaN NaN
38 150 3 NaN poly NaN NaN
39 150 4 NaN poly NaN NaN
40 150 5 NaN poly NaN NaN
41 150 6 NaN poly NaN NaN
42 200 2 NaN poly NaN NaN
43 200 3 NaN poly NaN NaN
44 200 4 NaN poly NaN NaN
45 200 5 NaN poly NaN NaN
46 200 6 NaN poly NaN NaN
47 300 2 NaN poly NaN NaN
48 300 3 NaN poly NaN NaN
49 300 4 NaN poly NaN NaN
50 300 5 NaN poly NaN NaN
51 300 6 NaN poly NaN NaN
52 400 2 NaN poly NaN NaN
53 400 3 NaN poly NaN NaN
54 400 4 NaN poly NaN NaN
55 400 5 NaN poly NaN NaN
56 400 6 NaN poly NaN NaN
57 1 NaN 0.001 sigmoid NaN NaN
58 1 NaN 0.0001 sigmoid NaN NaN
59 10 NaN 0.001 sigmoid NaN NaN
60 10 NaN 0.0001 sigmoid NaN NaN
61 100 NaN 0.001 sigmoid NaN NaN
62 100 NaN 0.0001 sigmoid NaN NaN
63 150 NaN 0.001 sigmoid NaN NaN
.. ... ... ... ... ... ...
12 100 NaN 0.001 rbf NaN NaN
13 100 NaN 0.0001 rbf NaN NaN
14 150 NaN 0.001 rbf NaN NaN
15 150 NaN 0.0001 rbf NaN NaN
16 200 NaN 0.001 rbf NaN NaN
17 200 NaN 0.0001 rbf NaN NaN
26 1 6 NaN poly NaN NaN
25 1 5 NaN poly NaN NaN
30 10 5 NaN poly NaN NaN
29 10 4 NaN poly NaN NaN
28 10 3 NaN poly NaN NaN
27 10 2 NaN poly NaN NaN
19 300 NaN 0.0001 rbf NaN NaN
65 200 NaN 0.001 sigmoid NaN NaN
24 1 4 NaN poly NaN NaN
23 1 3 NaN poly NaN NaN
22 1 2 NaN poly NaN NaN
21 400 NaN 0.0001 rbf NaN NaN
18 300 NaN 0.001 rbf NaN NaN
20 400 NaN 0.001 rbf NaN NaN
7 NaN NaN NaN NaN NaN 32
6 NaN NaN NaN NaN NaN 16
1 NaN NaN NaN NaN 0.8 32
0 NaN NaN NaN NaN 0.8 16
71 NaN NaN NaN NaN NaN 16
3 NaN NaN NaN NaN 1 32
2 NaN NaN NaN NaN 1 16
5 NaN NaN NaN NaN NaN 32
4 NaN NaN NaN NaN NaN 16
72 NaN NaN NaN NaN NaN 32
[73 rows x 11 columns]
```
Changes to `filter_abstracts.py`:

```diff
-from pdb import set_trace as st
+#from pdb import set_trace as st
 from sklearn.cross_validation import train_test_split as splitt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import RandomizedSearchCV
@@ -8,10 +8,12 @@ from sklearn.svm import SVC
 import numpy as np
 import argparse
 import csv
+import os
 from sklearn.externals import joblib
 from time import time
 from scipy.stats import randint as sp_randint
 from scipy.stats import expon
+from sklearn.preprocessing import label_binarize

 def get_abstracts(file_name, label):
@@ -33,7 +35,7 @@ def get_abstracts(file_name, label):
                     break
             continue
-        elif 'Copyright ' in ln:
+        elif 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
             copyright = True
         elif 'DOI: ' in ln:
@@ -82,7 +84,7 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
 args = parser.parse_args()

-labels = {'useless': 0, 'useful': 1}
+labels = {0: 'useless', 1: 'useful'}
 vectorizer = TfidfVectorizer(binary=True)
 print(vectorizer)
@@ -103,11 +105,14 @@ if args.classA and args.classA and not args.input:
         except ValueError:
             model_params[n].append(d[k])

-    abstracs = get_abstracts(file_name=args.classA, label=labels['useless'])
-    abstracs += get_abstracts(file_name=args.classB, label=labels['useful'])
     model_params = {k: list(set(model_params[k])) for k in model_params}
+    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
+    abstracs += get_abstracts(file_name=args.classB, label=labels[1])

-    X = vectorizer.fit_transform([x['body'] for x in abstracs])
-    y = [x['topic'] for x in abstracs]
+    tfidf_model = vectorizer.fit([x['body'] for x in abstracs])
+    X = vectorizer.transform([x['body'] for x in abstracs])
+    #y = [x['topic'] for x in abstracs]
+    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]

     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
@@ -124,27 +129,29 @@ if args.classA and args.classA and not args.input:
           " parameter settings." % ((time() - start), n_iter_search))

-    print(clf.best_estimator_)
+    print(clf)
     print()
     print(clf.best_score_)
     #print(metrics.f1_score(clf.predict(X_test), y_test))
     #joblib.dump(clf, 'model/svm_model.pkl')
     joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-    joblib.dump(vectorizer, 'model/tifidf_model.pkl')
+    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
 else:
     clf = joblib.load(args.svcmodel)
     vectorizer = joblib.load('model/tfidf_model.pkl')
     #filename=args.input #"data/ecoli_abstracts/not_useful_abstracts.txt"
     abstracs = get_abstracts(file_name=args.input, label='unknown')
-    X = vectorizer.fit_transform([x['body'] for x in abstracs])
+    X = vectorizer.transform([x['body'] for x in abstracs])
     classes = clf.predict(X)

-    with open(args.output + "/" + labels[0] + ".out", 'w') as f0, \
-         open(args.output + "/" + labels[1] + ".out", 'w') as f1:
+    if not os.path.exists(args.out):
+        os.makedirs(args.out)
+    # Writing predictions to output files
+    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
+         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
         for c, a in zip(classes, abstracs):
             if c == 0:
                 f0.write("%d\t%s\n" % (a['pmid'], a['body']))
             elif c == 1:
                 f1.write("%d\t%s\n" % (a['pmid'], a['body']))
 #clf.fit(X, y, scoring='f1', n_jobs=-1)
```
Updated SVC parameter grid in `model_params.conf`:

```diff
 kernel,degree,coef0,C,gamma
 poly,3,0.2,300,0
 poly,11,0.9,150,0
 rbf,0,0.5,100,0.0001
-linear,1,0.5,100,0.0
-linear,1,1.5,100,0.0
-linear,1,2.5,100,0.0
-linear,1,3.5,100,0.0
-linear,1,4.5,100,0.0
-linear,1,1.5,150,0.0
-linear,1,2.5,200,0.0
-linear,1,3.5,300,0.0
-linear,1,4.5,400,0.0
+linear,1,0.5,10,0.0
+linear,1,0.5,50,0.0
+linear,1,0.5,100,0.0
+linear,1,0.5,5,0.0
+linear,1,0.5,150,0.0
+linear,1,0.5,200,0.0
+linear,1,0.5,300,0.0
+linear,1,0.5,400,0.0
+poly,3,0.0,100,0.0
```
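
For orientation, a minimal sketch of turning this CSV into a parameter grid, echoing the `csv` parsing in `filter_abstracts.py` but simplified (hypothetical helper, not the repository's exact code):

```python
# Hypothetical sketch: read model_params.conf into {column: candidate values}.
import csv

model_params = {}
with open('model_params.conf') as f:
    for d in csv.DictReader(f):
        for name, value in d.items():
            try:
                value = float(value)  # numeric hyperparameters (C, gamma, ...)
            except ValueError:
                pass                  # kernel names stay as strings
            model_params.setdefault(name, []).append(value)

# Deduplicate candidates, mirroring the script's set() pass.
model_params = {k: list(set(v)) for k, v in model_params.items()}
print(model_params)
```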