filter_papers.py
#from pdb import set_trace as st
from sklearn.model_selection import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import argparse
import csv
import os
import joblib
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import expon
from sklearn.preprocessing import label_binarize
from sklearn.datasets import load_files
parser = argparse.ArgumentParser(
    description="This script separates biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input directory containing the papers to "
                    "be predicted.")
parser.add_argument("--traind", help="Input directory containing the papers of "
                    "the two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to a custom pretrained SVC model "
                    "(default='./model/svm_model.paper.pkl')",
                    default="model/svm_model.paper.pkl")
args = parser.parse_args()
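
# Example invocations (directory names are placeholders):
#   python filter_papers.py --traind training_papers/
#   python filter_papers.py --traind training_papers/ --input unseen_papers/ --out filter_output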
data = load_files(container_path=args.traind, encoding=None,
                  decode_error='replace')
labels = data.target_names
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)
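
# Training mode: --traind given without --input. Grid-search an SVC over the
# labelled papers and persist the best model together with the TF-IDF vectorizer.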
if args.traind and not args.input:
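    # Build the hyper-parameter grid from model_params.conf, a CSV file with
    # one column per SVC parameter and one candidate value per row.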
    f0 = open("model_params.conf")
    n_iter_search = 10
    params = [p for p in csv.DictReader(f0)]
    f0.close()
    names = list(params[0].keys())
    model_params = {n: [] for n in names}
    for n in names:
        for d in params:
            for k in d:
                if k == n:
                    try:
                        model_params[n].append(float(d[k]))
                    except ValueError:
                        model_params[n].append(d[k])
    model_params = {k: list(set(model_params[k])) for k in model_params}
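    # Fit the TF-IDF vectorizer on the training papers and transform them.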
    papers = data.data
    tfidf_model = vectorizer.fit(papers)
    X = vectorizer.transform(papers)
    #y = [x['topic'] for x in abstracs]
    y = data.target
    #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
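    # Exhaustive grid search over the parameter grid with 3-fold CV, scored by F1.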
    clf = SVC()  # kernel='linear', C=100.0, gamma=0.0001, degree=11, coef0=0.9
    # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5,
    #                          n_iter=n_iter_search, n_jobs=-1, scoring='f1')
    clf = GridSearchCV(clf, cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)
    #clf.fit(X_train, y_train)
    print("GridSearch took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))
    print(clf.best_estimator_)
    print()
    print(clf.best_score_)
    #print(metrics.f1_score(clf.predict(X_test), y_test))
    #joblib.dump(clf, 'model/svm_model.pkl')
    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
else:
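    # Prediction mode: vectorize the input papers with the saved TF-IDF model
    # and classify them with the pretrained SVC.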
    data = load_files(container_path=args.input, encoding=None,
                      decode_error='replace')
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
    papers = data.data
    X = vectorizer.transform(papers)
    classes = clf.predict(X)
    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files: load_files yields raw documents (not
    # title/body records), so each paper is listed by its file name under the
    # label the classifier assigned to it.
    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
            open(args.out + "/" + labels[1] + ".out", 'w') as f1:
        for c, fname in zip(classes, data.filenames):
            if c == 0:
                f0.write("%s\n" % fname)
            elif c == 1:
                f1.write("%s\n" % fname)