Ignacio Arroyo

testing pretrained model

......@@ -13,14 +13,10 @@ from sklearn import metrics
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.grid_search import GridSearchCV
from sklearn.externals import joblib
import pandas as pd
from numpy import mean, std
#Classifier = KNeighborsClassifier # 0.6464
#Classifier = NearestCentroid # 0.5054
#Classifier = RandomForestClassifier # 0.49
#Classifier = LinearSVC # 0.5402
#Classifier = SGDClassifier # 0.664
class EstimatorSelectionHelper:
"http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/"
......@@ -71,7 +67,7 @@ def get_abstracts(file_name, label):
docs = []
empties = []
lines = f.readlines()
copyright = False
cpright = False
for i, ln in enumerate(lines):
if not ln.strip():
......@@ -84,8 +80,8 @@ def get_abstracts(file_name, label):
break
continue
elif 'Copyright ' in ln:
copyright = True
elif 'cpright ' in ln:
cpright = True
elif 'DOI: ' in ln:
if 'PMCID: ' in lines[i + 1]:
......@@ -93,9 +89,9 @@ def get_abstracts(file_name, label):
elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
extract['pmid'] = int(lines[i + 1].strip().split()[1])
if copyright:
if cpright:
get = slice(empties[-3], empties[-2])
copyright = False
cpright = False
else:
get = slice(empties[-2], empties[-1])
......@@ -115,14 +111,14 @@ def get_abstracts(file_name, label):
return docs
filename="data/ecoli_abstracts/not_useful_abstracts.txt"
filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
labels = ['useless', 'useful']
abstracs = get_abstracts(file_name = filename, label = labels[0])
abstracs = get_abstracts(file_name=filename, label=labels[0])
filename="data/ecoli_abstracts/useful_abstracts.txt"
filename = "data/ecoli_abstracts/useful_abstracts.txt"
abstracs += get_abstracts(file_name = filename, label = labels[1])
abstracs += get_abstracts(file_name=filename, label=labels[1])
X = [x['body'] for x in abstracs]
y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]
......@@ -136,15 +132,17 @@ models1 = {
}
params1 = {
'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
'RandomForestClassifier': { 'n_estimators': [16, 32] },
'AdaBoostClassifier': { 'n_estimators': [16, 32] },
'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
'ExtraTreesClassifier': {'n_estimators': [16, 32]},
'RandomForestClassifier': {'n_estimators': [16, 32]},
'AdaBoostClassifier': {'n_estimators': [16, 32]},
'GradientBoostingClassifier': {'n_estimators': [16, 32],
'learning_rate': [0.8, 1.0]},
'SVC': [
#{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]},
{'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
{'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400], 'degree': [2, 3, 4, 5, 6]},
{'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
{'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
{'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
'coef0': [0.1, 0.2,0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
]
}
......@@ -171,6 +169,9 @@ X = vectorizer.fit_transform(X)
#st()
clf.fit(X, y, scoring='f1', n_jobs=-1)
joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
joblib.dump(clf.best_estimator_, 'model/tifidf_model.pkl')
#pred = clf.predict(X_test)
#print(metrics.f1_score(y_test, pred, average='macro'))
print(clf.score_summary(sort_by='min_score'))
......
from pdb import set_trace as st
from sklearn.cross_validation import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import argparse
import csv
from sklearn.externals import joblib
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import expon
def _collapse_lines(chunk):
    """Join raw file lines into one string, flattening newlines.

    Joining with a space and then turning each newline into a space leaves
    doubled spaces at every line boundary; those are collapsed to one.
    """
    return " ".join(chunk).replace("\n", " ").replace("  ", " ")


def get_abstracts(file_name, label):
    """Parse a plain-text dump of abstracts into a list of record dicts.

    The parser is line oriented and relies on these markers:

    * a blank line separates paragraphs (its index is remembered);
    * a line containing ' doi: ' is treated as the citation line, and the
      title starts right after the first blank line that follows it;
    * a line containing 'Copyright ' means an extra paragraph sits between
      the abstract body and the closing identifier lines;
    * a line containing 'DOI: ' closes the record; the PMID is read from the
      next line, or from the line after it when a 'PMCID: ' line comes first.

    NOTE(review): this looks like the PubMed "abstract (text)" export
    layout -- confirm against the real data files.

    Args:
        file_name: Path of the text file to read.
        label: Value stored under the 'topic' key of every record.

    Returns:
        A list of dicts with the keys 'body', 'title' and 'topic', plus
        'pmid' (int) when a PMID line was found for the record.

    Raises:
        IndexError: If a record is closed before enough blank lines were
            seen to delimit its body, or the 'DOI: ' line is the last line.
        ValueError: If the PMID token is not an integer.
    """
    # Read everything up front so the handle is closed before parsing starts.
    with open(file_name) as f:
        lines = f.readlines()
    n_lines = len(lines)

    docs = []
    extract = {}
    empties = []           # indices of blank lines seen in the current record
    title_idx = None       # index of the first title line, once known
    has_copyright = False

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
        elif ' doi: ' in ln:
            # Citation line: the title begins after the next blank line.
            # The look-ahead is clamped so it cannot run past the file end.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
        elif 'Copyright ' in ln:
            has_copyright = True
        elif 'DOI: ' in ln:
            next_ln = lines[i + 1]
            if 'PMCID: ' in next_ln:
                # The PMID line follows the PMCID line.
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
            elif 'PMID: ' in next_ln:
                extract['pmid'] = int(next_ln.strip().split()[1])

            # The body is the paragraph before the last blank line, or one
            # paragraph earlier when a copyright paragraph sits in between.
            if has_copyright:
                get = slice(empties[-3], empties[-2])
                has_copyright = False
            else:
                get = slice(empties[-2], empties[-1])
            extract['body'] = _collapse_lines(lines[get])

            # Title: up to five consecutive non-blank lines from title_idx.
            # With no citation line for this record the title stays empty.
            title = []
            if title_idx is not None:
                for j in range(title_idx, min(title_idx + 5, n_lines)):
                    if not lines[j].strip():
                        break
                    title.append(lines[j])
            extract['title'] = _collapse_lines(title)

            extract['topic'] = label
            docs.append(extract)

            # Start the next record from a clean state.
            empties = []
            extract = {}
            title_idx = None

    return docs
# Command-line interface. --classA/--classB name the two training files,
# --input names the file to classify, --out is the directory that receives
# the prediction files and --svcmodel is the pickled classifier to load.
# Adjacent string literals are concatenated verbatim, so every fragment that
# continues on the next line must end with its own trailing space.
parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do "
                "not.")
parser.add_argument("--input",
                    help="Input file containing the abstracts to "
                         "be predicted.")
parser.add_argument("--classA",
                    help="Input file containing the abstracts of "
                         "class A to be learned.")
parser.add_argument("--classB",
                    help="Input file containing the abstracts of "
                         "class B to be learned.")
parser.add_argument("--out",
                    help="Path to the output directory "
                         "(default='./filter_output')",
                    default="filter_output")
parser.add_argument("--svcmodel",
                    help="Path to custom pretrained svc model "
                         "(default='./model/svm_model.pkl')",
                    default="model/svm_model.pkl")
# Parses sys.argv at import time; the rest of the script reads `args`.
args = parser.parse_args()
import os  # local to this script section: only the prediction branch needs it

# Class name -> numeric id used as the training target, and the reverse
# mapping used to name the prediction output files.
labels = {'useless': 0, 'useful': 1}
label_names = {idx: name for name, idx in labels.items()}

# Single path shared by training (dump) and prediction (load). The spelling
# is the one the training branch writes, so a vectorizer saved by an earlier
# training run is found again at prediction time.
TFIDF_MODEL_PATH = 'model/tifidf_model.pkl'

vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.classA and args.classB and not args.input:
    # ---- Training mode: both class files given, nothing to predict. ----
    # model_params.conf is a CSV whose header names SVC parameters; every
    # column becomes the list of candidate values for that parameter.
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))

    model_params = {}
    for row in params:
        for name, raw in row.items():
            try:
                value = float(raw)
            except ValueError:
                value = raw  # non-numeric entries (e.g. kernel names)
            candidates = model_params.setdefault(name, [])
            # Repeated values would only make the grid evaluate identical
            # candidates again, so each distinct value is kept once.
            if value not in candidates:
                candidates.append(value)

    abstracs = get_abstracts(file_name=args.classA, label=labels['useless'])
    abstracs += get_abstracts(file_name=args.classB, label=labels['useful'])
    X = vectorizer.fit_transform([x['body'] for x in abstracs])
    y = [x['topic'] for x in abstracs]

    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)
    # Report the number of candidates the search actually evaluated.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))
    print(clf.best_estimator_)
    print(clf)
    print(clf.best_score_)

    # Persist the classifier together with the fitted vectorizer: prediction
    # needs the exact vocabulary and idf weights learned here.
    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(vectorizer, TFIDF_MODEL_PATH)
elif args.input:
    # ---- Prediction mode. ----
    # NOTE(security): joblib.load unpickles arbitrary objects; only point
    # --svcmodel at model files from a trusted source.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load(TFIDF_MODEL_PATH)

    abstracs = get_abstracts(file_name=args.input, label='unknown')
    # transform(), not fit_transform(): refitting would build a new
    # vocabulary that no longer matches the features the model was trained on.
    X = vectorizer.transform([x['body'] for x in abstracs])
    classes = clf.predict(X)

    if not os.path.isdir(args.out):
        os.makedirs(args.out)
    # One tab-separated "pmid<TAB>body" file per predicted class.
    with open(os.path.join(args.out, label_names[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, label_names[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            if c == 0:
                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
            elif c == 1:
                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
else:
    raise SystemExit("Nothing to do: give --classA and --classB to train, "
                     "or --input to classify.")
No preview for this file type
No preview for this file type
kernel,degree,coef0,C,gamma
poly,3,0.2,300,0
poly,11,0.9,150,0
rbf,0,0.5,100,0.0001
linear,1,0.5,100,0.0
linear,1,1.5,100,0.0
linear,1,2.5,100,0.0
linear,1,3.5,100,0.0
linear,1,4.5,100,0.0
linear,1,1.5,150,0.0
linear,1,2.5,200,0.0
linear,1,3.5,300,0.0
linear,1,4.5,400,0.0
\ No newline at end of file