iarroyof

Final version for abstracts

#from pdb import set_trace as st
from sklearn.cross_validation import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
@@ -12,9 +9,6 @@ import csv
import os
from sklearn.externals import joblib
from time import time
-from scipy.stats import randint as sp_randint
-from scipy.stats import expon
-from sklearn.preprocessing import label_binarize
def get_abstracts(file_name, label):
@@ -75,22 +69,21 @@ parser = argparse.ArgumentParser(
parser.add_argument("--input", help="Input file containing the abstracts to"
"be predited.")
parser.add_argument("--classA", help="Input file containing the abstracts of"
"class A to be learned.")
" class useless to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of"
"class B to be learned.")
" class USEFUL to be learned.")
parser.add_argument("--out", help="Path to the output directory "
"(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
"(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
"(default='./model_binClass/svm_model.pkl')", default="model_binClass/svm_model.pkl")
args = parser.parse_args()
labels = {0: 'useless', 1: 'useful'}
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)
if args.classA and args.classB and not args.input:
f0 = open("model_params.conf")
f0 = open("model_params_binClass.conf")
n_iter_search = 10
params = [p for p in csv.DictReader(f0)]
f0.close()
@@ -115,38 +108,38 @@ if args.classA and args.classB and not args.input:
svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
svd_model = svd.fit(X)
X = svd_model.transform(X)
#y = [x['topic'] for x in abstracs]
y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]
#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
-clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
-clf = GridSearchCV(clf, cv=3,
-param_grid=model_params,
-# clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+clf = SVC()
+clf = GridSearchCV(clf, cv=3, param_grid=model_params,
n_jobs=-1, scoring='f1')
start = time()
clf.fit(X, y)
#clf.fit(X_train, y_train)
print("GridSearch took %.2f seconds for %d candidates"
" parameter settings." % ((time() - start), n_iter_search))
print()
print("The best model parameters:")
print(vectorizer)
print(svd)
print(clf.best_estimator_)
print()
print("The best F1 score:")
print(clf.best_score_)
#print(metrics.f1_score(clf.predict(X_test), y_test))
#joblib.dump(clf, 'model/svm_model.pkl')
-joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
-joblib.dump(svd_model, 'model/svd_model.pkl')
+joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.pkl')
+joblib.dump(tfidf_model, 'model_binClass/tfidf_model.pkl')
+joblib.dump(svd_model, 'model_binClass/svd_model.pkl')
else:
clf = joblib.load(args.svcmodel)
-vectorizer = joblib.load('model/tfidf_model.pkl')
-svd = joblib.load('model/svd_model.pkl')
+vectorizer = joblib.load('model_binClass/tfidf_model.pkl')
+svd = joblib.load('model_binClass/svd_model.pkl')
print(vectorizer)
print(svd)
print(clf)
abstracs = get_abstracts(file_name=args.input, label='unknown')
X = vectorizer.transform([x['body'] for x in abstracs])
X = svd.transform(X)
@@ -162,3 +155,5 @@ else:
f0.write("%d\t%s\n" % (a['pmid'], a['body']))
elif c == 1:
f1.write("%d\t%s\n" % (a['pmid'], a['body']))
print ("FINISHED!!")
#from pdb import set_trace as st
from sklearn.cross_validation import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.svm import OneClassSVM
import numpy as np
import argparse
import csv
import os
from sklearn.externals import joblib
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import expon
from sklearn.preprocessing import label_binarize
def get_abstracts(file_name, label):
@@ -75,22 +70,22 @@ parser = argparse.ArgumentParser(
parser.add_argument("--input", help="Input file containing the abstracts to"
"be predited.")
parser.add_argument("--classA", help="Input file containing the abstracts of"
"class A to be learned.")
" class USEFUL to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of"
"class B to be learned.")
" class useless to be learned.")
parser.add_argument("--out", help="Path to the output directory "
"(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
"(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
"(default='./model/svm_model.pkl')", default="model_oneClass/svm_model.pkl")
args = parser.parse_args()
labels = {0: 'useless', 1: 'useful'}
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)
if args.classA and args.classB and not args.input:
f0 = open("model_params.conf")
f0 = open("model_params_oneClass.conf")
n_iter_search = 10
params = [p for p in csv.DictReader(f0)]
f0.close()
@@ -115,38 +110,38 @@ if args.classA and args.classB and not args.input:
svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
svd_model = svd.fit(X)
X = svd_model.transform(X)
#y = [x['topic'] for x in abstracs]
-y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]
#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
+y = [-1 if x['topic'] == 'useless' else 1 for x in abstracs]
-clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
-clf = GridSearchCV(clf, cv=3,
-param_grid=model_params,
-# clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+clf = OneClassSVM()
+clf = GridSearchCV(clf, cv=3, param_grid=model_params,
n_jobs=-1, scoring='f1')
start = time()
clf.fit(X, y)
#clf.fit(X_train, y_train)
print("GridSearch took %.2f seconds for %d candidates"
" parameter settings." % ((time() - start), n_iter_search))
print()
print("The best model parameters:")
print(vectorizer)
print(svd)
print(clf.best_estimator_)
print()
print("The best F1 score:")
print(clf.best_score_)
#print(metrics.f1_score(clf.predict(X_test), y_test))
#joblib.dump(clf, 'model/svm_model.pkl')
-joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
-joblib.dump(svd_model, 'model/svd_model.pkl')
+joblib.dump(clf.best_estimator_, 'model_oneClass/svm_model.pkl')
+joblib.dump(tfidf_model, 'model_oneClass/tfidf_model.pkl')
+joblib.dump(svd_model, 'model_oneClass/svd_model.pkl')
else:
clf = joblib.load(args.svcmodel)
-vectorizer = joblib.load('model/tfidf_model.pkl')
-svd = joblib.load('model/svd_model.pkl')
+vectorizer = joblib.load('model_oneClass/tfidf_model.pkl')
+svd = joblib.load('model_oneClass/svd_model.pkl')
print(vectorizer)
print(svd)
print(clf)
abstracs = get_abstracts(file_name=args.input, label='unknown')
X = vectorizer.transform([x['body'] for x in abstracs])
X = svd.transform(X)
@@ -158,7 +153,9 @@ else:
with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
open(args.out + "/" + labels[1] + ".out", 'w') as f1:
for c, a in zip(classes, abstracs):
-if c == 0:
+if c == 1:
f0.write("%d\t%s\n" % (a['pmid'], a['body']))
-elif c == 1:
+elif c == -1:
f1.write("%d\t%s\n" % (a['pmid'], a['body']))
print("FINISHED!!")
kernel,degree,coef0,C,gamma
linear,1,0.5,100,0.0
linear,1,0.5,10,0.0
linear,1,0.5,50,0.0
linear,1,0.5,100,0.0
linear,1,0.5,5,0.0
linear,1,0.5,150,0.0
linear,1,0.5,200,0.0
linear,1,0.5,300,0.0
linear,1,0.5,400,0.0
linear,1,0.5,1.0,0.0
linear,1,0.5,5.0,0.0
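
The grid above (note the C column) matches the SVC search space; given that the training branch opens model_params_binClass.conf, this is presumably that file. The code that turns these rows into the model_params grid passed to GridSearchCV sits in a collapsed hunk, so the following is only a minimal sketch under the assumption that each CSV row becomes one candidate parameter combination; all names here are illustrative, not the committed code.

# Hypothetical sketch (not the committed code): read the CSV rows above into a
# list of one-combination grids for GridSearchCV over an SVC. gamma=0.0 in the
# grid presumably relies on an older scikit-learn that still maps 0.0 to 'auto'.
import csv
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

with open("model_params_binClass.conf") as conf:   # assumed to be this CSV
    rows = list(csv.DictReader(conf))

# One dict per CSV row; single-element lists make each row one candidate
# combination rather than a cross-product of all column values.
model_params = [{"kernel": [r["kernel"]],
                 "degree": [int(r["degree"])],
                 "coef0": [float(r["coef0"])],
                 "C": [float(r["C"])],
                 "gamma": [float(r["gamma"])]} for r in rows]

clf = GridSearchCV(SVC(), param_grid=model_params, cv=3, n_jobs=-1, scoring="f1")

Passing param_grid as a list of single-value dicts keeps the search restricted to the listed rows instead of the full cross-product of the column values.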
kernel,degree,coef0,nu,gamma
linear,1,0.5,1.0,0.0
linear,1,0.5,0.9,0.0
linear,1,0.5,0.8,0.0
linear,1,0.5,0.7,0.0
linear,1,0.5,0.6,0.0
linear,1,0.5,0.5,0.0
linear,1,0.5,0.4,0.0
linear,1,0.5,0.3,0.0
linear,1,0.5,0.2,0.0
linear,1,0.5,0.1,0.0
rbf,1,0.5,1.0,2.0
rbf,1,0.5,0.9,0.0001
rbf,1,0.5,0.8,0.0001
rbf,1,0.5,0.7,0.0001
rbf,1,0.5,0.6,0.001
rbf,1,0.5,0.5,0.001
rbf,1,0.5,0.4,0.001
rbf,1,0.5,0.7,0.0001
rbf,1,0.5,0.4,0.0001
rbf,1,0.5,0.5,0.0001
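
This second grid swaps the C column for nu, which matches the OneClassSVM used in the one-class variant above (nu must lie in (0, 1], as the listed values do; degree and coef0 are ignored by the linear and rbf kernels). Below is a minimal sketch of how one row of such a grid could drive the same GridSearchCV call, assuming the collapsed code builds model_params in the same way as the binary case; the values shown are illustrative only.

# Hypothetical sketch (not the committed code): grid search over OneClassSVM
# using one illustrative row of the grid above. OneClassSVM.fit ignores y, but
# GridSearchCV still uses the -1/+1 labels to compute each candidate's F1
# score (f1 treats +1 as the positive class by default).
from sklearn.model_selection import GridSearchCV
from sklearn.svm import OneClassSVM

model_params = [{"kernel": ["rbf"], "nu": [0.5], "gamma": [0.001]}]
clf = GridSearchCV(OneClassSVM(), param_grid=model_params, cv=3,
                   n_jobs=-1, scoring="f1")
# clf.fit(X, y) with y in {-1, +1} then selects the row whose held-out
# predictions best match the labels.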