# filter_papers.py
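"""Separate biomedical papers that report experimental data from those that
do not, by training or applying a TF-IDF + SVC text classifier."""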
import argparse
import csv
import os
from time import time

import joblib  # replaces the deprecated sklearn.externals.joblib
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC


parser = argparse.ArgumentParser(
    description="This script separates biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input directory containing the papers to "
                                    "be predicted.")
parser.add_argument("--traind", help="Input directory containing the papers of "
                                     "two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                                  "(default='./filter_output')",
                    default="filter_output")
parser.add_argument("--svcmodel", help="Path to a custom pretrained SVC model "
                                       "(default='./model/svm_model.paper.pkl')",
                    default="model/svm_model.paper.pkl")

args = parser.parse_args()
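# Two modes: with --traind (and no --input) the script trains and saves a new
# model; otherwise it loads a pretrained model to classify the papers under --input.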

if args.traind:
    # Class names come from the training directory layout (one subdirectory
    # per class, as expected by load_files()).
    data = load_files(container_path=args.traind, encoding=None,
                      decode_error='replace')
    labels = data.target_names
else:
    # Without a training directory the class names are unknown; fall back to
    # the numeric class ids assigned by the classifier (an assumption).
    labels = ["0", "1"]

vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.traind and not args.input:
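    # model_params.conf is assumed to be a CSV whose header row names SVC
    # hyperparameters (e.g. C, gamma, kernel) and whose rows list candidate values.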
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))
    names = list(params[0].keys())
    model_params = {n: [] for n in names}

    # Collect every candidate value for each parameter, coercing numeric
    # strings to floats and keeping the rest (e.g. kernel names) as strings.
    for d in params:
        for n, v in d.items():
            try:
                model_params[n].append(float(v))
            except ValueError:
                model_params[n].append(v)

    model_params = {k: list(set(model_params[k])) for k in model_params}
    papers = data.data

    # Fit the TF-IDF vocabulary on the training papers and vectorize them;
    # fit() returns the vectorizer itself, which is saved with the model below.
    tfidf_model = vectorizer.fit(papers)
    X = vectorizer.transform(papers)
    y = data.target

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    clf = SVC()  # e.g. kernel='linear', C=100.0, gamma=0.0001, degree=11, coef0=0.9
    # Alternative search strategy:
    # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5,
    #                          n_iter=10, n_jobs=-1, scoring='f1')
    clf = GridSearchCV(clf, param_grid=model_params, cv=3,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # clf.fit(X_train, y_train)
    n_candidates = len(clf.cv_results_['params'])
    print("GridSearch took %.2f seconds for %d candidate"
          " parameter settings." % (time() - start, n_candidates))

    print(clf.best_estimator_)
    print()
    print("Best cross-validated F1: %f" % clf.best_score_)
    # print(metrics.f1_score(y_test, clf.predict(X_test)))

    # Persist the best estimator and the fitted vectorizer for prediction runs.
    os.makedirs("model", exist_ok=True)
    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')

else:
    if not args.input:
        parser.error("either --traind or --input must be given")
    # Prediction mode: vectorize the unseen papers with the saved TF-IDF model
    # and classify them with the pretrained SVC.
    data = load_files(container_path=args.input, encoding=None,
                      decode_error='replace')
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
    papers = data.data
    X = vectorizer.transform(papers)
    classes = clf.predict(X)

    os.makedirs(args.out, exist_ok=True)
    # Writing predictions to output files. load_files() yields raw document
    # strings rather than title/body fields, so each paper's source filename
    # is written to the file of its predicted class.
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, fname in zip(classes, data.filenames):
            if c == 0:
                f0.write("%s\n" % fname)
            elif c == 1:
                f1.write("%s\n" % fname)
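
# Example usage (paths are illustrative):
#   python filter_papers.py --traind data/train
#   python filter_papers.py --input data/unseen --out filter_output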