# filter_papers.py 5.18 KB
#from pdb import set_trace as st
from sklearn.cross_validation import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
from sklearn.svm import SVC
import numpy as np
import argparse
import csv
import os
from sklearn.externals import joblib
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import expon
from sklearn.preprocessing import label_binarize
from sklearn.datasets import load_files

# CMC: Run example
# python3.4 filter_papers.py --traind /home/cmendezc/gitlab_repositories/useless/data/TEXT_FILES

# Command-line interface.
# The script trains when --traind is given without --input; otherwise it
# predicts on the papers found under --input.
# Adjacent string literals are concatenated verbatim, so every continued
# help line must end with an explicit space.
parser = argparse.ArgumentParser(
    description="This script separates biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input directory containing the papers to "
                                    "be predicted.")
parser.add_argument("--traind", help="Input directory containing the papers of "
                                     "two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                    "(default='./model_binClass/svm_model.paper.pkl')",
                    default="model_binClass/svm_model.paper.pkl")
parser.add_argument("--split", default=False,
                    action="store_true", dest="split",
                    help="Automatic split training/test of input data ")

args = parser.parse_args()
# Names used for the two per-class prediction files, keyed by predicted
# class id (0 and 1).
labels = {0: 'useless', 1: 'useful'}

def _load_param_grid(conf_path):
    """Read a CSV file of hyper-parameter values into a grid-search dict.

    Each CSV column is one parameter name and each row contributes one value
    for every column. Values that parse as numbers are stored as floats, all
    others are kept as strings. Duplicate values are dropped, keeping the
    order in which they first appear.

    Args:
        conf_path: Path to the CSV file (the header row holds the names).

    Returns:
        A dict mapping each parameter name to the list of its distinct values.

    Raises:
        ValueError: If the file contains no data rows.
    """
    # newline="" is what the csv module asks for when reading files.
    with open(conf_path, newline="") as conf_file:
        rows = list(csv.DictReader(conf_file))
    if not rows:
        raise ValueError("No parameter rows found in %s" % conf_path)

    grid = {}
    for name in rows[0]:
        values = []
        for row in rows:
            raw = row[name]
            try:
                value = float(raw)
            except ValueError:
                # Non-numeric settings (e.g. a kernel name) stay as text.
                value = raw
            if value not in values:
                values.append(value)
        grid[name] = values
    return grid


if args.traind and not args.input:
    # ---- Training mode: learn TF-IDF + SVC from the labelled folders ----
    # NOTE(review): with encoding=None the files are loaded as raw bytes, so
    # decode_error presumably has no effect here and decoding is left to the
    # vectorizer -- confirm against the load_files documentation.
    data = load_files(container_path=args.traind, encoding=None,
                      decode_error='replace')
    labels = data.target_names
    vectorizer = TfidfVectorizer(binary=True)
    print(vectorizer)
    model_params = _load_param_grid("model_params_binClass.conf")

    # CMC: separate in training - validation datasets
    if args.split:
        X_train, X_test, y_train, y_test = train_test_split(
            data.data, data.target, test_size=0.25, random_state=42)
    else:
        X_train, y_train = data.data, data.target

    # The vectorizer is fitted on the training documents only, so the
    # held-out documents (if any) never influence the vocabulary.
    X = vectorizer.fit_transform(X_train)
    y = y_train
    tfidf_model = vectorizer

    clf = GridSearchCV(SVC(), param_grid=model_params, cv=3,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of candidates that were really evaluated.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    # Make sure the target directory exists so the trained model is not lost
    # after the (potentially long) search.
    model_path = 'model_binClass/svm_model.paper.pkl'
    tfidf_path = 'model_binClass/tfidf_model.paper.pkl'
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    joblib.dump(clf.best_estimator_, model_path)
    joblib.dump(tfidf_model, tfidf_path)

    if args.split:
        # Evaluate the refitted best model on the held-out quarter.
        y_pred = clf.predict(vectorizer.transform(X_test))
        print("Test results: ")
        print("Precision: {}".format(precision_score(y_test, y_pred)))
        print("Recall: {}".format(recall_score(y_test, y_pred)))
        print("F-score: {}".format(f1_score(y_test, y_pred)))
        print("Classification report:")
        print(classification_report(y_test, y_pred))
else:
    # ---- Prediction mode: classify the papers found under --input ----
    if not args.input:
        parser.error("one of --traind (training) or --input (prediction) "
                     "is required")
    data = load_files(container_path=args.input, encoding=None,
                      decode_error='replace')
    # SECURITY: joblib.load unpickles the file and can run arbitrary code;
    # only load model files from a trusted source.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model_binClass/tfidf_model.paper.pkl')
    X = vectorizer.transform(data.data)

    classes = clf.predict(X)
    os.makedirs(args.out, exist_ok=True)

    # NOTE(review): class ids come from the folder names seen at training
    # time, while `labels` is hard-coded -- verify that id 0 / id 1 really
    # correspond to labels[0] / labels[1] for the model being loaded.
    # Writing predictions to output files: one file per class, one input
    # file path per line.
    path_class0 = os.path.join(args.out, labels[0] + "-binClass-paper.out")
    path_class1 = os.path.join(args.out, labels[1] + "-binClass-paper.out")
    with open(path_class0, 'w') as f0, open(path_class1, 'w') as f1:
        for c, filename in zip(classes, data.filenames):
            if c == 0:
                f0.write("%s\n" % filename)
            elif c == 1:
                f1.write("%s\n" % filename)