Carlos-Francisco Méndez-Cruz

Final version binClass for papers

......@@ -3,7 +3,8 @@ from sklearn.cross_validation import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
from sklearn.svm import SVC
import numpy as np
import argparse
......@@ -16,18 +17,23 @@ from scipy.stats import expon
from sklearn.preprocessing import label_binarize
from sklearn.datasets import load_files
# CMC: Run example
# python3.4 filter_papers.py --traind /home/cmendezc/gitlab_repositories/useless/data/TEXT_FILES
parser = argparse.ArgumentParser(
description="This script separates biomedical papers that"
"report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input file containing the to"
parser.add_argument("--input", help="Input directory containing the papers to"
"be predicted.")
parser.add_argument("--traind", help="Input directory containing the papers of"
"two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
"(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
"(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl")
"(default='./model_binClass/svm_model.paper.pkl')", default="model_binClass/svm_model.paper.pkl")
parser.add_argument("--split", default=False,
action="store_true", dest="split",
help="Automatic split training/test of input data ")
args = parser.parse_args()
labels = {0: 'useless', 1: 'useful'}
......@@ -56,10 +62,19 @@ if args.traind and not args.input:
model_params = {k: list(set(model_params[k])) for k in model_params}
tfidf_model = vectorizer.fit(data.data)
X = vectorizer.transform(data.data)
#y = [x['topic'] for x in abstracs]
y = data.target
# CMC: separate in training - validation datasets
if args.split:
X_train, X_test, y_train, y_test = train_test_split(data.data, labels, test_size = 0.25, random_state = 42)
tfidf_model = vectorizer.fit(X_train)
X = vectorizer.transform(X_train)
y = y_train
else:
#y = [x['topic'] for x in abstracs]
# Original Nacho:
tfidf_model = vectorizer.fit(data.data)
X = vectorizer.transform(data.data)
y = data.target
#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
......@@ -81,15 +96,22 @@ if args.traind and not args.input:
#print(metrics.f1_score(clf.predict(X_test), y_test))
#joblib.dump(clf, 'model/svm_model.pkl')
joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.paper.pkl')
joblib.dump(tfidf_model, 'model_binClass/tfidf_model.paper.pkl')
if args.split:
X = vectorizer.transform(X_test)
y_pred = clf.predict(X)
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
else:
from pdb import set_trace as st
data=load_files(container_path=args.input, encoding=None,
decode_error='replace')
clf = joblib.load(args.svcmodel)
vectorizer = joblib.load('model/tfidf_model.paper.pkl')
vectorizer = joblib.load('model_binClass/tfidf_model.paper.pkl')
X = vectorizer.transform(data.data)
classes = clf.predict(X)
......@@ -97,10 +119,10 @@ else:
if not os.path.exists(args.out):
os.makedirs(args.out)
# Writing predictions to output files
with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
open(args.out + "/" + labels[1] + ".out", 'w') as f1:
for c, a in zip(classes, papers):
with open(args.out + "/" + labels[0] + "-binClass-paper.out", 'w') as f0, \
open(args.out + "/" + labels[1] + "-binClass-paper.out", 'w') as f1:
for c, a in zip(classes, data):
if c == 0:
f0.write("%d\t%s\n" % (a['title'], a['body']))
f0.write("%d\n" % (a['title']))
elif c == 1:
f1.write("%d\t%s\n" % (a['title'], a['body']))
f1.write("%d\n" % (a['title']))
......