testing pretrained model

Ignacio Arroyo
Commit cf814cd82081f2dde279dfa53c3275712782f312 cf814cd8 1 parent e2cb0d9e
Showing 5 changed files with 186 additions and 22 deletions
classify_abstracts.py
filter_abstracts.py
model/svm_model.pkl
model/tifidf_model.pkl
model_params.conf
--- a/classify_abstracts.py
View file @cf814cd
+++ b/classify_abstracts.py
View file @cf814cd
@@ -13,14 +13,10 @@ from sklearn import metrics
 from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                               AdaBoostClassifier, GradientBoostingClassifier)
 from sklearn.grid_search import GridSearchCV
+ from sklearn.externals import joblib
 import pandas as pd
 from numpy import mean, std
 
- #Classifier = KNeighborsClassifier # 0.6464
- #Classifier = NearestCentroid # 0.5054
- #Classifier = RandomForestClassifier # 0.49
- #Classifier = LinearSVC # 0.5402
- #Classifier = SGDClassifier # 0.664
 
 class EstimatorSelectionHelper:
     "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/"
@@ -71,7 +67,7 @@ def get_abstracts(file_name, label):
     docs = []
     empties = []
     lines = f.readlines()
-     copyright = False
+     cpright = False
 
     for i, ln in enumerate(lines):
         if not ln.strip():
@@ -84,8 +80,8 @@ def get_abstracts(file_name, label):
                     break
             continue
 
-         elif 'Copyright ' in ln:
-             copyright = True
+         elif 'cpright ' in ln:
+             cpright = True
 
         elif 'DOI: ' in ln:
             if 'PMCID: ' in lines[i + 1]:
@@ -93,9 +89,9 @@ def get_abstracts(file_name, label):
             elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
                 extract['pmid'] = int(lines[i + 1].strip().split()[1])
 
-             if copyright:
+             if cpright:
                 get = slice(empties[-3], empties[-2])
-                 copyright = False
+                 cpright = False
             else:
                 get = slice(empties[-2], empties[-1])
 
@@ -115,14 +111,14 @@ def get_abstracts(file_name, label):
     return docs
 
 
- filename="data/ecoli_abstracts/not_useful_abstracts.txt"
+ filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
 labels = ['useless', 'useful']
 
- abstracs = get_abstracts(file_name = filename, label = labels[0])
+ abstracs = get_abstracts(file_name=filename, label=labels[0])
 
- filename="data/ecoli_abstracts/useful_abstracts.txt"
+ filename = "data/ecoli_abstracts/useful_abstracts.txt"
 
- abstracs += get_abstracts(file_name = filename, label = labels[1])
+ abstracs += get_abstracts(file_name=filename, label=labels[1])
 
 X = [x['body'] for x in abstracs]
 y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]
@@ -136,15 +132,17 @@ models1 = {
 }
 
 params1 = {
-     'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
-     'RandomForestClassifier': { 'n_estimators': [16, 32] },
-     'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
-     'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
+     'ExtraTreesClassifier': {'n_estimators': [16, 32]},
+     'RandomForestClassifier': {'n_estimators': [16, 32]},
+     'AdaBoostClassifier': {'n_estimators': [16, 32]},
+     'GradientBoostingClassifier': {'n_estimators': [16, 32],
+                                     'learning_rate': [0.8, 1.0]},
     'SVC': [
-         #{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]},
-         {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
-         {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400], 'degree': [2, 3, 4, 5, 6]},
-         {'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
+         {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
+         'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
+         {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
+             'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
+             'coef0': [0.1, 0.2,0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
     ]
 }
 
@@ -171,6 +169,9 @@ X = vectorizer.fit_transform(X)
 #st()
 clf.fit(X, y, scoring='f1', n_jobs=-1)
 
+ joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+ joblib.dump(clf.best_estimator_, 'model/tifidf_model.pkl')
+ 
 #pred = clf.predict(X_test)
 #print(metrics.f1_score(y_test, pred, average='macro'))
 print(clf.score_summary(sort_by='min_score'))
--- a/filter_abstracts.py 0 → 100644
View file @cf814cd
+++ b/filter_abstracts.py 0 → 100644
View file @cf814cd
+ import argparse
+ import csv
+ import os
+ from pdb import set_trace as st
+ from time import time
+ 
+ import numpy as np
+ from scipy.stats import expon
+ from scipy.stats import randint as sp_randint
+ from sklearn import metrics
+ from sklearn.cross_validation import train_test_split as splitt
+ from sklearn.externals import joblib
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.model_selection import GridSearchCV
+ from sklearn.model_selection import RandomizedSearchCV
+ from sklearn.svm import SVC
+ 
+ 
+ def get_abstracts(file_name, label):
+     """Extract abstract records from a plain-text listing of citations.
+ 
+     The file is scanned line by line and one record is emitted each time a
+     line containing ``'DOI: '`` is found.  The positions of the blank lines
+     seen since the previous record delimit the paragraphs of the current
+     one.  (The markers suggest a PubMed text export -- TODO confirm.)
+ 
+     Args:
+         file_name: Path of the text file to parse.
+         label: Value stored under ``'topic'`` in every returned record.
+ 
+     Returns:
+         A list of dicts with the keys ``'body'``, ``'title'`` and
+         ``'topic'``.  ``'pmid'`` (int) is added when the line after
+         ``'DOI: '`` contains ``'PMID: '``, or when it contains ``'PMCID: '``
+         and a further non-blank line follows; otherwise the key is absent.
+         ``'title'`` is ``''`` when no ``' doi: '`` citation line preceded
+         the record.
+ 
+     Raises:
+         IndexError: If fewer blank lines precede a ``'DOI: '`` line than the
+             paragraph lookup needs, or the PMID line has a single token.
+         ValueError: If the second token of the PMID line is not an integer.
+     """
+     # Read everything up front; the context manager closes the handle.
+     with open(file_name) as f:
+         lines = f.readlines()
+ 
+     n_lines = len(lines)
+     docs = []
+     extract = {}
+     empties = []  # indices of the blank lines seen since the last record
+     has_copyright = False  # named so the ``copyright`` builtin is not shadowed
+     title_idx = None  # index of the first title line, once it is known
+ 
+     for i, ln in enumerate(lines):
+         if not ln.strip():
+             empties.append(i)
+ 
+         elif ' doi: ' in ln:
+             # The title starts right after the next blank line; look at most
+             # 10 lines ahead and never past the end of the file.
+             for j in range(i, min(i + 10, n_lines)):
+                 if not lines[j].strip():
+                     title_idx = j + 1
+                     break
+ 
+         elif 'Copyright ' in ln:
+             has_copyright = True
+ 
+         elif 'DOI: ' in ln:
+             # The PMID is either on the next line or, when a PMCID line
+             # comes first, on the one after it.
+             following = lines[i + 1] if i + 1 < n_lines else ''
+             pmid_line = ''
+             if 'PMCID: ' in following:
+                 if i + 2 < n_lines:
+                     pmid_line = lines[i + 2]
+             elif 'PMID: ' in following:
+                 pmid_line = following
+             if pmid_line.strip():
+                 extract['pmid'] = int(pmid_line.strip().split()[1])
+ 
+             # After a 'Copyright ' line the body is taken one blank-line gap
+             # further back (presumably to skip the copyright paragraph).
+             if has_copyright:
+                 get = slice(empties[-3], empties[-2])
+                 has_copyright = False
+             else:
+                 get = slice(empties[-2], empties[-1])
+ 
+             extract['body'] = " ".join(lines[get]).replace(
+                 "\n", ' ').replace("  ", ' ')
+ 
+             # Collect up to 5 consecutive non-blank title lines.
+             title = []
+             if title_idx is not None:
+                 for j in range(title_idx, min(title_idx + 5, n_lines)):
+                     if not lines[j].strip():
+                         break
+                     title.append(lines[j])
+             extract['title'] = " ".join(title).replace(
+                 "\n", ' ').replace("  ", ' ')
+             extract['topic'] = label
+             docs.append(extract)
+ 
+             # Reset the per-record state so nothing leaks into the next one.
+             extract = {}
+             empties = []
+             title_idx = None
+ 
+     return docs
+ 
+ 
+ # Command-line driver.  With --classA and --classB (and no --input) an SVC is
+ # grid-searched on the two labelled files and saved together with the fitted
+ # TF-IDF vectorizer.  With --input the saved models are loaded and the
+ # abstracts are written to one output file per predicted class.
+ parser = argparse.ArgumentParser(
+     description="This script separates abstracts of biomedical papers that "
+                 "report data from biomedical experiments from those that do "
+                 "not.")
+ parser.add_argument("--input", help="Input file containing the abstracts to "
+                                     "be predicted.")
+ parser.add_argument("--classA", help="Input file containing the abstracts of "
+                                      "class A to be learned.")
+ parser.add_argument("--classB", help="Input file containing the abstracts of "
+                                      "class B to be learned.")
+ parser.add_argument("--out", help="Path to the output directory "
+                                   "(default='./filter_output')",
+                     default="filter_output")
+ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
+                                        "(default='./model/svm_model.pkl')",
+                     default="model/svm_model.pkl")
+ 
+ args = parser.parse_args()
+ 
+ # Class name -> integer target used for training and prediction.
+ labels = {'useless': 0, 'useful': 1}
+ SVM_MODEL_PATH = 'model/svm_model.pkl'
+ # Saving and loading must use the same name; this spelling matches the
+ # artifact committed under model/.
+ TFIDF_MODEL_PATH = 'model/tifidf_model.pkl'
+ 
+ vectorizer = TfidfVectorizer(binary=True)
+ print(vectorizer)
+ 
+ 
+ def _parse_param(raw):
+     """Return a CSV cell as int, else float, else the unchanged string."""
+     for cast in (int, float):
+         try:
+             return cast(raw)
+         except ValueError:
+             continue
+     return raw
+ 
+ 
+ if args.classA and args.classB and not args.input:
+     # --- Training mode ---------------------------------------------------
+     with open("model_params.conf") as f0:
+         params = list(csv.DictReader(f0))
+ 
+     # Each CSV column becomes the list of values to try for that parameter.
+     # Repeated values are dropped: they would only multiply identical grid
+     # candidates.
+     model_params = {}
+     for row in params:
+         for name, raw in row.items():
+             values = model_params.setdefault(name, [])
+             value = _parse_param(raw)
+             if value not in values:
+                 values.append(value)
+ 
+     abstracs = get_abstracts(file_name=args.classA, label=labels['useless'])
+     abstracs += get_abstracts(file_name=args.classB, label=labels['useful'])
+ 
+     X = vectorizer.fit_transform([x['body'] for x in abstracs])
+     y = [x['topic'] for x in abstracs]
+ 
+     clf = GridSearchCV(SVC(), param_grid=model_params, cv=3, n_jobs=-1,
+                        scoring='f1')
+     start = time()
+     clf.fit(X, y)
+ 
+     # Report the number of candidates the grid search actually evaluated.
+     print("GridSearch took %.2f seconds for %d candidate"
+           " parameter settings." % ((time() - start),
+                                     len(clf.cv_results_['params'])))
+ 
+     print(clf.best_estimator_)
+     print(clf)
+     print(clf.best_score_)
+ 
+     joblib.dump(clf.best_estimator_, SVM_MODEL_PATH)
+     joblib.dump(vectorizer, TFIDF_MODEL_PATH)
+ 
+ elif args.input:
+     # --- Prediction mode -------------------------------------------------
+     # NOTE(review): joblib.load unpickles arbitrary objects; only point
+     # --svcmodel at files from a trusted source.
+     clf = joblib.load(args.svcmodel)
+     vectorizer = joblib.load(TFIDF_MODEL_PATH)
+     abstracs = get_abstracts(file_name=args.input, label='unknown')
+     # transform(), not fit_transform(): the vocabulary must stay the one the
+     # classifier was trained with.
+     X = vectorizer.transform([x['body'] for x in abstracs])
+     classes = clf.predict(X)
+ 
+     if not os.path.isdir(args.out):
+         os.makedirs(args.out)
+ 
+     # Integer target -> class name, used to name the output files.
+     class_names = {target: name for name, target in labels.items()}
+     with open(os.path.join(args.out, class_names[0] + ".out"), 'w') as f0, \
+             open(os.path.join(args.out, class_names[1] + ".out"), 'w') as f1:
+         for c, a in zip(classes, abstracs):
+             # Records without a parsed PMID are written with 'NA'.
+             record = "%s\t%s\n" % (a.get('pmid', 'NA'), a['body'])
+             if c == 0:
+                 f0.write(record)
+             elif c == 1:
+                 f1.write(record)
+ 
+ else:
+     parser.error("give either --input (predict) or both --classA and "
+                  "--classB (train)")
--- a/model/svm_model.pkl 0 → 100644
View file @cf814cd
+++ b/model/svm_model.pkl 0 → 100644
View file @cf814cd
--- a/model/tifidf_model.pkl 0 → 100644
View file @cf814cd
+++ b/model/tifidf_model.pkl 0 → 100644
View file @cf814cd
--- a/model_params.conf 0 → 100644
View file @cf814cd
+++ b/model_params.conf 0 → 100644
View file @cf814cd
+ kernel,degree,coef0,C,gamma
+ poly,3,0.2,300,0
+ poly,11,0.9,150,0
+ rbf,0,0.5,100,0.0001
+ linear,1,0.5,100,0.0
+ linear,1,1.5,100,0.0
+ linear,1,2.5,100,0.0
+ linear,1,3.5,100,0.0
+ linear,1,4.5,100,0.0
+ linear,1,1.5,150,0.0
+ linear,1,2.5,200,0.0
+ linear,1,3.5,300,0.0
+ linear,1,4.5,400,0.0
\ No newline at end of file