testing pretrained model

Ignacio Arroyo
Commit cf814cd82081f2dde279dfa53c3275712782f312 cf814cd8 1 parent e2cb0d9e
Showing 5 changed files with 186 additions and 22 deletions
classify_abstracts.py
filter_abstracts.py
model/svm_model.pkl
model/tifidf_model.pkl
model_params.conf
--- a/classify_abstracts.py
View file @cf814cd
+++ b/classify_abstracts.py
View file @cf814cd
@@ -13,14 +13,10 @@ from sklearn import metrics
 from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                               AdaBoostClassifier, GradientBoostingClassifier)
 from sklearn.grid_search import GridSearchCV
+from sklearn.externals import joblib
 import pandas as pd
 from numpy import mean, std
-#Classifier = KNeighborsClassifier # 0.6464
-#Classifier = NearestCentroid # 0.5054
-#Classifier = RandomForestClassifier # 0.49
-#Classifier = LinearSVC # 0.5402
-#Classifier = SGDClassifier # 0.664
 class EstimatorSelectionHelper:
     "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/"
@@ -71,7 +67,7 @@ def get_abstracts(file_name, label):
     docs = []
     empties = []
     lines = f.readlines()
-    copyright = False
+    cpright = False
     for i, ln in enumerate(lines):
         if not ln.strip():
@@ -84,8 +80,8 @@ def get_abstracts(file_name, label):
                     break
             continue
-        elif 'Copyright ' in ln:
+        # The marker searched for in the text is still the literal
+        # 'Copyright '; only the local flag variable was renamed.
+        elif 'Copyright ' in ln:
-            copyright = True
+            cpright = True
         elif 'DOI: ' in ln:
             if 'PMCID: ' in lines[i + 1]:
@@ -93,9 +89,9 @@ def get_abstracts(file_name, label):
             elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
                 extract['pmid'] = int(lines[i + 1].strip().split()[1])
-            if copyright:
+            if cpright:
                 get = slice(empties[-3], empties[-2])
-                copyright = False
+                cpright = False
             else:
                 get = slice(empties[-2], empties[-1])
@@ -115,14 +111,14 @@ def get_abstracts(file_name, label):
     return docs
-filename="data/ecoli_abstracts/not_useful_abstracts.txt"
+filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
 labels = ['useless', 'useful']
-abstracs = get_abstracts(file_name = filename, label = labels[0])
+abstracs = get_abstracts(file_name=filename, label=labels[0])
-filename="data/ecoli_abstracts/useful_abstracts.txt"
+filename = "data/ecoli_abstracts/useful_abstracts.txt"
-abstracs += get_abstracts(file_name = filename, label = labels[1])
+abstracs += get_abstracts(file_name=filename, label=labels[1])
 X = [x['body'] for x in abstracs]
 y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]
@@ -136,15 +132,17 @@ models1 = {
 }
 params1 = {
-    'ExtraTreesClassifier': { 'n_estimators': [16, 32] },
+    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
-    'RandomForestClassifier': { 'n_estimators': [16, 32] },
+    'RandomForestClassifier': {'n_estimators': [16, 32]},
-    'AdaBoostClassifier':  { 'n_estimators': [16, 32] },
+    'AdaBoostClassifier': {'n_estimators': [16, 32]},
-    'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] },
+    'GradientBoostingClassifier': {'n_estimators': [16, 32],
+                                    'learning_rate': [0.8, 1.0]},
     'SVC': [
-        #{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]},
+        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
-        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
+        'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
-        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400], 'degree': [2, 3, 4, 5, 6]},
+        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
-        {'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]},
+            'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
+            'coef0': [0.1, 0.2,0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
     ]
 }
@@ -171,6 +169,9 @@ X = vectorizer.fit_transform(X)
 #st()
 clf.fit(X, y, scoring='f1', n_jobs=-1)
+# NOTE(review): `clf` looks like an EstimatorSelectionHelper here (it is fitted
+# with scoring/n_jobs and exposes score_summary); confirm that it really
+# provides `best_estimator_` before relying on this dump.
+joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+# Persist the fitted TF-IDF vectorizer, not the classifier a second time, so
+# that new text can be mapped into the same feature space later.
+joblib.dump(vectorizer, 'model/tifidf_model.pkl')
+
 #pred = clf.predict(X_test)
 #print(metrics.f1_score(y_test, pred, average='macro'))
 print(clf.score_summary(sort_by='min_score'))
--- a/filter_abstracts.py 0 → 100644
View file @cf814cd
+++ b/filter_abstracts.py 0 → 100644
View file @cf814cd
+import argparse
+import csv
+import os
+from pdb import set_trace as st
+from time import time
+
+import numpy as np
+from scipy.stats import expon
+from scipy.stats import randint as sp_randint
+from sklearn import metrics
+from sklearn.cross_validation import train_test_split as splitt
+from sklearn.externals import joblib
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import GridSearchCV
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.svm import SVC
+
+
+def get_abstracts(file_name, label):
+    """Extract abstract records from a text file of bibliographic entries.
+
+    The parser is driven by textual markers (presumably a PubMed text
+    export -- confirm against the data files):
+
+    * a line containing ' doi: ' marks the citation line; the title starts
+      right after the first blank line found within the next 10 lines;
+    * a line containing 'Copyright ' flags that a copyright paragraph
+      follows the abstract body;
+    * a line containing 'DOI: ' closes the record and triggers extraction.
+
+    Args:
+        file_name: Path of the text file to parse.
+        label: Value stored under the 'topic' key of every record.
+
+    Returns:
+        A list of dicts with the keys 'body', 'title' and 'topic', plus
+        'pmid' (int) when a 'PMID: ' line follows the 'DOI: ' line.  The
+        title is '' when no citation line preceded the record.
+
+    Raises:
+        IndexError: If a record is closed before enough blank lines were
+            seen to delimit its body.
+        ValueError: If the token following 'PMID:' is not an integer.
+    """
+    # Read the whole file up front (the parser looks both ahead and behind)
+    # and close the handle right away.
+    with open(file_name) as f:
+        lines = f.readlines()
+
+    n_lines = len(lines)
+    extract = {}
+    docs = []
+    empties = []        # indices of the blank lines seen in this record
+    title_idx = None    # first title line of the current record, if known
+    has_copyright = False
+
+    for i, ln in enumerate(lines):
+        if not ln.strip():
+            empties.append(i)
+            continue
+        elif ' doi: ' in ln:
+            # Bounded look-ahead so a citation near the end of the file
+            # cannot index past the last line.
+            for j in range(i, min(i + 10, n_lines)):
+                if not lines[j].strip():
+                    title_idx = j + 1
+                    break
+            continue
+
+        elif 'Copyright ' in ln:
+            has_copyright = True
+
+        elif 'DOI: ' in ln:
+            next_ln = lines[i + 1] if i + 1 < n_lines else ''
+            if 'PMCID: ' in next_ln:
+                # With a PMCID line present the PMID is one line further.
+                pmid_ln = lines[i + 2] if i + 2 < n_lines else ''
+            else:
+                pmid_ln = next_ln
+            if 'PMID: ' in pmid_ln:
+                extract['pmid'] = int(pmid_ln.strip().split()[1])
+
+            if has_copyright:
+                # A copyright paragraph sits between the body and the DOI
+                # line, so the body is one blank-delimited paragraph earlier.
+                get = slice(empties[-3], empties[-2])
+                has_copyright = False
+            else:
+                get = slice(empties[-2], empties[-1])
+
+            extract['body'] = " ".join(lines[get]).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            title = []
+            if title_idx is not None:
+                # The title spans at most 5 lines and ends at a blank line.
+                for j in range(title_idx, min(title_idx + 5, n_lines)):
+                    if lines[j].strip():
+                        title.append(lines[j])
+                    else:
+                        break
+            extract['title'] = " ".join(title).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            extract['topic'] = label
+            docs.append(extract)
+            # Reset per-record state so nothing (in particular the title
+            # position) leaks into the next record.
+            empties = []
+            extract = {}
+            title_idx = None
+
+    return docs
+
+
+# Command-line interface.  Adjacent string literals are concatenated without
+# a separator, so every continued fragment ends with an explicit space.
+parser = argparse.ArgumentParser(
+    description="This script separates abstracts of biomedical papers that "
+            "report data from biomedical experiments from those that do not.")
+parser.add_argument("--input", help="Input file containing the abstracts to "
+                                "be predicted.")
+parser.add_argument("--classA", help="Input file containing the abstracts of "
+                                "class A to be learned.")
+parser.add_argument("--classB", help="Input file containing the abstracts of "
+                                "class B to be learned.")
+parser.add_argument("--out", help="Path to the output directory "
+                     "(default='./filter_output')", default="filter_output")
+parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
+        "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
+
+args = parser.parse_args()
+
+# Class name -> integer code used as the training target.
+labels = {'useless': 0, 'useful': 1}
+vectorizer = TfidfVectorizer(binary=True)
+print(vectorizer)
+
+# Training mode requires both labelled corpora; giving --input always selects
+# prediction mode instead.
+if args.classA and args.classB and not args.input:
+    # Every column of model_params.conf becomes one hyper-parameter axis.
+    with open("model_params.conf") as f0:
+        params = list(csv.DictReader(f0))
+
+    # NOTE(review): GridSearchCV explores the Cartesian product of these
+    # per-column value lists, not one candidate per CSV row -- confirm that
+    # this is the intended reading of the configuration file.
+    model_params = {}
+    for row in params:
+        for name, raw in row.items():
+            try:
+                value = float(raw)
+            except ValueError:
+                value = raw
+            values = model_params.setdefault(name, [])
+            # Repeated values would only make the search re-evaluate
+            # identical candidates, so keep the first occurrence only.
+            if value not in values:
+                values.append(value)
+
+    abstracs = get_abstracts(file_name=args.classA, label=labels['useless'])
+    abstracs += get_abstracts(file_name=args.classB, label=labels['useful'])
+
+    X = vectorizer.fit_transform([x['body'] for x in abstracs])
+    y = [x['topic'] for x in abstracs]
+
+    clf = GridSearchCV(SVC(), param_grid=model_params, cv=3,
+                       n_jobs=-1, scoring='f1')
+    start = time()
+    clf.fit(X, y)
+
+    # Report the number of candidates that were actually evaluated.
+    print("GridSearch took %.2f seconds for %d candidates"
+          " parameter settings." % ((time() - start),
+                                    len(clf.cv_results_['params'])))
+
+    print(clf.best_estimator_)
+    print(clf)
+    print(clf.best_score_)
+
+    # Persist both halves of the pipeline: the tuned classifier and the
+    # vectorizer that defines its feature space.
+    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+    joblib.dump(vectorizer, 'model/tifidf_model.pkl')
+elif args.input:
+    # NOTE: joblib files are pickles and can run arbitrary code when loaded;
+    # only load model files from a trusted source.
+    clf = joblib.load(args.svcmodel)
+    # Same file name that training mode writes.
+    vectorizer = joblib.load('model/tifidf_model.pkl')
+    abstracs = get_abstracts(file_name=args.input, label='unknown')
+    # transform(), not fit_transform(): refitting would replace the learned
+    # vocabulary and break the feature layout the classifier was trained on.
+    X = vectorizer.transform([x['body'] for x in abstracs])
+    classes = clf.predict(X)
+
+    if not os.path.isdir(args.out):
+        os.makedirs(args.out)
+    # Invert the name -> code mapping to name the per-class output files.
+    names = {code: name for name, code in labels.items()}
+    with open(os.path.join(args.out, names[0] + ".out"), 'w') as f0, \
+            open(os.path.join(args.out, names[1] + ".out"), 'w') as f1:
+        for c, a in zip(classes, abstracs):
+            if c == 0:
+                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
+            elif c == 1:
+                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
+else:
+    parser.error("provide --input to classify abstracts, or both --classA "
+                 "and --classB to train a model")
--- a/model/svm_model.pkl 0 → 100644
View file @cf814cd
+++ b/model/svm_model.pkl 0 → 100644
View file @cf814cd
--- a/model/tifidf_model.pkl 0 → 100644
View file @cf814cd
+++ b/model/tifidf_model.pkl 0 → 100644
View file @cf814cd
--- a/model_params.conf 0 → 100644
View file @cf814cd
+++ b/model_params.conf 0 → 100644
View file @cf814cd
+kernel,degree,coef0,C,gamma
+poly,3,0.2,300,0
+poly,11,0.9,150,0
+rbf,0,0.5,100,0.0001
+linear,1,0.5,100,0.0
+linear,1,1.5,100,0.0
+linear,1,2.5,100,0.0
+linear,1,3.5,100,0.0
+linear,1,4.5,100,0.0
+linear,1,1.5,150,0.0
+linear,1,2.5,200,0.0
+linear,1,3.5,300,0.0
+linear,1,4.5,400,0.0
\ No newline at end of file