modified readme

iarroyof
Commit d4ced3342229375e6708bb982d8353261895632a d4ced334 1 parent cbc7767d
Showing 2 changed files with 111 additions and 3 deletions
README.md
filter_papers.py
--- a/README.md
View file @d4ced33
+++ b/README.md
View file @d4ced33
@@ -18,15 +18,15 @@ The main method follows the next pipeline:
 # Usage
-For filtering unknown anstracts run
+For filtering unknown abstracts run
 ```bash
 $ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt
 ```
 The predictions will be stored by default at `filter_output/`, unless a different directory is specified by means of the `--out` option. The default names containing the predicitons are 
-- filter_output/useful.out
+- `filter_output/useful.out`
-- filter_output/useless.out
+- `filter_output/useless.out`
 The format of each file is:
--- a/filter_papers.py 0 → 100644
View file @d4ced33
+++ b/filter_papers.py 0 → 100644
View file @d4ced33
+#from pdb import set_trace as st
+from sklearn.cross_validation import train_test_split as splitt
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV
+from sklearn import metrics
+from sklearn.svm import SVC
+import numpy as np
+import argparse
+import csv
+import os
+from sklearn.externals import joblib
+from time import time
+from scipy.stats import randint as sp_randint
+from scipy.stats import expon
+from sklearn.preprocessing import label_binarize
+from sklearn.datasets import load_files
+
+
+parser = argparse.ArgumentParser(
+    description="This script separates biomedical papers that"
+            "report data from biomedical experiments from those that do not.")
+parser.add_argument("--input", help="Input file containing the  to"
+                                "be predited.")
+parser.add_argument("--traind", help="Input directory containing the papers of"
+                                "two classes to be learned.")
+parser.add_argument("--out", help="Path to the output directory "
+                     "(default='./filter_output')", default="filter_output")
+parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
+        "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl")
+
+args = parser.parse_args()
+
+data=load_files(container_path=args.traind, encoding=None, 
+                                                    decode_error='replace')
+labels = data.target_names
+
+vectorizer = TfidfVectorizer(binary=True)
+print(vectorizer)
+
+if args.train and not args.input:
+    f0 = open("model_params.conf")
+    n_iter_search = 10
+    params = [p for p in csv.DictReader(f0)]
+    f0.close()
+    names = list(params[0].keys())
+    model_params = {n: [] for n in names}
+
+    for n in names:
+        for d in params:
+            for k in d:
+                if k == n:
+                    try:
+                        model_params[n].append(float(d[k]))
+                    except ValueError:
+                        model_params[n].append(d[k])
+
+    model_params = {k: list(set(model_params[k])) for k in model_params}
+    papers = data.data
+
+    tfidf_model = vectorizer.fit(papers)
+    X = vectorizer.transform(papers)
+    #y = [x['topic'] for x in abstracs]
+    y = data.target    
+
+    #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
+
+    clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
+    clf = GridSearchCV(clf, cv=3,
+        param_grid=model_params,
+    # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+                                 n_jobs=-1, scoring='f1')
+    start = time()
+    clf.fit(X, y)
+
+    #clf.fit(X_train, y_train)
+    print("GridSearch took %.2f seconds for %d candidates"
+      " parameter settings." % ((time() - start), n_iter_search))
+
+    print(clf.best_estimator_)
+    print()
+    print(clf.best_score_)
+    #print(metrics.f1_score(clf.predict(X_test), y_test))
+
+    #joblib.dump(clf, 'model/svm_model.pkl')
+    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
+    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
+
+else:
+
+    data=load_files(container_path=args.input, encoding=None,
+                                                    decode_error='replace')
+    clf = joblib.load(args.svcmodel)
+    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
+    papers = data.data
+    X = vectorizer.transform(papers)
+    classes = clf.predict(X)
+
+    if not os.path.exists(args.out):
+        os.makedirs(args.out)
+    # Writing predictions to output files
+    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
+                    open(args.out + "/" + labels[1] + ".out", 'w') as f1:
+        for c, a in zip(classes, papers):
+            if c == 0:
+                f0.write("%d\t%s\n" % (a['title'], a['body']))
+            elif c == 1:
+                f1.write("%d\t%s\n" % (a['title'], a['body']))