bioNLP-UNAM / useless
Authored by iarroyof, 2018-03-30 19:20:33 -0600
Commit d4ced3342229375e6708bb982d8353261895632a (d4ced334)
1 parent: cbc7767d

modified readme
Showing 2 changed files with 111 additions and 3 deletions:

- README.md
- filter_papers.py
README.md (view file @ d4ced33)

````diff
@@ -18,15 +18,15 @@ The main method follows the next pipeline:
 # Usage
-For filtering unknown anstracts run
+For filtering unknown abstracts run
 ```bash
 $ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt
 ```
 The predictions will be stored by default at `filter_output/`, unless a different directory is specified by means of the `--out` option. The default files containing the predictions are
-- filter_output/useful.out
-- filter_output/useless.out
+- `filter_output/useful.out`
+- `filter_output/useless.out`
 The format of each file is:
 ...
````
filter_papers.py (new file: 0 → 100644; view file @ d4ced33)
```python
# from pdb import set_trace as st
# NOTE: sklearn.cross_validation is deprecated; train_test_split lives in
# sklearn.model_selection (it is only used in a commented-out line below).
from sklearn.model_selection import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import argparse
import csv
import os
from sklearn.externals import joblib
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import expon
from sklearn.preprocessing import label_binarize
from sklearn.datasets import load_files

parser = argparse.ArgumentParser(description="This script separates biomedical papers that "
                                             "report data from biomedical experiments "
                                             "from those that do not.")
parser.add_argument("--input", help="Input directory containing the papers "
                                    "to be predicted.")
parser.add_argument("--traind", help="Input directory containing the papers of "
                                     "two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                                  "(default='./filter_output').",
                    default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained SVC model "
                                       "(default='./model/svm_model.paper.pkl').",
                    default="model/svm_model.paper.pkl")
args = parser.parse_args()

# The training directory is loaded in both modes: its subdirectory names
# provide the class labels that name the output files.
data = load_files(container_path=args.traind, encoding='utf-8',
                  decode_error='replace')  # decode to str (decode_error is a no-op with encoding=None)
labels = data.target_names
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.traind and not args.input:  # training mode (was `args.train`, which the parser never defines)
    # Read the hyperparameter grid from a CSV file whose header row holds the
    # SVC parameter names and whose rows hold candidate values.
    f0 = open("model_params.conf")
    n_iter_search = 10
    params = [p for p in csv.DictReader(f0)]
    f0.close()
    names = list(params[0].keys())
    model_params = {n: [] for n in names}
    for n in names:
        for d in params:
            for k in d:
                if k == n:
                    try:
                        model_params[n].append(float(d[k]))
                    except ValueError:
                        model_params[n].append(d[k])
    # Deduplicate the candidate values of each parameter.
    model_params = {k: list(set(model_params[k])) for k in model_params}

    papers = data.data
    tfidf_model = vectorizer.fit(papers)
    X = vectorizer.transform(papers)
    # y = [x['topic'] for x in abstracts]
    y = data.target
    # X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
    clf = SVC()  # kernel='linear', C=100.0, gamma=0.0001)  # degree=11, coef0=0.9)
    clf = GridSearchCV(clf, cv=3, param_grid=model_params,
                       # clf = RandomizedSearchCV(clf, param_distributions=model_params,
                       #                          cv=5, n_iter=n_iter_search,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)
    # clf.fit(X_train, y_train)
    # (The original printed n_iter_search, which only applies to RandomizedSearchCV.)
    print("GridSearch took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(clf.cv_results_['params'])))
    print(clf.best_estimator_)
    print()
    print(clf.best_score_)
    # print(metrics.f1_score(clf.predict(X_test), y_test))
    # joblib.dump(clf, 'model/svm_model.pkl')
    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
else:  # prediction mode
    data = load_files(container_path=args.input, encoding='utf-8',
                      decode_error='replace')
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
    papers = data.data
    X = vectorizer.transform(papers)
    classes = clf.predict(X)
    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Write each paper into the output file named after its predicted class.
    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
        for c, a in zip(classes, papers):
            # load_files yields plain document strings, so write the text
            # directly (the original's `a['title']`/`a['body']` indexing
            # fails on strings).
            if c == 0:
                f0.write("%s\n" % a)
            elif c == 1:
                f1.write("%s\n" % a)
```
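The training branch expects a `model_params.conf` file that is not part of this commit. From the way the script parses it (`csv.DictReader`, collecting each column into a list of candidate values for `GridSearchCV`), a plausible minimal sketch is the following; the parameter names and values here are assumptions for illustration, and any keyword accepted by `SVC` should work:

```
kernel,C,gamma
linear,1.0,0.001
rbf,10.0,0.0001
```

This would produce a grid equivalent to `{'kernel': ['linear', 'rbf'], 'C': [1.0, 10.0], 'gamma': [0.001, 0.0001]}` (up to ordering, since the values are deduplicated through a set). For reference, a hedged usage sketch of both modes; the directory layout is hypothetical apart from the `data/ecoli_abstracts` path mentioned in the README, and note that `--traind` is needed even at prediction time because the class labels that name the output files come from its subdirectories:

```bash
# Training mode: grid-search an SVC on the labeled corpus and save the best model
$ python filter_papers.py --traind data/ecoli_abstracts

# Prediction mode: classify unseen papers and write one output file per class
$ python filter_papers.py --traind data/ecoli_abstracts --input data/unlabeled_papers --out filter_output
```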