iarroyof

modified readme

@@ -18,15 +18,15 @@ The main method follows the next pipeline:
 
 # Usage
 
-For filtering unknown anstracts run
+For filtering unknown abstracts run
 
 ```bash
 $ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt
 ```
 The predictions will be stored by default at `filter_output/`, unless a different directory is specified by means of the `--out` option. The default files containing the predictions are
 
-- filter_output/useful.out
-- filter_output/useless.out
+- `filter_output/useful.out`
+- `filter_output/useless.out`
 
 The format of each file is:
 
...
from sklearn.model_selection import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import argparse
import csv
import os
import joblib
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import expon
from sklearn.preprocessing import label_binarize
from sklearn.datasets import load_files


parser = argparse.ArgumentParser(
    description="This script separates biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                    "be predicted.")
parser.add_argument("--traind", help="Input directory containing the papers of "
                    "two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                    "(default='./model/svm_model.paper.pkl')",
                    default="model/svm_model.paper.pkl")

args = parser.parse_args()
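
# Example invocations (an illustrative sketch: the README's usage section
# suggests this file is filter_abstracts.py; paths follow its examples):
#   train:   python filter_abstracts.py --traind data/ecoli_abstracts
#   predict: python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt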

# load_files expects a directory with one sub-directory per class; guard the
# call so prediction-only runs (no --traind) do not crash on a None path.
if args.traind:
    data = load_files(container_path=args.traind, encoding=None,
                      decode_error='replace')
    labels = data.target_names
else:
    # Assumption: fall back to the class names behind the README's default
    # output files, filter_output/useful.out and filter_output/useless.out.
    labels = ['useful', 'useless']

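# Assumed layout of the --traind directory (the usual load_files convention;
# directory names are illustrative, matching the README's output names):
#   data/ecoli_abstracts/
#       useful/    <- abstracts that report experimental data
#       useless/   <- abstracts that do not
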
# binary=True binarises raw term counts; idf weighting is still applied.
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.traind and not args.input:
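    # model_params.conf is read below as a CSV whose header row names SVC
    # hyper-parameters; an assumed illustrative layout (not shipped here):
    #   kernel,C,gamma
    #   linear,1.0,0.001
    #   rbf,100.0,0.0001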
    f0 = open("model_params.conf")
    n_iter_search = 10
    params = [p for p in csv.DictReader(f0)]
    f0.close()
    names = list(params[0].keys())
    model_params = {n: [] for n in names}

    for n in names:
        for d in params:
            for k in d:
                if k == n:
                    try:
                        model_params[n].append(float(d[k]))
                    except ValueError:
                        model_params[n].append(d[k])

    model_params = {k: list(set(model_params[k])) for k in model_params}
    papers = data.data

    tfidf_model = vectorizer.fit(papers)
    X = vectorizer.transform(papers)
    #y = [x['topic'] for x in abstracs]
    y = data.target

    #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)

    clf = SVC()  # e.g. kernel='linear', C=100.0, gamma=0.0001, degree=11, coef0=0.9
    # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5,
    #                          n_iter=n_iter_search, n_jobs=-1, scoring='f1')
    clf = GridSearchCV(clf, cv=3,
                       param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    #clf.fit(X_train, y_train)
    print("GridSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)
    #print(metrics.f1_score(clf.predict(X_test), y_test))

    # Persist both the classifier and the fitted TF-IDF model for prediction runs.
    if not os.path.exists('model'):
        os.makedirs('model')
    #joblib.dump(clf, 'model/svm_model.pkl')
    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')

else:

    # Note: load_files also expects a directory here, laid out like the
    # training data, so --input should point to a directory of documents.
    data = load_files(container_path=args.input, encoding=None,
                      decode_error='replace')
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
    papers = data.data
    X = vectorizer.transform(papers)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
         open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, papers):
            # load_files returns raw (possibly bytes) document contents; the
            # earlier dict-style 'title'/'body' fields no longer exist here.
            text = a.decode('utf-8', 'replace') if isinstance(a, bytes) else a
            if c == 0:
                f0.write("%s\n" % text)
            elif c == 1:
                f1.write("%s\n" % text)