Showing
2 changed files
with
111 additions
and
3 deletions
... | @@ -18,15 +18,15 @@ The main method follows the next pipeline: | ... | @@ -18,15 +18,15 @@ The main method follows the next pipeline: |
18 | 18 | ||
19 | # Usage | 19 | # Usage |
20 | 20 | ||
21 | -For filtering unknown anstracts run | 21 | +For filtering unknown abstracts run |
22 | 22 | ||
23 | ```bash | 23 | ```bash |
24 | $ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt | 24 | $ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt |
25 | ``` | 25 | ``` |
26 | The predictions will be stored by default at `filter_output/`, unless a different directory is specified by means of the `--out` option. The default names of the files containing the predictions are | 26 | The predictions will be stored by default at `filter_output/`, unless a different directory is specified by means of the `--out` option. The default names of the files containing the predictions are |
27 | 27 | ||
28 | -- filter_output/useful.out | 28 | +- `filter_output/useful.out` |
29 | -- filter_output/useless.out | 29 | +- `filter_output/useless.out` |
30 | 30 | ||
31 | The format of each file is: | 31 | The format of each file is: |
32 | 32 | ... | ... |
filter_papers.py
0 → 100644
"""Separate biomedical papers that report experimental data from those that do not.

The script has two modes, selected by the command-line options:

* Training (``--traind`` given, ``--input`` omitted): the papers under
  ``--traind`` are vectorized with a binary TF-IDF model, an SVC is tuned by
  grid search over the values listed in ``model_params.conf`` and both
  models are saved under ``model/``.
* Prediction (``--input`` given): the saved models are loaded, the papers
  under ``--input`` are classified and written to ``<out>/<label>.out``,
  one file per class.

Both ``--traind`` and ``--input`` are read with
``sklearn.datasets.load_files``. ``--traind`` is needed in both modes
because its class names are used to name the output files.
"""
from collections import OrderedDict
import argparse
import csv
import io
import os
from time import time

import numpy as np
from scipy.stats import expon
from scipy.stats import randint as sp_randint
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
# sklearn.cross_validation was removed from scikit-learn; model_selection
# (already required by the imports above) provides the same function.
from sklearn.model_selection import train_test_split as splitt
from sklearn.preprocessing import label_binarize
from sklearn.svm import SVC

try:
    import joblib
except ImportError:
    # Older scikit-learn releases only ship joblib as a vendored module.
    from sklearn.externals import joblib


parser = argparse.ArgumentParser(
    description="This script separates biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input directory containing the papers to "
                                    "be predicted.")
# Required: the class names found here also name the prediction files.
parser.add_argument("--traind", required=True,
                    help="Input directory containing the papers of "
                         "two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                                  "(default='./filter_output')",
                    default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                                       "(default='./model/svm_model.paper.pkl')",
                    default="model/svm_model.paper.pkl")

args = parser.parse_args()

# An explicit encoding is needed for decode_error to take effect; with
# encoding=None load_files returns raw bytes and ignores decode_error.
data = load_files(container_path=args.traind, encoding='utf-8',
                  decode_error='replace')
labels = data.target_names

vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.traind and not args.input:
    # ---- Training mode ----
    # model_params.conf is a CSV file: the header names the SVC parameters
    # and every row contributes one candidate value per column.
    with open("model_params.conf") as conf:
        reader = csv.DictReader(conf)
        columns = reader.fieldnames or []
        rows = list(reader)
    if not columns or not rows:
        raise SystemExit("model_params.conf defines no parameter values")

    model_params = {}
    for name in columns:
        # OrderedDict keys drop duplicates while keeping file order, so the
        # grid (and tie-breaking between equal scores) is deterministic.
        values = OrderedDict()
        for row in rows:
            cell = row.get(name)
            if cell is None or not cell.strip():
                # Short rows / empty cells carry no value for this parameter.
                continue
            try:
                values[float(cell)] = None
            except ValueError:
                # Non-numeric settings (e.g. a kernel name) stay as strings.
                values[cell] = None
        model_params[name] = list(values)

    papers = data.data
    X = vectorizer.fit_transform(papers)
    y = data.target

    clf = GridSearchCV(SVC(), param_grid=model_params, cv=3,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    print("GridSearch took %.2f seconds for %d candidate"
          " parameter settings."
          % ((time() - start), len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    if not os.path.isdir('model'):
        os.makedirs('model')
    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
    joblib.dump(vectorizer, 'model/tfidf_model.paper.pkl')

else:
    # ---- Prediction mode ----
    data = load_files(container_path=args.input, encoding='utf-8',
                      decode_error='replace')
    # NOTE(security): joblib.load unpickles the file and can execute
    # arbitrary code; only load model files from trusted sources.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
    papers = data.data
    X = vectorizer.transform(papers)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files: one "<path>\t<text>" line per
    # paper, in the file named after the predicted class.
    with io.open(os.path.join(args.out, labels[0] + ".out"), 'w',
                 encoding='utf-8') as f0, \
            io.open(os.path.join(args.out, labels[1] + ".out"), 'w',
                    encoding='utf-8') as f1:
        sinks = {0: f0, 1: f1}
        for c, path, text in zip(classes, data.filenames, papers):
            sink = sinks.get(int(c))
            if sink is None:
                continue
            # Collapse all whitespace so each record stays on a single line.
            sink.write(u"%s\t%s\n" % (path, u" ".join(text.split())))
-
Please register or login to post a comment