Showing
5 changed files
with
186 additions
and
22 deletions
... | @@ -13,14 +13,10 @@ from sklearn import metrics | ... | @@ -13,14 +13,10 @@ from sklearn import metrics |
13 | from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, | 13 | from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, |
14 | AdaBoostClassifier, GradientBoostingClassifier) | 14 | AdaBoostClassifier, GradientBoostingClassifier) |
15 | from sklearn.grid_search import GridSearchCV | 15 | from sklearn.grid_search import GridSearchCV |
16 | +from sklearn.externals import joblib | ||
16 | import pandas as pd | 17 | import pandas as pd |
17 | from numpy import mean, std | 18 | from numpy import mean, std |
18 | 19 | ||
19 | -#Classifier = KNeighborsClassifier # 0.6464 | ||
20 | -#Classifier = NearestCentroid # 0.5054 | ||
21 | -#Classifier = RandomForestClassifier # 0.49 | ||
22 | -#Classifier = LinearSVC # 0.5402 | ||
23 | -#Classifier = SGDClassifier # 0.664 | ||
24 | 20 | ||
25 | class EstimatorSelectionHelper: | 21 | class EstimatorSelectionHelper: |
26 | "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/" | 22 | "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/" |
... | @@ -71,7 +67,7 @@ def get_abstracts(file_name, label): | ... | @@ -71,7 +67,7 @@ def get_abstracts(file_name, label): |
71 | docs = [] | 67 | docs = [] |
72 | empties = [] | 68 | empties = [] |
73 | lines = f.readlines() | 69 | lines = f.readlines() |
74 | - copyright = False | 70 | + cpright = False |
75 | 71 | ||
76 | for i, ln in enumerate(lines): | 72 | for i, ln in enumerate(lines): |
77 | if not ln.strip(): | 73 | if not ln.strip(): |
... | @@ -84,8 +80,8 @@ def get_abstracts(file_name, label): | ... | @@ -84,8 +80,8 @@ def get_abstracts(file_name, label): |
84 | break | 80 | break |
85 | continue | 81 | continue |
86 | 82 | ||
87 | - elif 'Copyright ' in ln: | 83 | + elif 'cpright ' in ln: |
88 | - copyright = True | 84 | + cpright = True |
89 | 85 | ||
90 | elif 'DOI: ' in ln: | 86 | elif 'DOI: ' in ln: |
91 | if 'PMCID: ' in lines[i + 1]: | 87 | if 'PMCID: ' in lines[i + 1]: |
... | @@ -93,9 +89,9 @@ def get_abstracts(file_name, label): | ... | @@ -93,9 +89,9 @@ def get_abstracts(file_name, label): |
93 | elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]: | 89 | elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]: |
94 | extract['pmid'] = int(lines[i + 1].strip().split()[1]) | 90 | extract['pmid'] = int(lines[i + 1].strip().split()[1]) |
95 | 91 | ||
96 | - if copyright: | 92 | + if cpright: |
97 | get = slice(empties[-3], empties[-2]) | 93 | get = slice(empties[-3], empties[-2]) |
98 | - copyright = False | 94 | + cpright = False |
99 | else: | 95 | else: |
100 | get = slice(empties[-2], empties[-1]) | 96 | get = slice(empties[-2], empties[-1]) |
101 | 97 | ||
... | @@ -115,14 +111,14 @@ def get_abstracts(file_name, label): | ... | @@ -115,14 +111,14 @@ def get_abstracts(file_name, label): |
115 | return docs | 111 | return docs |
116 | 112 | ||
117 | 113 | ||
118 | -filename="data/ecoli_abstracts/not_useful_abstracts.txt" | 114 | +filename = "data/ecoli_abstracts/not_useful_abstracts.txt" |
119 | labels = ['useless', 'useful'] | 115 | labels = ['useless', 'useful'] |
120 | 116 | ||
121 | -abstracs = get_abstracts(file_name = filename, label = labels[0]) | 117 | +abstracs = get_abstracts(file_name=filename, label=labels[0]) |
122 | 118 | ||
123 | -filename="data/ecoli_abstracts/useful_abstracts.txt" | 119 | +filename = "data/ecoli_abstracts/useful_abstracts.txt" |
124 | 120 | ||
125 | -abstracs += get_abstracts(file_name = filename, label = labels[1]) | 121 | +abstracs += get_abstracts(file_name=filename, label=labels[1]) |
126 | 122 | ||
127 | X = [x['body'] for x in abstracs] | 123 | X = [x['body'] for x in abstracs] |
128 | y = [1 if x['topic'] == 'useful' else 0 for x in abstracs] | 124 | y = [1 if x['topic'] == 'useful' else 0 for x in abstracs] |
... | @@ -136,15 +132,17 @@ models1 = { | ... | @@ -136,15 +132,17 @@ models1 = { |
136 | } | 132 | } |
137 | 133 | ||
138 | params1 = { | 134 | params1 = { |
139 | - 'ExtraTreesClassifier': { 'n_estimators': [16, 32] }, | 135 | + 'ExtraTreesClassifier': {'n_estimators': [16, 32]}, |
140 | - 'RandomForestClassifier': { 'n_estimators': [16, 32] }, | 136 | + 'RandomForestClassifier': {'n_estimators': [16, 32]}, |
141 | - 'AdaBoostClassifier': { 'n_estimators': [16, 32] }, | 137 | + 'AdaBoostClassifier': {'n_estimators': [16, 32]}, |
142 | - 'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] }, | 138 | + 'GradientBoostingClassifier': {'n_estimators': [16, 32], |
139 | + 'learning_rate': [0.8, 1.0]}, | ||
143 | 'SVC': [ | 140 | 'SVC': [ |
144 | - #{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]}, | 141 | + {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400], |
145 | - {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]}, | 142 | + 'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]}, |
146 | - {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400], 'degree': [2, 3, 4, 5, 6]}, | 143 | + {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400], |
147 | - {'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]}, | 144 | + 'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26], |
145 | + 'coef0': [0.1, 0.2,0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]} | ||
148 | ] | 146 | ] |
149 | } | 147 | } |
150 | 148 | ||
... | @@ -171,6 +169,9 @@ X = vectorizer.fit_transform(X) | ... | @@ -171,6 +169,9 @@ X = vectorizer.fit_transform(X) |
171 | #st() | 169 | #st() |
172 | clf.fit(X, y, scoring='f1', n_jobs=-1) | 170 | clf.fit(X, y, scoring='f1', n_jobs=-1) |
173 | 171 | ||
172 | +joblib.dump(clf.best_estimator_, 'model/svm_model.pkl') | ||
173 | +joblib.dump(clf.best_estimator_, 'model/tifidf_model.pkl') | ||
174 | + | ||
174 | #pred = clf.predict(X_test) | 175 | #pred = clf.predict(X_test) |
175 | #print(metrics.f1_score(y_test, pred, average='macro')) | 176 | #print(metrics.f1_score(y_test, pred, average='macro')) |
176 | print(clf.score_summary(sort_by='min_score')) | 177 | print(clf.score_summary(sort_by='min_score')) | ... | ... |
filter_abstracts.py
0 → 100644
1 | +from pdb import set_trace as st | ||
2 | +from sklearn.cross_validation import train_test_split as splitt | ||
3 | +from sklearn.feature_extraction.text import TfidfVectorizer | ||
4 | +from sklearn.model_selection import RandomizedSearchCV | ||
5 | +from sklearn.model_selection import GridSearchCV | ||
6 | +from sklearn import metrics | ||
7 | +from sklearn.svm import SVC | ||
8 | +import numpy as np | ||
9 | +import argparse | ||
10 | +import csv | ||
11 | +from sklearn.externals import joblib | ||
12 | +from time import time | ||
13 | +from scipy.stats import randint as sp_randint | ||
14 | +from scipy.stats import expon | ||
15 | + | ||
16 | + | ||
def _collapse(chunk):
    """Join raw file lines into one string with line breaks turned into spaces.

    Joining newline-terminated lines with a space and then replacing each
    newline leaves two spaces at every line boundary; the second replace
    collapses those pairs back to a single space.
    """
    return " ".join(chunk).replace("\n", " ").replace("  ", " ")


def _find_pmid(following):
    """Return the PMID found in the lines right after a 'DOI: ' line, or None.

    ``following`` holds at most the two lines after the 'DOI: ' line. When the
    first one is a 'PMCID: ' line the PMID is looked for on the second one.
    """
    if not following:
        return None
    if 'PMCID: ' in following[0]:
        candidate = following[1] if len(following) > 1 else ''
    else:
        candidate = following[0]
    if 'PMID: ' not in candidate:
        return None
    # The identifier is the second whitespace-separated token of the line.
    return int(candidate.strip().split()[1])


def _title_lines(lines, title_idx):
    """Return up to five consecutive non-blank lines starting at ``title_idx``."""
    if title_idx is None:
        return []
    title = []
    for ln in lines[title_idx:title_idx + 5]:
        if not ln.strip():
            break
        title.append(ln)
    return title


def get_abstracts(file_name, label):
    """Parse a plain-text file of abstracts into a list of record dicts.

    The parser is line-oriented and driven by marker substrings (presumably a
    PubMed text export -- TODO confirm against the data files):

    * a line containing ' doi: ' opens a record; the title starts right after
      the first blank line found within the next ten lines;
    * a line containing 'Copyright ' means the paragraph before the closing
      'DOI: ' line is a copyright notice, so the body is one paragraph
      further up;
    * a line containing 'DOI: ' closes the record.

    Args:
        file_name: Path of the text file to read.
        label: Value stored unchanged under the 'topic' key of every record.

    Returns:
        A list of dicts with the keys 'body', 'title' and 'topic', plus
        'pmid' (int) when a 'PMID: ' line follows the 'DOI: ' line. 'title'
        is an empty string when no ' doi: ' line preceded the record.

    Raises:
        IndexError: if a 'DOI: ' line is reached before enough blank lines
            were seen to delimit the body paragraph.
        ValueError: if the token after 'PMID:' is not an integer.
    """
    # Read everything up front so the handle is released before parsing.
    with open(file_name) as f:
        lines = f.readlines()

    docs = []
    extract = {}
    empties = []           # indices of blank lines seen in the current record
    title_idx = None       # index of the first title line, once known
    has_copyright = False

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)

        elif ' doi: ' in ln:
            # Slicing keeps the look-ahead inside the file near its end.
            for j, nxt in enumerate(lines[i:i + 10], i):
                if not nxt.strip():
                    title_idx = j + 1
                    break

        elif 'Copyright ' in ln:
            has_copyright = True

        elif 'DOI: ' in ln:
            pmid = _find_pmid(lines[i + 1:i + 3])
            if pmid is not None:
                extract['pmid'] = pmid

            if has_copyright:
                # Skip the copyright paragraph: the body is the one before it.
                get = slice(empties[-3], empties[-2])
                has_copyright = False
            else:
                get = slice(empties[-2], empties[-1])

            extract['body'] = _collapse(lines[get])
            extract['title'] = _collapse(_title_lines(lines, title_idx))
            extract['topic'] = label
            docs.append(extract)

            # Start the next record from a clean state so it cannot inherit
            # this record's blank-line positions or title.
            empties = []
            extract = {}
            title_idx = None

    return docs
67 | + | ||
68 | + | ||
# Class names mapped to the integer targets the classifier is trained on.
labels = {'useless': 0, 'useful': 1}

# The training and the prediction branch must agree on these paths. The
# spelling "tifidf" is kept because it is the name training has always written.
SVC_MODEL_PATH = 'model/svm_model.pkl'
TFIDF_MODEL_PATH = 'model/tifidf_model.pkl'
PARAMS_FILE = "model_params.conf"


def build_parser():
    """Build the command-line parser of the script."""
    parser = argparse.ArgumentParser(
        description="This script separates abstracts of biomedical papers "
                    "that report data from biomedical experiments from those "
                    "that do not.")
    parser.add_argument("--input", help="Input file containing the abstracts "
                                        "to be predicted.")
    parser.add_argument("--classA", help="Input file containing the abstracts "
                                         "of class A to be learned.")
    parser.add_argument("--classB", help="Input file containing the abstracts "
                                         "of class B to be learned.")
    parser.add_argument("--out", help="Path to the output directory "
                        "(default='./filter_output')", default="filter_output")
    parser.add_argument("--svcmodel", help="Path to custom pretrained svc "
                        "model (default='./model/svm_model.pkl')",
                        default=SVC_MODEL_PATH)
    return parser


def _parse_value(text):
    """Convert a CSV cell to int, else float, else leave it as a string.

    Integers are tried first so integral settings such as ``degree`` stay
    integers instead of becoming 3.0.
    """
    for cast in (int, float):
        try:
            return cast(text)
        except ValueError:
            continue
    return text


def load_param_grid(path):
    """Read a CSV of SVC settings into a ``{param: [values]}`` grid.

    Each column becomes one grid entry. Values keep their first-seen order
    and duplicates are dropped: the grid search takes the Cartesian product
    of the lists, so a repeated value only re-evaluates identical candidates.

    Raises:
        ValueError: if the file has no data rows.
    """
    with open(path) as f0:
        reader = csv.DictReader(f0)
        rows = list(reader)
        names = reader.fieldnames or []
    if not rows:
        raise ValueError("no parameter rows found in %s" % path)

    grid = {}
    for name in names:
        values = []
        for row in rows:
            value = _parse_value(row[name])
            if value not in values:
                values.append(value)
        grid[name] = values
    return grid


def train_model(class_a_file, class_b_file, vectorizer):
    """Grid-search an SVC on the two labelled files and persist the result.

    ``class_a_file`` is labelled 'useless' and ``class_b_file`` 'useful'.
    The best estimator and the fitted vectorizer are dumped under ``model/``.
    """
    model_params = load_param_grid(PARAMS_FILE)

    abstracs = get_abstracts(file_name=class_a_file, label=labels['useless'])
    abstracs += get_abstracts(file_name=class_b_file, label=labels['useful'])

    X = vectorizer.fit_transform([x['body'] for x in abstracs])
    y = [x['topic'] for x in abstracs]

    clf = GridSearchCV(SVC(), param_grid=model_params, cv=3,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of candidates the search actually evaluated.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print(clf)
    print(clf.best_score_)

    joblib.dump(clf.best_estimator_, SVC_MODEL_PATH)
    joblib.dump(vectorizer, TFIDF_MODEL_PATH)


def classify_abstracts(input_file, out_dir, svc_model_path):
    """Predict the class of every abstract in ``input_file``.

    Writes one ``<pmid>\\t<body>`` line per abstract into
    ``<out_dir>/useless.out`` or ``<out_dir>/useful.out``.
    """
    import os

    # NOTE(review): joblib.load unpickles its input, which can execute
    # arbitrary code -- only point --svcmodel at trusted files.
    clf = joblib.load(svc_model_path)
    vectorizer = joblib.load(TFIDF_MODEL_PATH)

    abstracs = get_abstracts(file_name=input_file, label='unknown')
    # transform(), not fit_transform(): the features must be the vocabulary
    # the classifier was trained on, not one refitted on the new input.
    X = vectorizer.transform([x['body'] for x in abstracs])
    classes = clf.predict(X)

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    # Invert the label map so output files are named after the class.
    class_names = {index: name for name, index in labels.items()}
    with open(out_dir + "/" + class_names[0] + ".out", 'w') as f0, \
            open(out_dir + "/" + class_names[1] + ".out", 'w') as f1:
        sinks = {0: f0, 1: f1}
        for c, a in zip(classes, abstracs):
            sink = sinks.get(int(c))
            if sink is not None:
                # NOTE(review): a record without a 'pmid' key raises KeyError
                # here, as it did before this rewrite.
                sink.write("%d\t%s\n" % (a['pmid'], a['body']))


def main(argv=None):
    """Train when both class files are given without --input, else predict."""
    parser = build_parser()
    args = parser.parse_args(argv)

    vectorizer = TfidfVectorizer(binary=True)
    print(vectorizer)

    if args.classA and args.classB and not args.input:
        train_model(args.classA, args.classB, vectorizer)
    else:
        if not args.input:
            parser.error("give --input to classify abstracts, or both "
                         "--classA and --classB to train a model")
        classify_abstracts(args.input, args.out, args.svcmodel)


if __name__ == "__main__":
    main()
model/svm_model.pkl
0 → 100644
No preview for this file type
model/tifidf_model.pkl
0 → 100644
No preview for this file type
model_params.conf
0 → 100644
1 | +kernel,degree,coef0,C,gamma | ||
2 | +poly,3,0.2,300,0 | ||
3 | +poly,11,0.9,150,0 | ||
4 | +rbf,0,0.5,100,0.0001 | ||
5 | +linear,1,0.5,100,0.0 | ||
6 | +linear,1,1.5,100,0.0 | ||
7 | +linear,1,2.5,100,0.0 | ||
8 | +linear,1,3.5,100,0.0 | ||
9 | +linear,1,4.5,100,0.0 | ||
10 | +linear,1,1.5,150,0.0 | ||
11 | +linear,1,2.5,200,0.0 | ||
12 | +linear,1,3.5,300,0.0 | ||
13 | +linear,1,4.5,400,0.0 | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment