iarroyof

added pretrained models; now ready to predict unseen abstracts

1 # This paper talks about (and reports) experimental data
2 
3 -Automatic discrimination of useless papers via machine learning of abstracts
\ No newline at end of file
3 +Automatic discrimination of useless papers via machine learning of abstracts.
4 +
5 +The main method follows this pipeline (a minimal sketch follows each mode's list):
6 +
7 +### Training mode
8 +- Parse abstracts from two input files (classA and classB; see the file format in the `data/` directory)
9 +- Transform abstracts into their TFIDF sparse representations
10 +- Train Support Vector Machines with different parameter settings using grid search
11 +- Select the best estimator and save it at `model/svm_model.pkl` (default)
12 +- Save the TFIDF transformer to preserve the training vocabulary (stored at `model/tfidf_model.pkl`)
13 +
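A minimal sketch of this mode, written against the current scikit-learn API (the scripts in this commit target the older `sklearn.grid_search` and `sklearn.externals.joblib` modules); `abstracts_a`/`abstracts_b` are placeholders for the parsed abstract bodies, and the parameter grid is illustrative:

```python
# Hedged sketch of the training mode; assumes scikit-learn and joblib.
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

def train(abstracts_a, abstracts_b):
    texts = abstracts_a + abstracts_b
    y = [0] * len(abstracts_a) + [1] * len(abstracts_b)  # 0 = useless, 1 = useful
    vectorizer = TfidfVectorizer(binary=True)
    X = vectorizer.fit_transform(texts)  # TFIDF sparse matrix
    param_grid = {'kernel': ['rbf'], 'C': [1, 10, 100], 'gamma': [0.01, 0.001]}
    search = GridSearchCV(SVC(), param_grid, scoring='f1', cv=3, refit=True)
    search.fit(X, y)
    joblib.dump(search.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(vectorizer, 'model/tfidf_model.pkl')  # keeps the training vocabulary
```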
14 +### Prediction mode
15 +- Parse abstracts from a single input file
16 +- Transform abstracts into their TFIDF sparse representations
17 +- Predict useless/useful papers from their abstracts using the pretrained Support Vector Machine
18 +
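And a matching sketch of the prediction mode, assuming the two files saved by the training sketch above:

```python
# Hedged sketch of the prediction mode; reuses the pickles from training.
import joblib

def predict(abstracts):
    clf = joblib.load('model/svm_model.pkl')
    vectorizer = joblib.load('model/tfidf_model.pkl')
    X = vectorizer.transform(abstracts)  # transform only: reuse training vocabulary
    return clf.predict(X)                # 0 = useless, 1 = useful
```

Prediction must call `transform`, not `fit_transform`: refitting the vectorizer on unseen abstracts would rebuild the vocabulary and misalign the features against what the SVM saw during training.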
19 +# Usage
20 +
21 +To filter unknown abstracts, run
22 +
23 +```bash
24 +$ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt
25 +```
26 +The predictions are stored at `filter_output/` by default, unless a different directory is specified with the `--out` option. The default files containing the predictions are
27 +
28 +- filter_output/useful.out
29 +- filter_output/useless.out
30 +
31 +The format of each file is:
32 +
33 +```
34 +<PMID> \t <text of the abstract>
35 +...
36 +<PMID> \t <text of the abstract>
37 +```
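A file in this format can be consumed with Python's `csv` module, for example (a sketch; the path is the default `useful.out` above):

```python
# Read one predictions file: each line is "<PMID>\t<abstract text>".
import csv

with open('filter_output/useful.out', newline='') as f:
    for pmid, abstract in csv.reader(f, delimiter='\t'):
        print(pmid, abstract[:60])  # PMID plus the start of the abstract
```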
38 +
39 +To train a new model, set the parameter list in `model_params.conf` and then run
40 +
41 +```bash
42 +$ python filter_abstracts.py --classA data/ecoli_abstracts/not_useful_abstracts.txt --classB data/ecoli_abstracts/useful_abstracts.txt
43 +```
44 +
45 +where `--classA` and `--classB` specify the input training files. In this example, `data/ecoli_abstracts/useful_abstracts.txt` is the training file containing abstracts of papers that report experimental data (the desired, or useful, class).
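For reference, `model_params.conf` is a plain CSV whose header row names SVC hyperparameters (`kernel,degree,coef0,C,gamma`) and whose rows list the values to try; the training script collects each column into a deduplicated list to form the search grid. A sketch of that parsing step, assuming this column layout (not the script's exact code):

```python
# Sketch: turn the model_params.conf CSV into a parameter-grid dict,
# mirroring how the training script builds its search space.
import csv

def read_param_grid(path='model_params.conf'):
    grid = {}
    with open(path, newline='') as f:
        for row in csv.DictReader(f):
            for name, value in row.items():
                try:
                    value = float(value) if '.' in value else int(value)
                except ValueError:
                    pass  # non-numeric values (e.g. kernel names) stay strings
                grid.setdefault(name, []).append(value)
    # deduplicate each column's values
    return {name: sorted(set(values), key=str) for name, values in grid.items()}
```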
@@ -28,6 +28,7 @@ class EstimatorSelectionHelper:
28          self.params = params
29          self.keys = models.keys()
30          self.grid_searches = {}
31 +        self.best_estimator_ = None  # filled in by score_summary()
32 
33      def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True):
34          for key in self.keys:
@@ -40,24 +41,25 @@ class EstimatorSelectionHelper:
41              self.grid_searches[key] = gs
42 
43      def score_summary(self, sort_by='mean_score'):
43 -        def row(key, scores, params):
44 +        def row(key, scores, params, model):
45              d = {
46                  'estimator': key,
47                  'min_score': min(scores),
48                  'max_score': max(scores),
49                  'mean_score': mean(scores),
50                  'std_score': std(scores),
51 +                'model': model
52              }
53              return pd.Series(dict(list(params.items()) + list(d.items())))
54 
53 -        rows = [row(k, gsc.cv_validation_scores, gsc.parameters)
55 +        rows = [row(k, gsc.cv_validation_scores, gsc.parameters, self.grid_searches[k].best_estimator_)
56                  for k in self.keys
57                  for gsc in self.grid_searches[k].grid_scores_]
58          df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
59 
60          columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
59 -        columns = columns + [c for c in df.columns if c not in columns]
61 +        columns = columns + [c for c in df.columns if (c not in columns and c != 'model')]
60 -
62 +        self.best_estimator_ = df['model'].iloc[0]  # .iloc: top row after sorting
63          return df[columns]
64 
@@ -169,9 +171,9 @@ X = vectorizer.fit_transform(X)
171 #st()
172 clf.fit(X, y, scoring='f1', n_jobs=-1)
173 
172 -joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
173 -joblib.dump(clf.best_estimator_, 'model/tifidf_model.pkl')
174 -
174 #pred = clf.predict(X_test)
175 #print(metrics.f1_score(y_test, pred, average='macro'))
176 print(clf.score_summary(sort_by='min_score'))
177 +
178 +joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
179 +joblib.dump(vectorizer, 'model/tfidf_model.pkl')
1 +#from pdb import set_trace as st
2 +from sklearn.cross_validation import train_test_split as splitt
3 +from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
4 +from sklearn.decomposition import TruncatedSVD
5 +from sklearn.naive_bayes import MultinomialNB
6 +from sklearn.linear_model import SGDClassifier
7 +from sklearn.neighbors import KNeighborsClassifier
8 +from sklearn.neighbors import NearestCentroid
9 +from sklearn.ensemble import RandomForestClassifier
10 +from sklearn.svm import LinearSVC
11 +from sklearn.svm import SVC
12 +from sklearn import metrics
13 +from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
14 + AdaBoostClassifier, GradientBoostingClassifier)
15 +from sklearn.grid_search import GridSearchCV
16 +from sklearn.externals import joblib
17 +import pandas as pd
18 +from numpy import mean, std
19 +
20 +
21 +class EstimatorSelectionHelper:
22 + "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/"
23 + def __init__(self, models, params):
24 + if not set(models.keys()).issubset(set(params.keys())):
25 + missing_params = list(set(models.keys()) - set(params.keys()))
26 + raise ValueError("Some estimators are missing parameters: %s" % missing_params)
27 + self.models = models
28 + self.params = params
29 + self.keys = models.keys()
30 + self.grid_searches = {}
31 +        self.best_estimator_ = None  # filled in by score_summary()
32 +
33 +    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True):  # refit so best_estimator_ exists
34 + for key in self.keys:
35 + print("Running GridSearchCV for %s." % key)
36 + model = self.models[key]
37 + params = self.params[key]
38 + gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
39 + verbose=verbose, scoring=scoring, refit=refit)
40 + gs.fit(X,y)
41 + self.grid_searches[key] = gs
42 +
43 + def score_summary(self, sort_by='mean_score'):
44 + def row(key, scores, params, model):
45 + d = {
46 + 'estimator': key,
47 + 'min_score': min(scores),
48 + 'max_score': max(scores),
49 + 'mean_score': mean(scores),
50 + 'std_score': std(scores),
51 + 'model': model
52 + }
53 + return pd.Series(dict(list(params.items()) + list(d.items())))
54 +
55 +        rows = [row(k, gsc.cv_validation_scores, gsc.parameters, self.grid_searches[k].best_estimator_)
56 +                for k in self.keys
57 +                for gsc in self.grid_searches[k].grid_scores_]
58 + df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
59 +
60 + columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
61 + columns = columns + [c for c in df.columns if (c not in columns and c != 'model')]
62 +        self.best_estimator_ = df['model'].iloc[0]  # .iloc: top row after sorting
63 + return df[columns]
64 +
65 +
66 +def get_abstracts(file_name, label):
67 + f = open(file_name)
68 + extract = {}
69 + docs = []
70 + empties = []
71 + lines = f.readlines()
72 + cpright = False
73 +
74 + for i, ln in enumerate(lines):
75 + if not ln.strip():
76 + empties.append(i)
77 + continue
78 + elif ' doi: ' in ln:
79 + for j in range(i, i + 10):
80 + if not lines[j].strip():
81 + title_idx = j + 1
82 + break
83 + continue
84 +
85 +        elif 'Copyright ' in ln:
86 + cpright = True
87 +
88 + elif 'DOI: ' in ln:
89 + if 'PMCID: ' in lines[i + 1]:
90 + extract['pmid'] = int(lines[i + 2].strip().split()[1])
91 + elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
92 + extract['pmid'] = int(lines[i + 1].strip().split()[1])
93 +
94 + if cpright:
95 + get = slice(empties[-3], empties[-2])
96 + cpright = False
97 + else:
98 + get = slice(empties[-2], empties[-1])
99 +
100 +            extract['body'] = " ".join(lines[get]).replace("\n", ' ').replace("  ", ' ')
101 + title = []
102 + for j in range(title_idx, title_idx + 5):
103 + if lines[j].strip():
104 + title.append(lines[j])
105 + else:
106 + break
107 +            extract['title'] = " ".join(title).replace("\n", ' ').replace("  ", ' ')
108 + extract['topic'] = label
109 + docs.append(extract)
110 + empties = []
111 + extract = {}
112 +
113 + return docs
114 +
115 +
116 +filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
117 +labels = ['useless', 'useful']
118 +
119 +abstracs = get_abstracts(file_name=filename, label=labels[0])
120 +
121 +filename = "data/ecoli_abstracts/useful_abstracts.txt"
122 +
123 +abstracs += get_abstracts(file_name=filename, label=labels[1])
124 +
125 +X = [x['body'] for x in abstracs]
126 +y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]
127 +
128 +models1 = {
129 + 'ExtraTreesClassifier': ExtraTreesClassifier(),
130 + 'RandomForestClassifier': RandomForestClassifier(),
131 + 'AdaBoostClassifier': AdaBoostClassifier(),
132 + 'GradientBoostingClassifier': GradientBoostingClassifier(),
133 + 'SVC': SVC()
134 +}
135 +
136 +params1 = {
137 + 'ExtraTreesClassifier': {'n_estimators': [16, 32]},
138 + 'RandomForestClassifier': {'n_estimators': [16, 32]},
139 + 'AdaBoostClassifier': {'n_estimators': [16, 32]},
140 + 'GradientBoostingClassifier': {'n_estimators': [16, 32],
141 + 'learning_rate': [0.8, 1.0]},
142 + 'SVC': [
143 + {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
144 + 'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
145 + {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
146 + 'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
147 + 'coef0': [0.1, 0.2,0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
148 + ]
149 +}
150 +
151 +clf = EstimatorSelectionHelper(models1, params1)
152 +
153 +vectorizer = TfidfVectorizer(binary=True)
154 +# optionally include n-grams:
155 +#vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3))
156 +#vectorizer = HashingVectorizer(non_negative=True)
157 +print(vectorizer)
158 +#svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
159 +X = vectorizer.fit_transform(X)
160 +#X = svd.fit_transform(X)
161 +
162 +#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
163 +
164 +#from sklearn.feature_selection import chi2, SelectKBest
165 +#ch2 = SelectKBest(chi2, k=200)
166 +#X_train = ch2.fit_transform(X_train, y_train)
167 +#X_test = ch2.transform(X_test)
168 +
169 +#clf = MultinomialNB(alpha=.01)
170 +#clf = Classifier(n_jobs=-1, n_iter=100)
171 +#st()
172 +clf.fit(X, y, scoring='f1', n_jobs=-1)
173 +
174 +#pred = clf.predict(X_test)
175 +#print(metrics.f1_score(y_test, pred, average='macro'))
176 +print(clf.score_summary(sort_by='min_score'))
177 +
178 +joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
179 +joblib.dump(vectorizer, 'model/tfidf_model.pkl')
1 +TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
2 + dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
3 + lowercase=True, max_df=1.0, max_features=None, min_df=1,
4 + ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
5 + stop_words=None, strip_accents=None, sublinear_tf=False,
6 + token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
7 + vocabulary=None)
8 +Running GridSearchCV for GradientBoostingClassifier.
9 +Fitting 3 folds for each of 4 candidates, totalling 12 fits
10 +Running GridSearchCV for AdaBoostClassifier.
11 +Fitting 3 folds for each of 2 candidates, totalling 6 fits
12 +Running GridSearchCV for ExtraTreesClassifier.
13 +Fitting 3 folds for each of 2 candidates, totalling 6 fits
14 +Running GridSearchCV for SVC.
15 +Fitting 3 folds for each of 63 candidates, totalling 189 fits
16 +Running GridSearchCV for RandomForestClassifier.
17 +Fitting 3 folds for each of 2 candidates, totalling 6 fits
18 + estimator min_score mean_score max_score std_score \
19 +36 SVC 0.69697 0.702911 0.705882 0.00420147
20 +66 SVC 0.69697 0.702911 0.705882 0.00420147
21 +35 SVC 0.69697 0.702911 0.705882 0.00420147
22 +37 SVC 0.69697 0.702911 0.705882 0.00420147
23 +38 SVC 0.69697 0.702911 0.705882 0.00420147
24 +39 SVC 0.69697 0.702911 0.705882 0.00420147
25 +40 SVC 0.69697 0.702911 0.705882 0.00420147
26 +41 SVC 0.69697 0.702911 0.705882 0.00420147
27 +42 SVC 0.69697 0.702911 0.705882 0.00420147
28 +43 SVC 0.69697 0.702911 0.705882 0.00420147
29 +44 SVC 0.69697 0.702911 0.705882 0.00420147
30 +45 SVC 0.69697 0.702911 0.705882 0.00420147
31 +46 SVC 0.69697 0.702911 0.705882 0.00420147
32 +47 SVC 0.69697 0.702911 0.705882 0.00420147
33 +48 SVC 0.69697 0.702911 0.705882 0.00420147
34 +49 SVC 0.69697 0.702911 0.705882 0.00420147
35 +50 SVC 0.69697 0.702911 0.705882 0.00420147
36 +51 SVC 0.69697 0.702911 0.705882 0.00420147
37 +52 SVC 0.69697 0.702911 0.705882 0.00420147
38 +53 SVC 0.69697 0.702911 0.705882 0.00420147
39 +54 SVC 0.69697 0.702911 0.705882 0.00420147
40 +55 SVC 0.69697 0.702911 0.705882 0.00420147
41 +56 SVC 0.69697 0.702911 0.705882 0.00420147
42 +57 SVC 0.69697 0.702911 0.705882 0.00420147
43 +58 SVC 0.69697 0.702911 0.705882 0.00420147
44 +59 SVC 0.69697 0.702911 0.705882 0.00420147
45 +60 SVC 0.69697 0.702911 0.705882 0.00420147
46 +61 SVC 0.69697 0.702911 0.705882 0.00420147
47 +62 SVC 0.69697 0.702911 0.705882 0.00420147
48 +63 SVC 0.69697 0.702911 0.705882 0.00420147
49 +.. ... ... ... ... ...
50 +12 SVC 0.69697 0.702911 0.705882 0.00420147
51 +13 SVC 0.69697 0.702911 0.705882 0.00420147
52 +14 SVC 0.69697 0.702911 0.705882 0.00420147
53 +15 SVC 0.69697 0.702911 0.705882 0.00420147
54 +16 SVC 0.69697 0.702911 0.705882 0.00420147
55 +17 SVC 0.69697 0.702911 0.705882 0.00420147
56 +26 SVC 0.69697 0.702911 0.705882 0.00420147
57 +25 SVC 0.69697 0.702911 0.705882 0.00420147
58 +30 SVC 0.69697 0.702911 0.705882 0.00420147
59 +29 SVC 0.69697 0.702911 0.705882 0.00420147
60 +28 SVC 0.69697 0.702911 0.705882 0.00420147
61 +27 SVC 0.69697 0.702911 0.705882 0.00420147
62 +19 SVC 0.69697 0.702911 0.705882 0.00420147
63 +65 SVC 0.69697 0.702911 0.705882 0.00420147
64 +24 SVC 0.69697 0.702911 0.705882 0.00420147
65 +23 SVC 0.69697 0.702911 0.705882 0.00420147
66 +22 SVC 0.69697 0.702911 0.705882 0.00420147
67 +21 SVC 0.69697 0.702911 0.705882 0.00420147
68 +18 SVC 0.686567 0.693502 0.69697 0.0049038
69 +20 SVC 0.676923 0.691047 0.707692 0.0126874
70 +7 ExtraTreesClassifier 0.619048 0.662524 0.688525 0.0309388
71 +6 ExtraTreesClassifier 0.588235 0.611627 0.655738 0.0312098
72 +1 GradientBoostingClassifier 0.577778 0.595982 0.610169 0.0135256
73 +0 GradientBoostingClassifier 0.5 0.549894 0.596491 0.0394613
74 +71 RandomForestClassifier 0.470588 0.557789 0.625 0.0646035
75 +3 GradientBoostingClassifier 0.454545 0.548927 0.596491 0.0667386
76 +2 GradientBoostingClassifier 0.439024 0.588593 0.701754 0.110305
77 +5 AdaBoostClassifier 0.411765 0.489657 0.618182 0.0915596
78 +4 AdaBoostClassifier 0.4 0.54013 0.655172 0.105673
79 +72 RandomForestClassifier 0.380952 0.504177 0.631579 0.10236
80 +
81 + C degree gamma kernel learning_rate n_estimators
82 +36 100 6 NaN poly NaN NaN
83 +66 200 NaN 0.0001 sigmoid NaN NaN
84 +35 100 5 NaN poly NaN NaN
85 +37 150 2 NaN poly NaN NaN
86 +38 150 3 NaN poly NaN NaN
87 +39 150 4 NaN poly NaN NaN
88 +40 150 5 NaN poly NaN NaN
89 +41 150 6 NaN poly NaN NaN
90 +42 200 2 NaN poly NaN NaN
91 +43 200 3 NaN poly NaN NaN
92 +44 200 4 NaN poly NaN NaN
93 +45 200 5 NaN poly NaN NaN
94 +46 200 6 NaN poly NaN NaN
95 +47 300 2 NaN poly NaN NaN
96 +48 300 3 NaN poly NaN NaN
97 +49 300 4 NaN poly NaN NaN
98 +50 300 5 NaN poly NaN NaN
99 +51 300 6 NaN poly NaN NaN
100 +52 400 2 NaN poly NaN NaN
101 +53 400 3 NaN poly NaN NaN
102 +54 400 4 NaN poly NaN NaN
103 +55 400 5 NaN poly NaN NaN
104 +56 400 6 NaN poly NaN NaN
105 +57 1 NaN 0.001 sigmoid NaN NaN
106 +58 1 NaN 0.0001 sigmoid NaN NaN
107 +59 10 NaN 0.001 sigmoid NaN NaN
108 +60 10 NaN 0.0001 sigmoid NaN NaN
109 +61 100 NaN 0.001 sigmoid NaN NaN
110 +62 100 NaN 0.0001 sigmoid NaN NaN
111 +63 150 NaN 0.001 sigmoid NaN NaN
112 +.. ... ... ... ... ... ...
113 +12 100 NaN 0.001 rbf NaN NaN
114 +13 100 NaN 0.0001 rbf NaN NaN
115 +14 150 NaN 0.001 rbf NaN NaN
116 +15 150 NaN 0.0001 rbf NaN NaN
117 +16 200 NaN 0.001 rbf NaN NaN
118 +17 200 NaN 0.0001 rbf NaN NaN
119 +26 1 6 NaN poly NaN NaN
120 +25 1 5 NaN poly NaN NaN
121 +30 10 5 NaN poly NaN NaN
122 +29 10 4 NaN poly NaN NaN
123 +28 10 3 NaN poly NaN NaN
124 +27 10 2 NaN poly NaN NaN
125 +19 300 NaN 0.0001 rbf NaN NaN
126 +65 200 NaN 0.001 sigmoid NaN NaN
127 +24 1 4 NaN poly NaN NaN
128 +23 1 3 NaN poly NaN NaN
129 +22 1 2 NaN poly NaN NaN
130 +21 400 NaN 0.0001 rbf NaN NaN
131 +18 300 NaN 0.001 rbf NaN NaN
132 +20 400 NaN 0.001 rbf NaN NaN
133 +7 NaN NaN NaN NaN NaN 32
134 +6 NaN NaN NaN NaN NaN 16
135 +1 NaN NaN NaN NaN 0.8 32
136 +0 NaN NaN NaN NaN 0.8 16
137 +71 NaN NaN NaN NaN NaN 16
138 +3 NaN NaN NaN NaN 1 32
139 +2 NaN NaN NaN NaN 1 16
140 +5 NaN NaN NaN NaN NaN 32
141 +4 NaN NaN NaN NaN NaN 16
142 +72 NaN NaN NaN NaN NaN 32
143 +
144 +[73 rows x 11 columns]
1 -from pdb import set_trace as st
1 +#from pdb import set_trace as st
2  from sklearn.cross_validation import train_test_split as splitt
3  from sklearn.feature_extraction.text import TfidfVectorizer
4  from sklearn.model_selection import RandomizedSearchCV
@@ -8,10 +8,12 @@ from sklearn.svm import SVC
8  import numpy as np
9  import argparse
10 import csv
11 +import os
12 from sklearn.externals import joblib
13 from time import time
14 from scipy.stats import randint as sp_randint
15 from scipy.stats import expon
16 +from sklearn.preprocessing import label_binarize
17 
18 
19 def get_abstracts(file_name, label):
@@ -33,7 +35,7 @@ def get_abstracts(file_name, label):
35                 break
36             continue
37 
36 -        elif 'Copyright ' in ln:
38 +        elif 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
39             copyright = True
40 
41         elif 'DOI: ' in ln:
@@ -82,7 +84,7 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
84 
85 args = parser.parse_args()
86 
85 -labels = {'useless': 0, 'useful': 1}
87 +labels = {0: 'useless', 1: 'useful'}
88 vectorizer = TfidfVectorizer(binary=True)
89 print(vectorizer)
90 
@@ -103,11 +105,14 @@ if args.classA and args.classA and not args.input:
105         except ValueError:
106             model_params[n].append(d[k])
107 
106 -    abstracs = get_abstracts(file_name=args.classA, label=labels['useless'])
107 -    abstracs += get_abstracts(file_name=args.classB, label=labels['useful'])
108 +    model_params = {k: list(set(model_params[k])) for k in model_params}
109 +    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
110 +    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
111 
109 -    X = vectorizer.fit_transform([x['body'] for x in abstracs])
110 -    y = [x['topic'] for x in abstracs]
112 +    tfidf_model = vectorizer.fit([x['body'] for x in abstracs])
113 +    X = vectorizer.transform([x['body'] for x in abstracs])
114 +    #y = [x['topic'] for x in abstracs]
115 +    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]
116 
117     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
118 
@@ -124,27 +129,29 @@ if args.classA and args.classA and not args.input:
129           " parameter settings." % ((time() - start), n_iter_search))
130 
131     print(clf.best_estimator_)
127 -    print(clf)
132 +    print()
133     print(clf.best_score_)
134     #print(metrics.f1_score(clf.predict(X_test), y_test))
135 
136     #joblib.dump(clf, 'model/svm_model.pkl')
137     joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
133 -    joblib.dump(vectorizer, 'model/tifidf_model.pkl')
138 +    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
139 +
140 else:
141 
142     clf = joblib.load(args.svcmodel)
143     vectorizer = joblib.load('model/tfidf_model.pkl')
138 -    #filename=args.input #"data/ecoli_abstracts/not_useful_abstracts.txt"
144     abstracs = get_abstracts(file_name=args.input, label='unknown')
140 -    X = vectorizer.fit_transform([x['body'] for x in abstracs])
145 +    X = vectorizer.transform([x['body'] for x in abstracs])
146     classes = clf.predict(X)
147 
143 -    with open(args.output + "/" + labels[0] + ".out", 'w') as f0, \
144 -         open(args.output + "/" + labels[1] + ".out", 'w') as f1:
148 +    if not os.path.exists(args.out):
149 +        os.makedirs(args.out)
150 +    # Writing predictions to output files
151 +    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
152 +         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
153         for c, a in zip(classes, abstracs):
154             if c == 0:
155                 f0.write("%d\t%s\n" % (a['pmid'], a['body']))
156             elif c == 1:
157                 f1.write("%d\t%s\n" % (a['pmid'], a['body']))
150 -#clf.fit(X, y, scoring='f1', n_jobs=-1)
[Four more files in this commit have no preview: one diff too large to display, one file-mode change, and two binary files.]
1 kernel,degree,coef0,C,gamma
2 -poly,3,0.2,300,0
3 -poly,11,0.9,150,0
4 -rbf,0,0.5,100,0.0001
2  linear,1,0.5,100,0.0
6 -linear,1,1.5,100,0.0
7 -linear,1,2.5,100,0.0
8 -linear,1,3.5,100,0.0
9 -linear,1,4.5,100,0.0
10 -linear,1,1.5,150,0.0
11 -linear,1,2.5,200,0.0
12 -linear,1,3.5,300,0.0
13 -linear,1,4.5,400,0.0
\ No newline at end of file
3 +linear,1,0.5,10,0.0
4 +linear,1,0.5,50,0.0
5 +linear,1,0.5,100,0.0
6 +linear,1,0.5,5,0.0
7 +linear,1,0.5,150,0.0
8 +linear,1,0.5,200,0.0
9 +linear,1,0.5,300,0.0
10 +linear,1,0.5,400,0.0
11 +poly,3,0.0,100,0.0
......