Ignacio Arroyo

testing pretrained model

...@@ -13,14 +13,10 @@ from sklearn import metrics ...@@ -13,14 +13,10 @@ from sklearn import metrics
13 from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier, 13 from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
14 AdaBoostClassifier, GradientBoostingClassifier) 14 AdaBoostClassifier, GradientBoostingClassifier)
15 from sklearn.grid_search import GridSearchCV 15 from sklearn.grid_search import GridSearchCV
16 +from sklearn.externals import joblib
16 import pandas as pd 17 import pandas as pd
17 from numpy import mean, std 18 from numpy import mean, std
18 19
19 -#Classifier = KNeighborsClassifier # 0.6464
20 -#Classifier = NearestCentroid # 0.5054
21 -#Classifier = RandomForestClassifier # 0.49
22 -#Classifier = LinearSVC # 0.5402
23 -#Classifier = SGDClassifier # 0.664
24 20
25 class EstimatorSelectionHelper: 21 class EstimatorSelectionHelper:
26 "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/" 22 "http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/"
...@@ -71,7 +67,7 @@ def get_abstracts(file_name, label): ...@@ -71,7 +67,7 @@ def get_abstracts(file_name, label):
71 docs = [] 67 docs = []
72 empties = [] 68 empties = []
73 lines = f.readlines() 69 lines = f.readlines()
74 - copyright = False 70 + cpright = False
75 71
76 for i, ln in enumerate(lines): 72 for i, ln in enumerate(lines):
77 if not ln.strip(): 73 if not ln.strip():
...@@ -84,8 +80,8 @@ def get_abstracts(file_name, label): ...@@ -84,8 +80,8 @@ def get_abstracts(file_name, label):
84 break 80 break
85 continue 81 continue
86 82
87 - elif 'Copyright ' in ln: 83 + elif 'cpright ' in ln:
88 - copyright = True 84 + cpright = True
89 85
90 elif 'DOI: ' in ln: 86 elif 'DOI: ' in ln:
91 if 'PMCID: ' in lines[i + 1]: 87 if 'PMCID: ' in lines[i + 1]:
...@@ -93,9 +89,9 @@ def get_abstracts(file_name, label): ...@@ -93,9 +89,9 @@ def get_abstracts(file_name, label):
93 elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]: 89 elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
94 extract['pmid'] = int(lines[i + 1].strip().split()[1]) 90 extract['pmid'] = int(lines[i + 1].strip().split()[1])
95 91
96 - if copyright: 92 + if cpright:
97 get = slice(empties[-3], empties[-2]) 93 get = slice(empties[-3], empties[-2])
98 - copyright = False 94 + cpright = False
99 else: 95 else:
100 get = slice(empties[-2], empties[-1]) 96 get = slice(empties[-2], empties[-1])
101 97
...@@ -115,14 +111,14 @@ def get_abstracts(file_name, label): ...@@ -115,14 +111,14 @@ def get_abstracts(file_name, label):
115 return docs 111 return docs
116 112
117 113
118 -filename="data/ecoli_abstracts/not_useful_abstracts.txt" 114 +filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
119 labels = ['useless', 'useful'] 115 labels = ['useless', 'useful']
120 116
121 -abstracs = get_abstracts(file_name = filename, label = labels[0]) 117 +abstracs = get_abstracts(file_name=filename, label=labels[0])
122 118
123 -filename="data/ecoli_abstracts/useful_abstracts.txt" 119 +filename = "data/ecoli_abstracts/useful_abstracts.txt"
124 120
125 -abstracs += get_abstracts(file_name = filename, label = labels[1]) 121 +abstracs += get_abstracts(file_name=filename, label=labels[1])
126 122
127 X = [x['body'] for x in abstracs] 123 X = [x['body'] for x in abstracs]
128 y = [1 if x['topic'] == 'useful' else 0 for x in abstracs] 124 y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]
...@@ -136,15 +132,17 @@ models1 = { ...@@ -136,15 +132,17 @@ models1 = {
136 } 132 }
137 133
138 params1 = { 134 params1 = {
139 - 'ExtraTreesClassifier': { 'n_estimators': [16, 32] }, 135 + 'ExtraTreesClassifier': {'n_estimators': [16, 32]},
140 - 'RandomForestClassifier': { 'n_estimators': [16, 32] }, 136 + 'RandomForestClassifier': {'n_estimators': [16, 32]},
141 - 'AdaBoostClassifier': { 'n_estimators': [16, 32] }, 137 + 'AdaBoostClassifier': {'n_estimators': [16, 32]},
142 - 'GradientBoostingClassifier': { 'n_estimators': [16, 32], 'learning_rate': [0.8, 1.0] }, 138 + 'GradientBoostingClassifier': {'n_estimators': [16, 32],
139 + 'learning_rate': [0.8, 1.0]},
143 'SVC': [ 140 'SVC': [
144 - #{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]}, 141 + {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
145 - {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]}, 142 + 'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
146 - {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400], 'degree': [2, 3, 4, 5, 6]}, 143 + {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
147 - {'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400], 'gamma': [0.001, 0.0001]}, 144 + 'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
145 + 'coef0': [0.1, 0.2,0.3,0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
148 ] 146 ]
149 } 147 }
150 148
...@@ -171,6 +169,9 @@ X = vectorizer.fit_transform(X) ...@@ -171,6 +169,9 @@ X = vectorizer.fit_transform(X)
171 #st() 169 #st()
172 clf.fit(X, y, scoring='f1', n_jobs=-1) 170 clf.fit(X, y, scoring='f1', n_jobs=-1)
173 171
172 +joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
173 +joblib.dump(clf.best_estimator_, 'model/tifidf_model.pkl')
174 +
174 #pred = clf.predict(X_test) 175 #pred = clf.predict(X_test)
175 #print(metrics.f1_score(y_test, pred, average='macro')) 176 #print(metrics.f1_score(y_test, pred, average='macro'))
176 print(clf.score_summary(sort_by='min_score')) 177 print(clf.score_summary(sort_by='min_score'))
......
1 +from pdb import set_trace as st
2 +from sklearn.cross_validation import train_test_split as splitt
3 +from sklearn.feature_extraction.text import TfidfVectorizer
4 +from sklearn.model_selection import RandomizedSearchCV
5 +from sklearn.model_selection import GridSearchCV
6 +from sklearn import metrics
7 +from sklearn.svm import SVC
8 +import numpy as np
9 +import argparse
10 +import csv
11 +from sklearn.externals import joblib
12 +from time import time
13 +from scipy.stats import randint as sp_randint
14 +from scipy.stats import expon
15 +
16 +
def _flatten(chunk):
    """Join raw file lines into one line of text.

    Newlines become spaces and doubled spaces are collapsed once, which is
    the same normalisation applied to both the abstract body and the title.
    """
    return " ".join(chunk).replace("\n", ' ').replace("  ", ' ')


def get_abstracts(file_name, label):
    """Extract abstract records from a plain-text export of citations.

    The parser is line oriented and relies on these markers (the layout looks
    like a PubMed text export -- TODO confirm against the data files):

    * a line containing ``' doi: '`` (the citation line): the title starts
      right after the first blank line found within the next 10 lines;
    * a line containing ``'Copyright '``: the record carries a copyright
      paragraph between the abstract and the identifiers;
    * a line containing ``'DOI: '``: closes the record. The PMID is read from
      the following line, or from the one after it when the following line is
      a ``'PMCID: '`` line.

    The abstract body is the paragraph delimited by the last two blank lines
    seen before the ``'DOI: '`` line (shifted back by one paragraph when a
    copyright line was seen).

    Parameters
    ----------
    file_name : str
        Path of the text file to parse.
    label : object
        Value stored unchanged under the ``'topic'`` key of every record.

    Returns
    -------
    list of dict
        One dict per record with keys ``'body'``, ``'title'``, ``'topic'``
        and, when an identifier line was found, ``'pmid'`` (int). ``'title'``
        is an empty string when no citation line preceded the record.

    Raises
    ------
    IndexError
        If a record has fewer blank lines before its ``'DOI: '`` line than
        the layout above requires.
    ValueError
        If the token following ``PMID:`` is not an integer.
    """
    # Read everything up front; the context manager closes the handle, which
    # the previous implementation never did.
    with open(file_name) as f:
        lines = f.readlines()

    n_lines = len(lines)
    docs = []
    extract = {}
    empties = []           # indices of blank lines seen since the last record
    title_idx = None       # index of the first title line of this record
    has_copyright = False  # renamed: ``copyright`` shadows the site builtin

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue

        if ' doi: ' in ln:
            # Bounded look-ahead so a citation near EOF cannot index past
            # the end of the file.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue

        if 'Copyright ' in ln:
            has_copyright = True
            continue

        if 'DOI: ' not in ln:
            continue

        # --- end of record: collect identifier, body and title ---
        following = lines[i + 1] if i + 1 < n_lines else ''
        if 'PMCID: ' in following:
            if i + 2 < n_lines:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
        elif 'PMID: ' in following:
            extract['pmid'] = int(following.strip().split()[1])

        if has_copyright:
            # Skip the copyright paragraph sitting between abstract and DOI.
            get = slice(empties[-3], empties[-2])
            has_copyright = False
        else:
            get = slice(empties[-2], empties[-1])
        extract['body'] = _flatten(lines[get])

        title = []
        if title_idx is not None:
            for j in range(title_idx, min(title_idx + 5, n_lines)):
                if not lines[j].strip():
                    break
                title.append(lines[j])
        extract['title'] = _flatten(title)

        extract['topic'] = label
        docs.append(extract)

        # Reset per-record state; clearing title_idx keeps a record without
        # a citation line from inheriting the previous record's title.
        empties = []
        extract = {}
        title_idx = None

    return docs
67 +
68 +
import os  # script-local: used to create the model and output directories

# Persisted artefacts. Training writes and prediction reads the vectorizer
# through the same constant; previously the two paths were spelled
# differently, so a freshly trained vectorizer could never be loaded. The
# spelling training has always written is kept so existing files stay valid.
SVC_MODEL_PATH = 'model/svm_model.pkl'
TFIDF_MODEL_PATH = 'model/tifidf_model.pkl'
PARAMS_FILE = "model_params.conf"


def _coerce(value):
    """Convert a CSV cell to int if possible, else float, else keep the text.

    Integers are tried first because SVC's ``degree`` is an integer
    parameter; forcing every numeric cell to float turned 3 into 3.0.
    """
    for cast in (int, float):
        try:
            return cast(value)
        except ValueError:
            continue
    return value


def _unique(values):
    """Return ``values`` without duplicates, keeping first-seen order."""
    seen = []
    for v in values:
        if v not in seen:
            seen.append(v)
    return seen


parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do "
                "not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                                     "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                                     "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                    "(default='./model/svm_model.pkl')",
                    default=SVC_MODEL_PATH)

args = parser.parse_args()

labels = {'useless': 0, 'useful': 1}
# Reverse lookup (class id -> name) used to name the prediction output files.
class_names = {idx: name for name, idx in labels.items()}
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.classA and args.classB and not args.input:
    # ---- training mode: both class files given, nothing to predict ----
    with open(PARAMS_FILE) as f0:
        rows = list(csv.DictReader(f0))
    if not rows:
        parser.error("%s contains no parameter rows" % PARAMS_FILE)

    # Each CSV column becomes the list of candidate values of one SVC
    # parameter; GridSearchCV then explores the cartesian product of the
    # columns. Repeated values are dropped because they only multiply the
    # number of identical candidates to fit.
    model_params = {name: _unique(_coerce(row[name]) for row in rows)
                    for name in rows[0]}

    abstracs = get_abstracts(file_name=args.classA, label=labels['useless'])
    abstracs += get_abstracts(file_name=args.classB, label=labels['useful'])

    X = vectorizer.fit_transform([x['body'] for x in abstracs])
    y = [x['topic'] for x in abstracs]

    clf = GridSearchCV(SVC(), param_grid=model_params, cv=3,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of candidates actually evaluated by the grid.
    print("GridSearch took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print(clf)
    print(clf.best_score_)

    for path in (SVC_MODEL_PATH, TFIDF_MODEL_PATH):
        os.makedirs(os.path.dirname(path), exist_ok=True)
    joblib.dump(clf.best_estimator_, SVC_MODEL_PATH)
    joblib.dump(vectorizer, TFIDF_MODEL_PATH)

elif args.input:
    # ---- prediction mode ----
    # NOTE(security): joblib.load unpickles arbitrary objects; only point
    # --svcmodel at model files you trust.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load(TFIDF_MODEL_PATH)
    abstracs = get_abstracts(file_name=args.input, label='unknown')

    # transform(), not fit_transform(): the features must be built with the
    # vocabulary and IDF weights the classifier was trained on.
    X = vectorizer.transform([x['body'] for x in abstracs])
    classes = clf.predict(X)

    os.makedirs(args.out, exist_ok=True)
    with open(os.path.join(args.out, class_names[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, class_names[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            if c == 0:
                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
            elif c == 1:
                f1.write("%d\t%s\n" % (a['pmid'], a['body']))

else:
    parser.error("give --classA and --classB to train a model, "
                 "or --input to classify abstracts")
No preview for this file type
No preview for this file type
1 +kernel,degree,coef0,C,gamma
2 +poly,3,0.2,300,0
3 +poly,11,0.9,150,0
4 +rbf,0,0.5,100,0.0001
5 +linear,1,0.5,100,0.0
6 +linear,1,1.5,100,0.0
7 +linear,1,2.5,100,0.0
8 +linear,1,3.5,100,0.0
9 +linear,1,4.5,100,0.0
10 +linear,1,1.5,150,0.0
11 +linear,1,2.5,200,0.0
12 +linear,1,3.5,300,0.0
13 +linear,1,4.5,400,0.0
...\ No newline at end of file ...\ No newline at end of file