iarroyof

Final version for abstracts

1 -#from pdb import set_trace as st
2 -from sklearn.cross_validation import train_test_split as splitt
3 from sklearn.feature_extraction.text import TfidfVectorizer 1 from sklearn.feature_extraction.text import TfidfVectorizer
4 from sklearn.decomposition import TruncatedSVD 2 from sklearn.decomposition import TruncatedSVD
5 -from sklearn.model_selection import RandomizedSearchCV
6 from sklearn.model_selection import GridSearchCV 3 from sklearn.model_selection import GridSearchCV
7 from sklearn import metrics 4 from sklearn import metrics
8 from sklearn.svm import SVC 5 from sklearn.svm import SVC
...@@ -12,9 +9,6 @@ import csv ...@@ -12,9 +9,6 @@ import csv
12 import os 9 import os
13 from sklearn.externals import joblib 10 from sklearn.externals import joblib
14 from time import time 11 from time import time
15 -from scipy.stats import randint as sp_randint
16 -from scipy.stats import expon
17 -from sklearn.preprocessing import label_binarize
18 12
19 13
20 def get_abstracts(file_name, label): 14 def get_abstracts(file_name, label):
...@@ -75,22 +69,21 @@ parser = argparse.ArgumentParser( ...@@ -75,22 +69,21 @@ parser = argparse.ArgumentParser(
75 parser.add_argument("--input", help="Input file containing the abstracts to" 69 parser.add_argument("--input", help="Input file containing the abstracts to"
76 "be predited.") 70 "be predited.")
77 parser.add_argument("--classA", help="Input file containing the abstracts of" 71 parser.add_argument("--classA", help="Input file containing the abstracts of"
78 - "class A to be learned.") 72 + " class useless to be learned.")
79 parser.add_argument("--classB", help="Input file containing the abstracts of" 73 parser.add_argument("--classB", help="Input file containing the abstracts of"
80 - "class B to be learned.") 74 + " class USEFUL to be learned.")
81 parser.add_argument("--out", help="Path to the output directory " 75 parser.add_argument("--out", help="Path to the output directory "
82 "(default='./filter_output')", default="filter_output") 76 "(default='./filter_output')", default="filter_output")
83 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" 77 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
84 - "(default='./model/svm_model.pkl')", default="model/svm_model.pkl") 78 + "(default='./model_binClass/svm_model.pkl')", default="model_binClass/svm_model.pkl")
85 79
86 args = parser.parse_args() 80 args = parser.parse_args()
87 81
88 labels = {0: 'useless', 1: 'useful'} 82 labels = {0: 'useless', 1: 'useful'}
89 vectorizer = TfidfVectorizer(binary=True) 83 vectorizer = TfidfVectorizer(binary=True)
90 -print(vectorizer)
91 84
92 if args.classA and args.classB and not args.input: 85 if args.classA and args.classB and not args.input:
93 - f0 = open("model_params.conf") 86 + f0 = open("model_params_binClass.conf")
94 n_iter_search = 10 87 n_iter_search = 10
95 params = [p for p in csv.DictReader(f0)] 88 params = [p for p in csv.DictReader(f0)]
96 f0.close() 89 f0.close()
...@@ -115,38 +108,38 @@ if args.classA and args.classB and not args.input: ...@@ -115,38 +108,38 @@ if args.classA and args.classB and not args.input:
115 svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20) 108 svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
116 svd_model = svd.fit(X) 109 svd_model = svd.fit(X)
117 X = svd_model.transform(X) 110 X = svd_model.transform(X)
118 - #y = [x['topic'] for x in abstracs]
119 y = [0 if x['topic'] == 'useless' else 1 for x in abstracs] 111 y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]
120 112
121 - #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42) 113 + clf = SVC()
122 - 114 + clf = GridSearchCV(clf, cv=3, param_grid=model_params,
123 - clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
124 - clf = GridSearchCV(clf, cv=3,
125 - param_grid=model_params,
126 - # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
127 n_jobs=-1, scoring='f1') 115 n_jobs=-1, scoring='f1')
128 start = time() 116 start = time()
129 clf.fit(X, y) 117 clf.fit(X, y)
130 118
131 - #clf.fit(X_train, y_train)
132 print("GridSearch took %.2f seconds for %d candidates" 119 print("GridSearch took %.2f seconds for %d candidates"
133 " parameter settings." % ((time() - start), n_iter_search)) 120 " parameter settings." % ((time() - start), n_iter_search))
134 121
122 + print()
123 + print("The best model parameters:")
124 + print(vectorizer)
125 + print(svd)
135 print(clf.best_estimator_) 126 print(clf.best_estimator_)
136 print() 127 print()
128 + print("The best F1 score:")
137 print(clf.best_score_) 129 print(clf.best_score_)
138 - #print(metrics.f1_score(clf.predict(X_test), y_test))
139 130
140 - #joblib.dump(clf, 'model/svm_model.pkl') 131 + joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.pkl')
141 - joblib.dump(clf.best_estimator_, 'model/svm_model.pkl') 132 + joblib.dump(tfidf_model, 'model_binClass/tfidf_model.pkl')
142 - joblib.dump(tfidf_model, 'model/tfidf_model.pkl') 133 + joblib.dump(svd_model, 'model_binClass/svd_model.pkl')
143 - joblib.dump(svd_model, 'model/svd_model.pkl')
144 134
145 else: 135 else:
146 136
147 clf = joblib.load(args.svcmodel) 137 clf = joblib.load(args.svcmodel)
148 - vectorizer = joblib.load('model/tfidf_model.pkl') 138 + vectorizer = joblib.load('model_binClass/tfidf_model.pkl')
149 - svd = joblib.load('model/svd_model.pkl') 139 + svd = joblib.load('model_binClass/svd_model.pkl')
140 + print(vectorizer)
141 + print(svd)
142 + print(clf)
150 abstracs = get_abstracts(file_name=args.input, label='unknown') 143 abstracs = get_abstracts(file_name=args.input, label='unknown')
151 X = vectorizer.transform([x['body'] for x in abstracs]) 144 X = vectorizer.transform([x['body'] for x in abstracs])
152 X = svd.transform(X) 145 X = svd.transform(X)
...@@ -162,3 +155,5 @@ else: ...@@ -162,3 +155,5 @@ else:
162 f0.write("%d\t%s\n" % (a['pmid'], a['body'])) 155 f0.write("%d\t%s\n" % (a['pmid'], a['body']))
163 elif c == 1: 156 elif c == 1:
164 f1.write("%d\t%s\n" % (a['pmid'], a['body'])) 157 f1.write("%d\t%s\n" % (a['pmid'], a['body']))
158 +
159 + print ("FINISHED!!")
......
1 -#from pdb import set_trace as st
2 -from sklearn.cross_validation import train_test_split as splitt
3 from sklearn.feature_extraction.text import TfidfVectorizer 1 from sklearn.feature_extraction.text import TfidfVectorizer
4 from sklearn.decomposition import TruncatedSVD 2 from sklearn.decomposition import TruncatedSVD
5 -from sklearn.model_selection import RandomizedSearchCV
6 from sklearn.model_selection import GridSearchCV 3 from sklearn.model_selection import GridSearchCV
7 from sklearn import metrics 4 from sklearn import metrics
8 -from sklearn.svm import SVC 5 +
6 +from sklearn.svm import OneClassSVM
9 import numpy as np 7 import numpy as np
10 import argparse 8 import argparse
11 import csv 9 import csv
12 import os 10 import os
13 from sklearn.externals import joblib 11 from sklearn.externals import joblib
14 from time import time 12 from time import time
15 -from scipy.stats import randint as sp_randint
16 -from scipy.stats import expon
17 -from sklearn.preprocessing import label_binarize
18 13
19 14
20 def get_abstracts(file_name, label): 15 def get_abstracts(file_name, label):
...@@ -75,22 +70,22 @@ parser = argparse.ArgumentParser( ...@@ -75,22 +70,22 @@ parser = argparse.ArgumentParser(
75 parser.add_argument("--input", help="Input file containing the abstracts to" 70 parser.add_argument("--input", help="Input file containing the abstracts to"
76 "be predited.") 71 "be predited.")
77 parser.add_argument("--classA", help="Input file containing the abstracts of" 72 parser.add_argument("--classA", help="Input file containing the abstracts of"
78 - "class A to be learned.") 73 + " class USEFUL to be learned.")
79 parser.add_argument("--classB", help="Input file containing the abstracts of" 74 parser.add_argument("--classB", help="Input file containing the abstracts of"
80 - "class B to be learned.") 75 + " class useless to be learned.")
81 parser.add_argument("--out", help="Path to the output directory " 76 parser.add_argument("--out", help="Path to the output directory "
82 "(default='./filter_output')", default="filter_output") 77 "(default='./filter_output')", default="filter_output")
83 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" 78 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
84 - "(default='./model/svm_model.pkl')", default="model/svm_model.pkl") 79 + "(default='./model/svm_model.pkl')", default="model_oneClass/svm_model.pkl")
85 80
86 args = parser.parse_args() 81 args = parser.parse_args()
87 82
88 labels = {0: 'useless', 1: 'useful'} 83 labels = {0: 'useless', 1: 'useful'}
89 vectorizer = TfidfVectorizer(binary=True) 84 vectorizer = TfidfVectorizer(binary=True)
90 -print(vectorizer) 85 +
91 86
92 if args.classA and args.classB and not args.input: 87 if args.classA and args.classB and not args.input:
93 - f0 = open("model_params.conf") 88 + f0 = open("model_params_oneClass.conf")
94 n_iter_search = 10 89 n_iter_search = 10
95 params = [p for p in csv.DictReader(f0)] 90 params = [p for p in csv.DictReader(f0)]
96 f0.close() 91 f0.close()
...@@ -115,50 +110,52 @@ if args.classA and args.classB and not args.input: ...@@ -115,50 +110,52 @@ if args.classA and args.classB and not args.input:
115 svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20) 110 svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
116 svd_model = svd.fit(X) 111 svd_model = svd.fit(X)
117 X = svd_model.transform(X) 112 X = svd_model.transform(X)
118 - #y = [x['topic'] for x in abstracs] 113 + y = [-1 if x['topic'] == 'useless' else 1 for x in abstracs]
119 - y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]
120 -
121 - #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
122 114
123 - clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9) 115 + clf = OneClassSVM()
124 - clf = GridSearchCV(clf, cv=3, 116 + clf = GridSearchCV(clf, cv=3, param_grid=model_params,
125 - param_grid=model_params,
126 - # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
127 n_jobs=-1, scoring='f1') 117 n_jobs=-1, scoring='f1')
128 start = time() 118 start = time()
129 clf.fit(X, y) 119 clf.fit(X, y)
130 120
131 - #clf.fit(X_train, y_train)
132 print("GridSearch took %.2f seconds for %d candidates" 121 print("GridSearch took %.2f seconds for %d candidates"
133 " parameter settings." % ((time() - start), n_iter_search)) 122 " parameter settings." % ((time() - start), n_iter_search))
134 123
124 + print()
125 + print("The best model parameters:")
126 + print(vectorizer)
127 + print(svd)
135 print(clf.best_estimator_) 128 print(clf.best_estimator_)
136 print() 129 print()
130 + print("The best F1 score:")
137 print(clf.best_score_) 131 print(clf.best_score_)
138 - #print(metrics.f1_score(clf.predict(X_test), y_test))
139 132
140 - #joblib.dump(clf, 'model/svm_model.pkl') 133 + joblib.dump(clf.best_estimator_, 'model_oneClass/svm_model.pkl')
141 - joblib.dump(clf.best_estimator_, 'model/svm_model.pkl') 134 + joblib.dump(tfidf_model, 'model_oneClass/tfidf_model.pkl')
142 - joblib.dump(tfidf_model, 'model/tfidf_model.pkl') 135 + joblib.dump(svd_model, 'model_oneClass/svd_model.pkl')
143 - joblib.dump(svd_model, 'model/svd_model.pkl')
144 136
145 else: 137 else:
146 138
147 clf = joblib.load(args.svcmodel) 139 clf = joblib.load(args.svcmodel)
148 - vectorizer = joblib.load('model/tfidf_model.pkl') 140 + vectorizer = joblib.load('model_oneClass/tfidf_model.pkl')
149 - svd = joblib.load('model/svd_model.pkl') 141 + svd = joblib.load('model_oneClass/svd_model.pkl')
142 + print(vectorizer)
143 + print(svd)
144 + print(clf)
150 abstracs = get_abstracts(file_name=args.input, label='unknown') 145 abstracs = get_abstracts(file_name=args.input, label='unknown')
151 X = vectorizer.transform([x['body'] for x in abstracs]) 146 X = vectorizer.transform([x['body'] for x in abstracs])
152 X = svd.transform(X) 147 X = svd.transform(X)
153 classes = clf.predict(X) 148 classes = clf.predict(X)
154 - 149 +
155 if not os.path.exists(args.out): 150 if not os.path.exists(args.out):
156 os.makedirs(args.out) 151 os.makedirs(args.out)
157 # Writing predictions to output files 152 # Writing predictions to output files
158 with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \ 153 with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
159 open(args.out + "/" + labels[1] + ".out", 'w') as f1: 154 open(args.out + "/" + labels[1] + ".out", 'w') as f1:
160 for c, a in zip(classes, abstracs): 155 for c, a in zip(classes, abstracs):
161 - if c == 0: 156 + if c == 1:
162 f0.write("%d\t%s\n" % (a['pmid'], a['body'])) 157 f0.write("%d\t%s\n" % (a['pmid'], a['body']))
163 - elif c == 1: 158 + elif c == -1:
164 f1.write("%d\t%s\n" % (a['pmid'], a['body'])) 159 f1.write("%d\t%s\n" % (a['pmid'], a['body']))
160 +
161 + print("FINISHED!!")
......
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
1 +kernel,degree,coef0,C,gamma
2 +linear,1,0.5,100,0.0
3 +linear,1,0.5,10,0.0
4 +linear,1,0.5,50,0.0
5 +linear,1,0.5,100,0.0
6 +linear,1,0.5,5,0.0
7 +linear,1,0.5,150,0.0
8 +linear,1,0.5,200,0.0
9 +linear,1,0.5,300,0.0
10 +linear,1,0.5,400,0.0
11 +linear,1,0.5,1.0,0.0
12 +linear,1,0.5,5.0,0.0
1 +kernel,degree,coef0,nu,gamma
2 +linear,1,0.5,1.0,0.0
3 +linear,1,0.5,0.9,0.0
4 +linear,1,0.5,0.8,0.0
5 +linear,1,0.5,0.7,0.0
6 +linear,1,0.5,0.6,0.0
7 +linear,1,0.5,0.5,0.0
8 +linear,1,0.5,0.4,0.0
9 +linear,1,0.5,0.3,0.0
10 +linear,1,0.5,0.2,0.0
11 +linear,1,0.5,0.1,0.0
12 +rbf,1,0.5,1.0,2.0
13 +rbf,1,0.5,0.9,0.0001
14 +rbf,1,0.5,0.8,0.0001
15 +rbf,1,0.5,0.7,0.0001
16 +rbf,1,0.5,0.6,0.001
17 +rbf,1,0.5,0.5,0.001
18 +rbf,1,0.5,0.4,0.001
19 +rbf,1,0.5,0.7,0.0001
20 +rbf,1,0.5,0.4,0.0001
21 +rbf,1,0.5,0.5,0.0001
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.