Showing
20 changed files
with
82 additions
and
57 deletions
1 | -#from pdb import set_trace as st | ||
2 | -from sklearn.cross_validation import train_test_split as splitt | ||
3 | from sklearn.feature_extraction.text import TfidfVectorizer | 1 | from sklearn.feature_extraction.text import TfidfVectorizer |
4 | from sklearn.decomposition import TruncatedSVD | 2 | from sklearn.decomposition import TruncatedSVD |
5 | -from sklearn.model_selection import RandomizedSearchCV | ||
6 | from sklearn.model_selection import GridSearchCV | 3 | from sklearn.model_selection import GridSearchCV |
7 | from sklearn import metrics | 4 | from sklearn import metrics |
8 | from sklearn.svm import SVC | 5 | from sklearn.svm import SVC |
... | @@ -12,9 +9,6 @@ import csv | ... | @@ -12,9 +9,6 @@ import csv |
12 | import os | 9 | import os |
13 | from sklearn.externals import joblib | 10 | from sklearn.externals import joblib |
14 | from time import time | 11 | from time import time |
15 | -from scipy.stats import randint as sp_randint | ||
16 | -from scipy.stats import expon | ||
17 | -from sklearn.preprocessing import label_binarize | ||
18 | 12 | ||
19 | 13 | ||
20 | def get_abstracts(file_name, label): | 14 | def get_abstracts(file_name, label): |
... | @@ -75,22 +69,21 @@ parser = argparse.ArgumentParser( | ... | @@ -75,22 +69,21 @@ parser = argparse.ArgumentParser( |
75 | parser.add_argument("--input", help="Input file containing the abstracts to" | 69 | parser.add_argument("--input", help="Input file containing the abstracts to" |
76 | "be predited.") | 70 | "be predited.") |
77 | parser.add_argument("--classA", help="Input file containing the abstracts of" | 71 | parser.add_argument("--classA", help="Input file containing the abstracts of" |
78 | - "class A to be learned.") | 72 | + " class useless to be learned.") |
79 | parser.add_argument("--classB", help="Input file containing the abstracts of" | 73 | parser.add_argument("--classB", help="Input file containing the abstracts of" |
80 | - "class B to be learned.") | 74 | + " class USEFUL to be learned.") |
81 | parser.add_argument("--out", help="Path to the output directory " | 75 | parser.add_argument("--out", help="Path to the output directory " |
82 | "(default='./filter_output')", default="filter_output") | 76 | "(default='./filter_output')", default="filter_output") |
83 | parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" | 77 | parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" |
84 | - "(default='./model/svm_model.pkl')", default="model/svm_model.pkl") | 78 | + "(default='./model_binClass/svm_model.pkl')", default="model_binClass/svm_model.pkl") |
85 | 79 | ||
86 | args = parser.parse_args() | 80 | args = parser.parse_args() |
87 | 81 | ||
88 | labels = {0: 'useless', 1: 'useful'} | 82 | labels = {0: 'useless', 1: 'useful'} |
89 | vectorizer = TfidfVectorizer(binary=True) | 83 | vectorizer = TfidfVectorizer(binary=True) |
90 | -print(vectorizer) | ||
91 | 84 | ||
92 | if args.classA and args.classB and not args.input: | 85 | if args.classA and args.classB and not args.input: |
93 | - f0 = open("model_params.conf") | 86 | + f0 = open("model_params_binClass.conf") |
94 | n_iter_search = 10 | 87 | n_iter_search = 10 |
95 | params = [p for p in csv.DictReader(f0)] | 88 | params = [p for p in csv.DictReader(f0)] |
96 | f0.close() | 89 | f0.close() |
... | @@ -115,38 +108,38 @@ if args.classA and args.classB and not args.input: | ... | @@ -115,38 +108,38 @@ if args.classA and args.classB and not args.input: |
115 | svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20) | 108 | svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20) |
116 | svd_model = svd.fit(X) | 109 | svd_model = svd.fit(X) |
117 | X = svd_model.transform(X) | 110 | X = svd_model.transform(X) |
118 | - #y = [x['topic'] for x in abstracs] | ||
119 | y = [0 if x['topic'] == 'useless' else 1 for x in abstracs] | 111 | y = [0 if x['topic'] == 'useless' else 1 for x in abstracs] |
120 | 112 | ||
121 | - #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42) | 113 | + clf = SVC() |
122 | - | 114 | + clf = GridSearchCV(clf, cv=3, param_grid=model_params, |
123 | - clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9) | ||
124 | - clf = GridSearchCV(clf, cv=3, | ||
125 | - param_grid=model_params, | ||
126 | - # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search, | ||
127 | n_jobs=-1, scoring='f1') | 115 | n_jobs=-1, scoring='f1') |
128 | start = time() | 116 | start = time() |
129 | clf.fit(X, y) | 117 | clf.fit(X, y) |
130 | 118 | ||
131 | - #clf.fit(X_train, y_train) | ||
132 | print("GridSearch took %.2f seconds for %d candidates" | 119 | print("GridSearch took %.2f seconds for %d candidates" |
133 | " parameter settings." % ((time() - start), n_iter_search)) | 120 | " parameter settings." % ((time() - start), n_iter_search)) |
134 | 121 | ||
122 | + print() | ||
123 | + print("The best model parameters:") | ||
124 | + print(vectorizer) | ||
125 | + print(svd) | ||
135 | print(clf.best_estimator_) | 126 | print(clf.best_estimator_) |
136 | print() | 127 | print() |
128 | + print("The best F1 score:") | ||
137 | print(clf.best_score_) | 129 | print(clf.best_score_) |
138 | - #print(metrics.f1_score(clf.predict(X_test), y_test)) | ||
139 | 130 | ||
140 | - #joblib.dump(clf, 'model/svm_model.pkl') | 131 | + joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.pkl') |
141 | - joblib.dump(clf.best_estimator_, 'model/svm_model.pkl') | 132 | + joblib.dump(tfidf_model, 'model_binClass/tfidf_model.pkl') |
142 | - joblib.dump(tfidf_model, 'model/tfidf_model.pkl') | 133 | + joblib.dump(svd_model, 'model_binClass/svd_model.pkl') |
143 | - joblib.dump(svd_model, 'model/svd_model.pkl') | ||
144 | 134 | ||
145 | else: | 135 | else: |
146 | 136 | ||
147 | clf = joblib.load(args.svcmodel) | 137 | clf = joblib.load(args.svcmodel) |
148 | - vectorizer = joblib.load('model/tfidf_model.pkl') | 138 | + vectorizer = joblib.load('model_binClass/tfidf_model.pkl') |
149 | - svd = joblib.load('model/svd_model.pkl') | 139 | + svd = joblib.load('model_binClass/svd_model.pkl') |
140 | + print(vectorizer) | ||
141 | + print(svd) | ||
142 | + print(clf) | ||
150 | abstracs = get_abstracts(file_name=args.input, label='unknown') | 143 | abstracs = get_abstracts(file_name=args.input, label='unknown') |
151 | X = vectorizer.transform([x['body'] for x in abstracs]) | 144 | X = vectorizer.transform([x['body'] for x in abstracs]) |
152 | X = svd.transform(X) | 145 | X = svd.transform(X) |
... | @@ -162,3 +155,5 @@ else: | ... | @@ -162,3 +155,5 @@ else: |
162 | f0.write("%d\t%s\n" % (a['pmid'], a['body'])) | 155 | f0.write("%d\t%s\n" % (a['pmid'], a['body'])) |
163 | elif c == 1: | 156 | elif c == 1: |
164 | f1.write("%d\t%s\n" % (a['pmid'], a['body'])) | 157 | f1.write("%d\t%s\n" % (a['pmid'], a['body'])) |
158 | + | ||
159 | + print("FINISHED!!") | ... | ... |
1 | -#from pdb import set_trace as st | ||
2 | -from sklearn.cross_validation import train_test_split as splitt | ||
3 | from sklearn.feature_extraction.text import TfidfVectorizer | 1 | from sklearn.feature_extraction.text import TfidfVectorizer |
4 | from sklearn.decomposition import TruncatedSVD | 2 | from sklearn.decomposition import TruncatedSVD |
5 | -from sklearn.model_selection import RandomizedSearchCV | ||
6 | from sklearn.model_selection import GridSearchCV | 3 | from sklearn.model_selection import GridSearchCV |
7 | from sklearn import metrics | 4 | from sklearn import metrics |
8 | -from sklearn.svm import SVC | 5 | + |
6 | +from sklearn.svm import OneClassSVM | ||
9 | import numpy as np | 7 | import numpy as np |
10 | import argparse | 8 | import argparse |
11 | import csv | 9 | import csv |
12 | import os | 10 | import os |
13 | from sklearn.externals import joblib | 11 | from sklearn.externals import joblib |
14 | from time import time | 12 | from time import time |
15 | -from scipy.stats import randint as sp_randint | ||
16 | -from scipy.stats import expon | ||
17 | -from sklearn.preprocessing import label_binarize | ||
18 | 13 | ||
19 | 14 | ||
20 | def get_abstracts(file_name, label): | 15 | def get_abstracts(file_name, label): |
... | @@ -75,22 +70,22 @@ parser = argparse.ArgumentParser( | ... | @@ -75,22 +70,22 @@ parser = argparse.ArgumentParser( |
75 | parser.add_argument("--input", help="Input file containing the abstracts to" | 70 | parser.add_argument("--input", help="Input file containing the abstracts to" |
76 | "be predited.") | 71 | "be predited.") |
77 | parser.add_argument("--classA", help="Input file containing the abstracts of" | 72 | parser.add_argument("--classA", help="Input file containing the abstracts of" |
78 | - "class A to be learned.") | 73 | + " class USEFUL to be learned.") |
79 | parser.add_argument("--classB", help="Input file containing the abstracts of" | 74 | parser.add_argument("--classB", help="Input file containing the abstracts of" |
80 | - "class B to be learned.") | 75 | + " class useless to be learned.") |
81 | parser.add_argument("--out", help="Path to the output directory " | 76 | parser.add_argument("--out", help="Path to the output directory " |
82 | "(default='./filter_output')", default="filter_output") | 77 | "(default='./filter_output')", default="filter_output") |
83 | parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" | 78 | parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" |
84 | - "(default='./model/svm_model.pkl')", default="model/svm_model.pkl") | 79 | + "(default='./model_oneClass/svm_model.pkl')", default="model_oneClass/svm_model.pkl") |
85 | 80 | ||
86 | args = parser.parse_args() | 81 | args = parser.parse_args() |
87 | 82 | ||
88 | labels = {0: 'useless', 1: 'useful'} | 83 | labels = {0: 'useless', 1: 'useful'} |
89 | vectorizer = TfidfVectorizer(binary=True) | 84 | vectorizer = TfidfVectorizer(binary=True) |
90 | -print(vectorizer) | 85 | + |
91 | 86 | ||
92 | if args.classA and args.classB and not args.input: | 87 | if args.classA and args.classB and not args.input: |
93 | - f0 = open("model_params.conf") | 88 | + f0 = open("model_params_oneClass.conf") |
94 | n_iter_search = 10 | 89 | n_iter_search = 10 |
95 | params = [p for p in csv.DictReader(f0)] | 90 | params = [p for p in csv.DictReader(f0)] |
96 | f0.close() | 91 | f0.close() |
... | @@ -115,50 +110,52 @@ if args.classA and args.classB and not args.input: | ... | @@ -115,50 +110,52 @@ if args.classA and args.classB and not args.input: |
115 | svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20) | 110 | svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20) |
116 | svd_model = svd.fit(X) | 111 | svd_model = svd.fit(X) |
117 | X = svd_model.transform(X) | 112 | X = svd_model.transform(X) |
118 | - #y = [x['topic'] for x in abstracs] | 113 | + y = [-1 if x['topic'] == 'useless' else 1 for x in abstracs] |
119 | - y = [0 if x['topic'] == 'useless' else 1 for x in abstracs] | ||
120 | - | ||
121 | - #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42) | ||
122 | 114 | ||
123 | - clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9) | 115 | + clf = OneClassSVM() |
124 | - clf = GridSearchCV(clf, cv=3, | 116 | + clf = GridSearchCV(clf, cv=3, param_grid=model_params, |
125 | - param_grid=model_params, | ||
126 | - # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search, | ||
127 | n_jobs=-1, scoring='f1') | 117 | n_jobs=-1, scoring='f1') |
128 | start = time() | 118 | start = time() |
129 | clf.fit(X, y) | 119 | clf.fit(X, y) |
130 | 120 | ||
131 | - #clf.fit(X_train, y_train) | ||
132 | print("GridSearch took %.2f seconds for %d candidates" | 121 | print("GridSearch took %.2f seconds for %d candidates" |
133 | " parameter settings." % ((time() - start), n_iter_search)) | 122 | " parameter settings." % ((time() - start), n_iter_search)) |
134 | 123 | ||
124 | + print() | ||
125 | + print("The best model parameters:") | ||
126 | + print(vectorizer) | ||
127 | + print(svd) | ||
135 | print(clf.best_estimator_) | 128 | print(clf.best_estimator_) |
136 | print() | 129 | print() |
130 | + print("The best F1 score:") | ||
137 | print(clf.best_score_) | 131 | print(clf.best_score_) |
138 | - #print(metrics.f1_score(clf.predict(X_test), y_test)) | ||
139 | 132 | ||
140 | - #joblib.dump(clf, 'model/svm_model.pkl') | 133 | + joblib.dump(clf.best_estimator_, 'model_oneClass/svm_model.pkl') |
141 | - joblib.dump(clf.best_estimator_, 'model/svm_model.pkl') | 134 | + joblib.dump(tfidf_model, 'model_oneClass/tfidf_model.pkl') |
142 | - joblib.dump(tfidf_model, 'model/tfidf_model.pkl') | 135 | + joblib.dump(svd_model, 'model_oneClass/svd_model.pkl') |
143 | - joblib.dump(svd_model, 'model/svd_model.pkl') | ||
144 | 136 | ||
145 | else: | 137 | else: |
146 | 138 | ||
147 | clf = joblib.load(args.svcmodel) | 139 | clf = joblib.load(args.svcmodel) |
148 | - vectorizer = joblib.load('model/tfidf_model.pkl') | 140 | + vectorizer = joblib.load('model_oneClass/tfidf_model.pkl') |
149 | - svd = joblib.load('model/svd_model.pkl') | 141 | + svd = joblib.load('model_oneClass/svd_model.pkl') |
142 | + print(vectorizer) | ||
143 | + print(svd) | ||
144 | + print(clf) | ||
150 | abstracs = get_abstracts(file_name=args.input, label='unknown') | 145 | abstracs = get_abstracts(file_name=args.input, label='unknown') |
151 | X = vectorizer.transform([x['body'] for x in abstracs]) | 146 | X = vectorizer.transform([x['body'] for x in abstracs]) |
152 | X = svd.transform(X) | 147 | X = svd.transform(X) |
153 | classes = clf.predict(X) | 148 | classes = clf.predict(X) |
154 | - | 149 | + |
155 | if not os.path.exists(args.out): | 150 | if not os.path.exists(args.out): |
156 | os.makedirs(args.out) | 151 | os.makedirs(args.out) |
157 | # Writing predictions to output files | 152 | # Writing predictions to output files |
158 | with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \ | 153 | with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \ |
159 | open(args.out + "/" + labels[1] + ".out", 'w') as f1: | 154 | open(args.out + "/" + labels[1] + ".out", 'w') as f1: |
160 | for c, a in zip(classes, abstracs): | 155 | for c, a in zip(classes, abstracs): |
161 | - if c == 0: | 156 | + if c == 1: |
162 | f0.write("%d\t%s\n" % (a['pmid'], a['body'])) | 157 | f0.write("%d\t%s\n" % (a['pmid'], a['body'])) |
163 | - elif c == 1: | 158 | + elif c == -1: |
164 | f1.write("%d\t%s\n" % (a['pmid'], a['body'])) | 159 | f1.write("%d\t%s\n" % (a['pmid'], a['body'])) |
160 | + | ||
161 | + print("FINISHED!!") | ... | ... |
model_binClass/svd_model.pkl
0 → 100644
No preview for this file type
model_binClass/svm_model.paper.pkl
0 → 100644
No preview for this file type
model_binClass/svm_model.pkl
0 → 100644
No preview for this file type
model_binClass/tfidf_model.paper.pkl
0 → 100644
No preview for this file type
model_binClass/tfidf_model.pkl
0 → 100644
No preview for this file type
model_oneClass/svd_model.pkl
0 → 100644
No preview for this file type
model_oneClass/svm_model.pkl
0 → 100644
No preview for this file type
model_oneClass/tfidf_model.pkl
0 → 100644
No preview for this file type
model_params_binClass.conf
0 → 100644
model_params_oneClass.conf
0 → 100644
1 | +kernel,degree,coef0,nu,gamma | ||
2 | +linear,1,0.5,1.0,0.0 | ||
3 | +linear,1,0.5,0.9,0.0 | ||
4 | +linear,1,0.5,0.8,0.0 | ||
5 | +linear,1,0.5,0.7,0.0 | ||
6 | +linear,1,0.5,0.6,0.0 | ||
7 | +linear,1,0.5,0.5,0.0 | ||
8 | +linear,1,0.5,0.4,0.0 | ||
9 | +linear,1,0.5,0.3,0.0 | ||
10 | +linear,1,0.5,0.2,0.0 | ||
11 | +linear,1,0.5,0.1,0.0 | ||
12 | +rbf,1,0.5,1.0,2.0 | ||
13 | +rbf,1,0.5,0.9,0.0001 | ||
14 | +rbf,1,0.5,0.8,0.0001 | ||
15 | +rbf,1,0.5,0.7,0.0001 | ||
16 | +rbf,1,0.5,0.6,0.001 | ||
17 | +rbf,1,0.5,0.5,0.001 | ||
18 | +rbf,1,0.5,0.4,0.001 | ||
19 | +rbf,1,0.5,0.7,0.0001 | ||
20 | +rbf,1,0.5,0.4,0.0001 | ||
21 | +rbf,1,0.5,0.5,0.0001 |
oneClass_trainUseful_out/useful.out
0 → 100644
This diff is collapsed. Click to expand it.
oneClass_trainUseful_out/useless.out
0 → 100644
File mode changed
oneClass_trainUseless_out/useful.out
0 → 100644
This diff is collapsed. Click to expand it.
oneClass_trainUseless_out/useless.out
0 → 100644
This diff is collapsed. Click to expand it.
outRNAseq_binClass/useful.out
0 → 100644
This diff is collapsed. Click to expand it.
outRNAseq_binClass/useless.out
0 → 100644
This diff is collapsed. Click to expand it.
outRNAseq_oneClass/useful.out
0 → 100644
This diff is collapsed. Click to expand it.
outRNAseq_oneClass/useless.out
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or login to post a comment