Showing
11 changed files
with
504 additions
and
14 deletions
filter_abstracts.py.save
0 → 100644
1 | +#from pdb import set_trace as st | ||
2 | +from sklearn.cross_validation import train_test_split as splitt | ||
3 | +from sklearn.feature_extraction.text import TfidfVectorizer | ||
4 | +from sklearn.decomposition import TruncatedSVD | ||
5 | +from sklearn.model_selection import RandomizedSearchCV | ||
6 | +from sklearn.model_selection import GridSearchCV | ||
7 | +from sklearn import metrics | ||
8 | +from sklearn.svm import SVC | ||
9 | +import numpy as np | ||
10 | +import argparse | ||
11 | +import csv | ||
12 | +import os | ||
13 | +from sklearn.externals import joblib | ||
14 | +from time import time | ||
15 | +from scipy.stats import randint as sp_randint | ||
16 | +from scipy.stats import expon | ||
17 | +from sklearn.preprocessing import label_binarize | ||
18 | + | ||
19 | + | ||
def _flatten(chunks):
    """Join raw text lines into one line with single-space separators.

    Newlines become spaces and every run of consecutive spaces is reduced to
    one space. A single leading/trailing space is kept when present.
    """
    text = " ".join(chunks).replace("\n", " ")
    # One replace("  ", " ") pass leaves runs of 3+ spaces behind, so repeat
    # until the text is stable.
    while "  " in text:
        text = text.replace("  ", " ")
    return text


def get_abstracts(file_name, label):
    """Extract abstract records from a plain-text citation export.

    The parser is driven by marker lines (the layout looks like PubMed's
    text export -- TODO confirm against the real input files):

    * a header line containing ``' doi: '``; the title starts right after
      the first blank line found within the next 10 lines;
    * an optional line containing ``'Copyright '``, ``'Publish'`` or the
      copyright sign, meaning a copyright paragraph sits between the
      abstract and the footer;
    * a footer line containing ``'DOI: '``, followed by either a
      ``PMID:`` line or a ``PMCID:`` line and then the ``PMID:`` line.

    The abstract body is the paragraph enclosed by the last two blank lines
    seen before the footer (shifted back by one paragraph when a copyright
    paragraph was detected).

    Args:
        file_name: Path of the text file to parse.
        label: Value stored under the ``'topic'`` key of every record.

    Returns:
        A list of dicts with keys ``'body'``, ``'title'``, ``'topic'`` and,
        when a PMID line was found, ``'pmid'`` (int). ``'body'`` and
        ``'title'`` are empty strings when the expected blank-line layout
        or header line is missing for a record.

    Raises:
        ValueError, IndexError: if a PMID line is present but malformed.
    """
    # Read everything up front and release the handle immediately.
    with open(file_name) as f:
        lines = f.readlines()

    n_lines = len(lines)
    docs = []
    extract = {}
    empties = []       # indices of blank lines seen in the current record
    copyright = False  # a copyright/publisher paragraph was seen
    title_idx = None   # index of the first title line of the current record

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue

        if ' doi: ' in ln:
            # Header line: the title follows the next blank line. Clamp the
            # look-ahead so a header near the end of file cannot overrun.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue

        if 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
            copyright = True
            continue

        if 'DOI: ' not in ln:
            continue

        # Footer line: the record is complete, assemble it.
        following = lines[i + 1] if i + 1 < n_lines else ''
        if 'PMCID: ' in following:
            # The PMID line comes after the PMCID line.
            if i + 2 < n_lines:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
        elif 'PMID: ' in following:
            extract['pmid'] = int(following.strip().split()[1])

        # With a copyright paragraph the abstract is one paragraph earlier.
        needed = 3 if copyright else 2
        copyright = False
        if len(empties) >= needed:
            start = empties[-needed]
            stop = empties[-needed + 1]
            extract['body'] = _flatten(lines[start:stop])
        else:
            # Not enough blank lines to delimit an abstract paragraph.
            extract['body'] = ''

        title = []
        if title_idx is not None:
            # Slicing (rather than indexing) is safe at end of file.
            for candidate in lines[title_idx:title_idx + 5]:
                if not candidate.strip():
                    break
                title.append(candidate)
        extract['title'] = _flatten(title)
        extract['topic'] = label
        docs.append(extract)

        # Reset per-record state so nothing leaks into the next record.
        empties = []
        extract = {}
        title_idx = None

    return docs
70 | + | ||
71 | + | ||
# Command-line driver.
#   Training:   --classA FILE --classB FILE   (and no --input)
#   Prediction: --input FILE
# Adjacent string literals are concatenated verbatim, so each fragment that
# continues on the next line must end with a space.
parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                                     "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                                     "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                    "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")

args = parser.parse_args()

# Class index -> human-readable name (also used for the output file names).
labels = {0: 'useless', 1: 'useful'}

if args.classA and args.classB and not args.input:
    # ---- Training mode ----
    vectorizer = TfidfVectorizer(binary=True)
    print(vectorizer)
    n_iter_search = 10

    # model_params.conf is a CSV file: one column per SVC hyper-parameter,
    # one candidate value per row.
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))
    names = list(params[0].keys())
    model_params = {n: [] for n in names}

    for n in names:
        for d in params:
            if n not in d:
                continue
            # Numeric-looking cells become floats; anything else (e.g. a
            # kernel name) stays a string.
            try:
                model_params[n].append(float(d[n]))
            except ValueError:
                model_params[n].append(d[n])

    # De-duplicate candidates while keeping file order, so the grid is
    # enumerated in a reproducible order.
    model_params = {k: list(dict.fromkeys(model_params[k])) for k in model_params}
    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
    bodies = [x['body'] for x in abstracs]

    # TF-IDF features, then a 200-component truncated SVD projection.
    tfidf_model = vectorizer.fit(bodies)
    X = tfidf_model.transform(bodies)
    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
    svd_model = svd.fit(X)
    X = svd_model.transform(X)
    y = [0 if x['topic'] == labels[0] else 1 for x in abstracs]

    # Exhaustive 3-fold search over the configured grid, scored by F1.
    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of candidates actually evaluated by the search.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    # Persist the whole pipeline; make sure the target directory exists.
    if not os.path.exists('model'):
        os.makedirs('model')
    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
    joblib.dump(svd_model, 'model/svd_model.pkl')

else:
    # ---- Prediction mode ----
    if not args.input:
        # Fail with a usage message instead of crashing inside open(None).
        parser.error("provide --input, or both --classA and --classB")

    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # from trusted sources.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.pkl')
    svd = joblib.load('model/svd_model.pkl')
    abstracs = get_abstracts(file_name=args.input, label='unknown')
    X = vectorizer.transform([x['body'] for x in abstracs])
    X = svd.transform(X)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files: one "<pmid>\t<body>" line per
    # abstract, in the file named after its predicted class.
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            if c == 0:
                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
            elif c == 1:
                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
filter_abstracts_binClass.py
0 → 100644
1 | +#from pdb import set_trace as st | ||
2 | +from sklearn.cross_validation import train_test_split as splitt | ||
3 | +from sklearn.feature_extraction.text import TfidfVectorizer | ||
4 | +from sklearn.decomposition import TruncatedSVD | ||
5 | +from sklearn.model_selection import RandomizedSearchCV | ||
6 | +from sklearn.model_selection import GridSearchCV | ||
7 | +from sklearn import metrics | ||
8 | +from sklearn.svm import SVC | ||
9 | +import numpy as np | ||
10 | +import argparse | ||
11 | +import csv | ||
12 | +import os | ||
13 | +from sklearn.externals import joblib | ||
14 | +from time import time | ||
15 | +from scipy.stats import randint as sp_randint | ||
16 | +from scipy.stats import expon | ||
17 | +from sklearn.preprocessing import label_binarize | ||
18 | + | ||
19 | + | ||
def _flatten(chunks):
    """Join raw text lines into one line with single-space separators.

    Newlines become spaces and every run of consecutive spaces is reduced to
    one space. A single leading/trailing space is kept when present.
    """
    text = " ".join(chunks).replace("\n", " ")
    # One replace("  ", " ") pass leaves runs of 3+ spaces behind, so repeat
    # until the text is stable.
    while "  " in text:
        text = text.replace("  ", " ")
    return text


def get_abstracts(file_name, label):
    """Extract abstract records from a plain-text citation export.

    The parser is driven by marker lines (the layout looks like PubMed's
    text export -- TODO confirm against the real input files):

    * a header line containing ``' doi: '``; the title starts right after
      the first blank line found within the next 10 lines;
    * an optional line containing ``'Copyright '``, ``'Publish'`` or the
      copyright sign, meaning a copyright paragraph sits between the
      abstract and the footer;
    * a footer line containing ``'DOI: '``, followed by either a
      ``PMID:`` line or a ``PMCID:`` line and then the ``PMID:`` line.

    The abstract body is the paragraph enclosed by the last two blank lines
    seen before the footer (shifted back by one paragraph when a copyright
    paragraph was detected).

    Args:
        file_name: Path of the text file to parse.
        label: Value stored under the ``'topic'`` key of every record.

    Returns:
        A list of dicts with keys ``'body'``, ``'title'``, ``'topic'`` and,
        when a PMID line was found, ``'pmid'`` (int). ``'body'`` and
        ``'title'`` are empty strings when the expected blank-line layout
        or header line is missing for a record.

    Raises:
        ValueError, IndexError: if a PMID line is present but malformed.
    """
    # Read everything up front and release the handle immediately.
    with open(file_name) as f:
        lines = f.readlines()

    n_lines = len(lines)
    docs = []
    extract = {}
    empties = []       # indices of blank lines seen in the current record
    copyright = False  # a copyright/publisher paragraph was seen
    title_idx = None   # index of the first title line of the current record

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue

        if ' doi: ' in ln:
            # Header line: the title follows the next blank line. Clamp the
            # look-ahead so a header near the end of file cannot overrun.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue

        if 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
            copyright = True
            continue

        if 'DOI: ' not in ln:
            continue

        # Footer line: the record is complete, assemble it.
        following = lines[i + 1] if i + 1 < n_lines else ''
        if 'PMCID: ' in following:
            # The PMID line comes after the PMCID line.
            if i + 2 < n_lines:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
        elif 'PMID: ' in following:
            extract['pmid'] = int(following.strip().split()[1])

        # With a copyright paragraph the abstract is one paragraph earlier.
        needed = 3 if copyright else 2
        copyright = False
        if len(empties) >= needed:
            start = empties[-needed]
            stop = empties[-needed + 1]
            extract['body'] = _flatten(lines[start:stop])
        else:
            # Not enough blank lines to delimit an abstract paragraph.
            extract['body'] = ''

        title = []
        if title_idx is not None:
            # Slicing (rather than indexing) is safe at end of file.
            for candidate in lines[title_idx:title_idx + 5]:
                if not candidate.strip():
                    break
                title.append(candidate)
        extract['title'] = _flatten(title)
        extract['topic'] = label
        docs.append(extract)

        # Reset per-record state so nothing leaks into the next record.
        empties = []
        extract = {}
        title_idx = None

    return docs
70 | + | ||
71 | + | ||
# Command-line driver.
#   Training:   --classA FILE --classB FILE   (and no --input)
#   Prediction: --input FILE
# Adjacent string literals are concatenated verbatim, so each fragment that
# continues on the next line must end with a space.
parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                                     "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                                     "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                    "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")

args = parser.parse_args()

# Class index -> human-readable name (also used for the output file names).
labels = {0: 'useless', 1: 'useful'}
# Created in both modes; prediction mode replaces it with the persisted one.
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.classA and args.classB and not args.input:
    # ---- Training mode ----
    n_iter_search = 10

    # model_params.conf is a CSV file: one column per SVC hyper-parameter,
    # one candidate value per row.
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))
    names = list(params[0].keys())
    model_params = {n: [] for n in names}

    for n in names:
        for d in params:
            if n not in d:
                continue
            # Numeric-looking cells become floats; anything else (e.g. a
            # kernel name) stays a string.
            try:
                model_params[n].append(float(d[n]))
            except ValueError:
                model_params[n].append(d[n])

    # De-duplicate candidates while keeping file order, so the grid is
    # enumerated in a reproducible order.
    model_params = {k: list(dict.fromkeys(model_params[k])) for k in model_params}
    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
    bodies = [x['body'] for x in abstracs]

    # TF-IDF features, then a 200-component truncated SVD projection.
    tfidf_model = vectorizer.fit(bodies)
    X = tfidf_model.transform(bodies)
    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
    svd_model = svd.fit(X)
    X = svd_model.transform(X)
    y = [0 if x['topic'] == labels[0] else 1 for x in abstracs]

    # Exhaustive 3-fold search over the configured grid, scored by F1.
    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of candidates actually evaluated by the search.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    # Persist the whole pipeline; make sure the target directory exists.
    if not os.path.exists('model'):
        os.makedirs('model')
    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
    joblib.dump(svd_model, 'model/svd_model.pkl')

else:
    # ---- Prediction mode ----
    if not args.input:
        # Fail with a usage message instead of crashing inside open(None).
        parser.error("provide --input, or both --classA and --classB")

    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # from trusted sources.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.pkl')
    svd = joblib.load('model/svd_model.pkl')
    abstracs = get_abstracts(file_name=args.input, label='unknown')
    X = vectorizer.transform([x['body'] for x in abstracs])
    X = svd.transform(X)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files: one "<pmid>\t<body>" line per
    # abstract, in the file named after its predicted class.
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            if c == 0:
                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
            elif c == 1:
                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
filter_abstracts_oneClass.py
0 → 100644
1 | +#from pdb import set_trace as st | ||
2 | +from sklearn.cross_validation import train_test_split as splitt | ||
3 | +from sklearn.feature_extraction.text import TfidfVectorizer | ||
4 | +from sklearn.decomposition import TruncatedSVD | ||
5 | +from sklearn.model_selection import RandomizedSearchCV | ||
6 | +from sklearn.model_selection import GridSearchCV | ||
7 | +from sklearn import metrics | ||
8 | +from sklearn.svm import SVC | ||
9 | +import numpy as np | ||
10 | +import argparse | ||
11 | +import csv | ||
12 | +import os | ||
13 | +from sklearn.externals import joblib | ||
14 | +from time import time | ||
15 | +from scipy.stats import randint as sp_randint | ||
16 | +from scipy.stats import expon | ||
17 | +from sklearn.preprocessing import label_binarize | ||
18 | + | ||
19 | + | ||
def _flatten(chunks):
    """Join raw text lines into one line with single-space separators.

    Newlines become spaces and every run of consecutive spaces is reduced to
    one space. A single leading/trailing space is kept when present.
    """
    text = " ".join(chunks).replace("\n", " ")
    # One replace("  ", " ") pass leaves runs of 3+ spaces behind, so repeat
    # until the text is stable.
    while "  " in text:
        text = text.replace("  ", " ")
    return text


def get_abstracts(file_name, label):
    """Extract abstract records from a plain-text citation export.

    The parser is driven by marker lines (the layout looks like PubMed's
    text export -- TODO confirm against the real input files):

    * a header line containing ``' doi: '``; the title starts right after
      the first blank line found within the next 10 lines;
    * an optional line containing ``'Copyright '``, ``'Publish'`` or the
      copyright sign, meaning a copyright paragraph sits between the
      abstract and the footer;
    * a footer line containing ``'DOI: '``, followed by either a
      ``PMID:`` line or a ``PMCID:`` line and then the ``PMID:`` line.

    The abstract body is the paragraph enclosed by the last two blank lines
    seen before the footer (shifted back by one paragraph when a copyright
    paragraph was detected).

    Args:
        file_name: Path of the text file to parse.
        label: Value stored under the ``'topic'`` key of every record.

    Returns:
        A list of dicts with keys ``'body'``, ``'title'``, ``'topic'`` and,
        when a PMID line was found, ``'pmid'`` (int). ``'body'`` and
        ``'title'`` are empty strings when the expected blank-line layout
        or header line is missing for a record.

    Raises:
        ValueError, IndexError: if a PMID line is present but malformed.
    """
    # Read everything up front and release the handle immediately.
    with open(file_name) as f:
        lines = f.readlines()

    n_lines = len(lines)
    docs = []
    extract = {}
    empties = []       # indices of blank lines seen in the current record
    copyright = False  # a copyright/publisher paragraph was seen
    title_idx = None   # index of the first title line of the current record

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue

        if ' doi: ' in ln:
            # Header line: the title follows the next blank line. Clamp the
            # look-ahead so a header near the end of file cannot overrun.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue

        if 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
            copyright = True
            continue

        if 'DOI: ' not in ln:
            continue

        # Footer line: the record is complete, assemble it.
        following = lines[i + 1] if i + 1 < n_lines else ''
        if 'PMCID: ' in following:
            # The PMID line comes after the PMCID line.
            if i + 2 < n_lines:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
        elif 'PMID: ' in following:
            extract['pmid'] = int(following.strip().split()[1])

        # With a copyright paragraph the abstract is one paragraph earlier.
        needed = 3 if copyright else 2
        copyright = False
        if len(empties) >= needed:
            start = empties[-needed]
            stop = empties[-needed + 1]
            extract['body'] = _flatten(lines[start:stop])
        else:
            # Not enough blank lines to delimit an abstract paragraph.
            extract['body'] = ''

        title = []
        if title_idx is not None:
            # Slicing (rather than indexing) is safe at end of file.
            for candidate in lines[title_idx:title_idx + 5]:
                if not candidate.strip():
                    break
                title.append(candidate)
        extract['title'] = _flatten(title)
        extract['topic'] = label
        docs.append(extract)

        # Reset per-record state so nothing leaks into the next record.
        empties = []
        extract = {}
        title_idx = None

    return docs
70 | + | ||
71 | + | ||
# Command-line driver.
#   Training:   --classA FILE --classB FILE   (and no --input)
#   Prediction: --input FILE
# Adjacent string literals are concatenated verbatim, so each fragment that
# continues on the next line must end with a space.
parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                                     "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                                     "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                    "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")

args = parser.parse_args()

# Class index -> human-readable name (also used for the output file names).
labels = {0: 'useless', 1: 'useful'}
# Created in both modes; prediction mode replaces it with the persisted one.
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.classA and args.classB and not args.input:
    # ---- Training mode ----
    n_iter_search = 10

    # model_params.conf is a CSV file: one column per SVC hyper-parameter,
    # one candidate value per row.
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))
    names = list(params[0].keys())
    model_params = {n: [] for n in names}

    for n in names:
        for d in params:
            if n not in d:
                continue
            # Numeric-looking cells become floats; anything else (e.g. a
            # kernel name) stays a string.
            try:
                model_params[n].append(float(d[n]))
            except ValueError:
                model_params[n].append(d[n])

    # De-duplicate candidates while keeping file order, so the grid is
    # enumerated in a reproducible order.
    model_params = {k: list(dict.fromkeys(model_params[k])) for k in model_params}
    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
    bodies = [x['body'] for x in abstracs]

    # TF-IDF features, then a 200-component truncated SVD projection.
    tfidf_model = vectorizer.fit(bodies)
    X = tfidf_model.transform(bodies)
    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
    svd_model = svd.fit(X)
    X = svd_model.transform(X)
    y = [0 if x['topic'] == labels[0] else 1 for x in abstracs]

    # Exhaustive 3-fold search over the configured grid, scored by F1.
    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of candidates actually evaluated by the search.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    # Persist the whole pipeline; make sure the target directory exists.
    if not os.path.exists('model'):
        os.makedirs('model')
    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
    joblib.dump(svd_model, 'model/svd_model.pkl')

else:
    # ---- Prediction mode ----
    if not args.input:
        # Fail with a usage message instead of crashing inside open(None).
        parser.error("provide --input, or both --classA and --classB")

    # NOTE: joblib.load unpickles arbitrary objects; only load model files
    # from trusted sources.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.pkl')
    svd = joblib.load('model/svd_model.pkl')
    abstracs = get_abstracts(file_name=args.input, label='unknown')
    X = vectorizer.transform([x['body'] for x in abstracs])
    X = svd.transform(X)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files: one "<pmid>\t<body>" line per
    # abstract, in the file named after its predicted class.
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            if c == 0:
                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
            elif c == 1:
                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
This diff could not be displayed because it is too large.
... | @@ -30,15 +30,14 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" | ... | @@ -30,15 +30,14 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" |
30 | "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl") | 30 | "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl") |
31 | 31 | ||
32 | args = parser.parse_args() | 32 | args = parser.parse_args() |
33 | +labels = {0: 'useless', 1: 'useful'} | ||
33 | 34 | ||
34 | -data=load_files(container_path=args.traind, encoding=None, | 35 | +if args.traind and not args.input: |
36 | + data=load_files(container_path=args.traind, encoding=None, | ||
35 | decode_error='replace') | 37 | decode_error='replace') |
36 | -labels = data.target_names | 38 | + labels = data.target_names |
37 | - | 39 | + vectorizer = TfidfVectorizer(binary=True) |
38 | -vectorizer = TfidfVectorizer(binary=True) | 40 | + print(vectorizer) |
39 | -print(vectorizer) | ||
40 | - | ||
41 | -if args.train and not args.input: | ||
42 | f0 = open("model_params.conf") | 41 | f0 = open("model_params.conf") |
43 | n_iter_search = 10 | 42 | n_iter_search = 10 |
44 | params = [p for p in csv.DictReader(f0)] | 43 | params = [p for p in csv.DictReader(f0)] |
... | @@ -56,10 +55,9 @@ if args.train and not args.input: | ... | @@ -56,10 +55,9 @@ if args.train and not args.input: |
56 | model_params[n].append(d[k]) | 55 | model_params[n].append(d[k]) |
57 | 56 | ||
58 | model_params = {k: list(set(model_params[k])) for k in model_params} | 57 | model_params = {k: list(set(model_params[k])) for k in model_params} |
59 | - papers = data.data | ||
60 | 58 | ||
61 | - tfidf_model = vectorizer.fit(papers) | 59 | + tfidf_model = vectorizer.fit(data.data) |
62 | - X = vectorizer.transform(papers) | 60 | + X = vectorizer.transform(data.data) |
63 | #y = [x['topic'] for x in abstracs] | 61 | #y = [x['topic'] for x in abstracs] |
64 | y = data.target | 62 | y = data.target |
65 | 63 | ||
... | @@ -87,15 +85,15 @@ if args.train and not args.input: | ... | @@ -87,15 +85,15 @@ if args.train and not args.input: |
87 | joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl') | 85 | joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl') |
88 | 86 | ||
89 | else: | 87 | else: |
90 | - | 88 | + from pdb import set_trace as st |
91 | data=load_files(container_path=args.input, encoding=None, | 89 | data=load_files(container_path=args.input, encoding=None, |
92 | decode_error='replace') | 90 | decode_error='replace') |
93 | clf = joblib.load(args.svcmodel) | 91 | clf = joblib.load(args.svcmodel) |
94 | vectorizer = joblib.load('model/tfidf_model.paper.pkl') | 92 | vectorizer = joblib.load('model/tfidf_model.paper.pkl') |
95 | - papers = data.data | 93 | + X = vectorizer.transform(data.data) |
96 | - X = vectorizer.transform(papers) | ||
97 | - classes = clf.predict(X) | ||
98 | 94 | ||
95 | + classes = clf.predict(X) | ||
96 | + st() | ||
99 | if not os.path.exists(args.out): | 97 | if not os.path.exists(args.out): |
100 | os.makedirs(args.out) | 98 | os.makedirs(args.out) |
101 | # Writing predictions to output files | 99 | # Writing predictions to output files | ... | ... |
model/svd_model.pkl
0 → 100644
No preview for this file type
model/svm_model.paper.pkl
0 → 100644
No preview for this file type
No preview for this file type
model/tfidf_model.paper.pkl
0 → 100644
No preview for this file type
outRNAseq/useful.out
0 → 100644
This diff is collapsed. Click to expand it.
outRNAseq/useless.out
0 → 100644
File mode changed
-
Please register or login to post a comment