iarroyof

latest version

1 +#from pdb import set_trace as st
2 +from sklearn.cross_validation import train_test_split as splitt
3 +from sklearn.feature_extraction.text import TfidfVectorizer
4 +from sklearn.decomposition import TruncatedSVD
5 +from sklearn.model_selection import RandomizedSearchCV
6 +from sklearn.model_selection import GridSearchCV
7 +from sklearn import metrics
8 +from sklearn.svm import SVC
9 +import numpy as np
10 +import argparse
11 +import csv
12 +import os
13 +from sklearn.externals import joblib
14 +from time import time
15 +from scipy.stats import randint as sp_randint
16 +from scipy.stats import expon
17 +from sklearn.preprocessing import label_binarize
18 +
19 +
def get_abstracts(file_name, label):
    """Parse a plain-text abstract export into one dict per record.

    A record is closed by a line containing ``'DOI: '``.  The paragraph
    (run of lines between two blank lines) immediately before that line is
    taken as the abstract body; when a line containing ``'Copyright '``,
    ``'Publish'`` or the copyright sign was seen in the record, the
    paragraph one step further back is used instead.  The title is the run
    of non-blank lines (at most five) following the first blank line after
    the most recent line containing ``' doi: '``.

    Args:
        file_name: Path of the text file to parse.
        label: Value stored under ``'topic'`` in every returned record.

    Returns:
        A list of dicts with keys ``'body'``, ``'title'`` and ``'topic'``,
        plus ``'pmid'`` (int) when a ``PMID: `` line follows the ``DOI: ``
        line directly or after a ``PMCID: `` line.  ``'title'`` is ``''``
        when the record has no ``' doi: '`` line to anchor it.  Runs of
        whitespace in body and title are collapsed to single spaces.

    Raises:
        IndexError: If a record has fewer blank lines before its ``DOI: ``
            line than are needed to delimit the abstract paragraph.
        ValueError: If the token after ``PMID:`` is not an integer.
    """
    # Read everything and close the file right away: the parser needs random
    # access to lines before and after the current one.
    with open(file_name) as f:
        lines = f.readlines()

    n_lines = len(lines)
    docs = []
    extract = {}
    empties = []           # indices of blank lines seen since the last record
    title_idx = None       # index of the current record's first title line
    has_copyright = False  # a copyright/publisher line precedes 'DOI: '

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)

        elif ' doi: ' in ln:
            # The title starts right after the next blank line; the search
            # is bounded so it cannot run past the end of the file.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break

        elif ('Copyright ' in ln or 'Publish' in ln
              or u'\N{COPYRIGHT SIGN}' in ln):
            has_copyright = True

        elif 'DOI: ' in ln:
            # The PMID sits on the next line, or one line further down when
            # a PMCID line comes first.
            next_ln = lines[i + 1] if i + 1 < n_lines else ''
            if 'PMCID: ' in next_ln:
                pmid_ln = lines[i + 2] if i + 2 < n_lines else ''
            elif 'PMID: ' in next_ln:
                pmid_ln = next_ln
            else:
                pmid_ln = ''
            pmid_fields = pmid_ln.split()
            if len(pmid_fields) > 1:
                extract['pmid'] = int(pmid_fields[1])

            if has_copyright:
                # Skip the copyright paragraph that follows the abstract.
                get = slice(empties[-3], empties[-2])
                has_copyright = False
            else:
                get = slice(empties[-2], empties[-1])
            extract['body'] = " ".join(" ".join(lines[get]).split())

            title = []
            if title_idx is not None:
                for j in range(title_idx, min(title_idx + 5, n_lines)):
                    if not lines[j].strip():
                        break
                    title.append(lines[j])
            extract['title'] = " ".join(" ".join(title).split())

            extract['topic'] = label
            docs.append(extract)

            # Start the next record from a clean state so that nothing
            # (in particular the title position) leaks across records.
            empties = []
            extract = {}
            title_idx = None

    return docs
70 +
71 +
# Command-line driver: either train the TF-IDF -> SVD -> SVC pipeline from two
# labelled abstract files, or load the saved pipeline and split the abstracts
# of an input file into one output file per predicted class.
parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do "
                "not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                                     "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                                     "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                                  "(default='./filter_output')",
                    default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                                       "(default='./model/svm_model.pkl')",
                    default="model/svm_model.pkl")

args = parser.parse_args()

# Integer class id used by the classifier -> name used for the output file.
labels = {0: 'useless', 1: 'useful'}

if args.classA and args.classB and not args.input:
    # ------------------------------------------------------------ training
    vectorizer = TfidfVectorizer(binary=True)
    print(vectorizer)

    # model_params.conf is a CSV file: one column per SVC parameter, one
    # candidate value per row.
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))
    if not params:
        raise SystemExit("model_params.conf contains no parameter rows")

    model_params = {}
    for name in params[0]:
        if name is None:
            # Surplus cells of an over-long row; not a parameter column.
            continue
        values = []
        for row in params:
            raw = row.get(name)
            if raw is None or not raw.strip():
                # Short rows / empty cells carry no candidate value.
                continue
            try:
                values.append(float(raw))
            except ValueError:
                values.append(raw)
        # Drop duplicate candidates so the grid is not searched twice.
        model_params[name] = list(set(values))

    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
    bodies = [x['body'] for x in abstracs]

    tfidf_model = vectorizer.fit(bodies)
    X = tfidf_model.transform(bodies)
    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
    svd_model = svd.fit(X)
    X = svd_model.transform(X)
    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]

    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of parameter combinations that were actually tried.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    # Make sure the target directory exists before dumping, so the result of
    # the search is not lost to a missing-directory error.
    if not os.path.exists('model'):
        os.makedirs('model')
    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
    joblib.dump(svd_model, 'model/svd_model.pkl')

elif args.input:
    # ---------------------------------------------------------- prediction
    # NOTE: joblib.load unpickles the files; only load models you trust.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.pkl')
    svd = joblib.load('model/svd_model.pkl')
    abstracs = get_abstracts(file_name=args.input, label='unknown')
    X = vectorizer.transform([x['body'] for x in abstracs])
    X = svd.transform(X)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            # '%s' renders an int PMID exactly like '%d' and leaves the id
            # column empty for records that carry no 'pmid' key.
            row = "%s\t%s\n" % (a.get('pmid', ''), a['body'])
            if c == 0:
                f0.write(row)
            elif c == 1:
                f1.write(row)

else:
    parser.error("give --input to classify abstracts, or both --classA and "
                 "--classB to train a model")
1 +#from pdb import set_trace as st
2 +from sklearn.cross_validation import train_test_split as splitt
3 +from sklearn.feature_extraction.text import TfidfVectorizer
4 +from sklearn.decomposition import TruncatedSVD
5 +from sklearn.model_selection import RandomizedSearchCV
6 +from sklearn.model_selection import GridSearchCV
7 +from sklearn import metrics
8 +from sklearn.svm import SVC
9 +import numpy as np
10 +import argparse
11 +import csv
12 +import os
13 +from sklearn.externals import joblib
14 +from time import time
15 +from scipy.stats import randint as sp_randint
16 +from scipy.stats import expon
17 +from sklearn.preprocessing import label_binarize
18 +
19 +
def get_abstracts(file_name, label):
    """Parse a plain-text abstract export into one dict per record.

    A record is closed by a line containing ``'DOI: '``.  The paragraph
    (run of lines between two blank lines) immediately before that line is
    taken as the abstract body; when a line containing ``'Copyright '``,
    ``'Publish'`` or the copyright sign was seen in the record, the
    paragraph one step further back is used instead.  The title is the run
    of non-blank lines (at most five) following the first blank line after
    the most recent line containing ``' doi: '``.

    Args:
        file_name: Path of the text file to parse.
        label: Value stored under ``'topic'`` in every returned record.

    Returns:
        A list of dicts with keys ``'body'``, ``'title'`` and ``'topic'``,
        plus ``'pmid'`` (int) when a ``PMID: `` line follows the ``DOI: ``
        line directly or after a ``PMCID: `` line.  ``'title'`` is ``''``
        when the record has no ``' doi: '`` line to anchor it.  Runs of
        whitespace in body and title are collapsed to single spaces.

    Raises:
        IndexError: If a record has fewer blank lines before its ``DOI: ``
            line than are needed to delimit the abstract paragraph.
        ValueError: If the token after ``PMID:`` is not an integer.
    """
    # Read everything and close the file right away: the parser needs random
    # access to lines before and after the current one.
    with open(file_name) as f:
        lines = f.readlines()

    n_lines = len(lines)
    docs = []
    extract = {}
    empties = []           # indices of blank lines seen since the last record
    title_idx = None       # index of the current record's first title line
    has_copyright = False  # a copyright/publisher line precedes 'DOI: '

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)

        elif ' doi: ' in ln:
            # The title starts right after the next blank line; the search
            # is bounded so it cannot run past the end of the file.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break

        elif ('Copyright ' in ln or 'Publish' in ln
              or u'\N{COPYRIGHT SIGN}' in ln):
            has_copyright = True

        elif 'DOI: ' in ln:
            # The PMID sits on the next line, or one line further down when
            # a PMCID line comes first.
            next_ln = lines[i + 1] if i + 1 < n_lines else ''
            if 'PMCID: ' in next_ln:
                pmid_ln = lines[i + 2] if i + 2 < n_lines else ''
            elif 'PMID: ' in next_ln:
                pmid_ln = next_ln
            else:
                pmid_ln = ''
            pmid_fields = pmid_ln.split()
            if len(pmid_fields) > 1:
                extract['pmid'] = int(pmid_fields[1])

            if has_copyright:
                # Skip the copyright paragraph that follows the abstract.
                get = slice(empties[-3], empties[-2])
                has_copyright = False
            else:
                get = slice(empties[-2], empties[-1])
            extract['body'] = " ".join(" ".join(lines[get]).split())

            title = []
            if title_idx is not None:
                for j in range(title_idx, min(title_idx + 5, n_lines)):
                    if not lines[j].strip():
                        break
                    title.append(lines[j])
            extract['title'] = " ".join(" ".join(title).split())

            extract['topic'] = label
            docs.append(extract)

            # Start the next record from a clean state so that nothing
            # (in particular the title position) leaks across records.
            empties = []
            extract = {}
            title_idx = None

    return docs
70 +
71 +
# Command-line driver: either train the TF-IDF -> SVD -> SVC pipeline from two
# labelled abstract files, or load the saved pipeline and split the abstracts
# of an input file into one output file per predicted class.
parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do "
                "not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                                     "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                                     "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                                  "(default='./filter_output')",
                    default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                                       "(default='./model/svm_model.pkl')",
                    default="model/svm_model.pkl")

args = parser.parse_args()

# Integer class id used by the classifier -> name used for the output file.
labels = {0: 'useless', 1: 'useful'}

if args.classA and args.classB and not args.input:
    # ------------------------------------------------------------ training
    # The vectorizer is only built here: prediction mode loads a fitted one
    # from disk, so constructing it up front would be wasted work.
    vectorizer = TfidfVectorizer(binary=True)
    print(vectorizer)

    # model_params.conf is a CSV file: one column per SVC parameter, one
    # candidate value per row.
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))
    if not params:
        raise SystemExit("model_params.conf contains no parameter rows")

    model_params = {}
    for name in params[0]:
        if name is None:
            # Surplus cells of an over-long row; not a parameter column.
            continue
        values = []
        for row in params:
            raw = row.get(name)
            if raw is None or not raw.strip():
                # Short rows / empty cells carry no candidate value.
                continue
            try:
                values.append(float(raw))
            except ValueError:
                values.append(raw)
        # Drop duplicate candidates so the grid is not searched twice.
        model_params[name] = list(set(values))

    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
    bodies = [x['body'] for x in abstracs]

    tfidf_model = vectorizer.fit(bodies)
    X = tfidf_model.transform(bodies)
    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
    svd_model = svd.fit(X)
    X = svd_model.transform(X)
    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]

    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of parameter combinations that were actually tried.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    # Make sure the target directory exists before dumping, so the result of
    # the search is not lost to a missing-directory error.
    if not os.path.exists('model'):
        os.makedirs('model')
    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
    joblib.dump(svd_model, 'model/svd_model.pkl')

elif args.input:
    # ---------------------------------------------------------- prediction
    # NOTE: joblib.load unpickles the files; only load models you trust.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.pkl')
    svd = joblib.load('model/svd_model.pkl')
    abstracs = get_abstracts(file_name=args.input, label='unknown')
    X = vectorizer.transform([x['body'] for x in abstracs])
    X = svd.transform(X)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            # '%s' renders an int PMID exactly like '%d' and leaves the id
            # column empty for records that carry no 'pmid' key.
            row = "%s\t%s\n" % (a.get('pmid', ''), a['body'])
            if c == 0:
                f0.write(row)
            elif c == 1:
                f1.write(row)

else:
    parser.error("give --input to classify abstracts, or both --classA and "
                 "--classB to train a model")
1 +#from pdb import set_trace as st
2 +from sklearn.cross_validation import train_test_split as splitt
3 +from sklearn.feature_extraction.text import TfidfVectorizer
4 +from sklearn.decomposition import TruncatedSVD
5 +from sklearn.model_selection import RandomizedSearchCV
6 +from sklearn.model_selection import GridSearchCV
7 +from sklearn import metrics
8 +from sklearn.svm import SVC
9 +import numpy as np
10 +import argparse
11 +import csv
12 +import os
13 +from sklearn.externals import joblib
14 +from time import time
15 +from scipy.stats import randint as sp_randint
16 +from scipy.stats import expon
17 +from sklearn.preprocessing import label_binarize
18 +
19 +
def get_abstracts(file_name, label):
    """Parse a plain-text abstract export into one dict per record.

    A record is closed by a line containing ``'DOI: '``.  The paragraph
    (run of lines between two blank lines) immediately before that line is
    taken as the abstract body; when a line containing ``'Copyright '``,
    ``'Publish'`` or the copyright sign was seen in the record, the
    paragraph one step further back is used instead.  The title is the run
    of non-blank lines (at most five) following the first blank line after
    the most recent line containing ``' doi: '``.

    Args:
        file_name: Path of the text file to parse.
        label: Value stored under ``'topic'`` in every returned record.

    Returns:
        A list of dicts with keys ``'body'``, ``'title'`` and ``'topic'``,
        plus ``'pmid'`` (int) when a ``PMID: `` line follows the ``DOI: ``
        line directly or after a ``PMCID: `` line.  ``'title'`` is ``''``
        when the record has no ``' doi: '`` line to anchor it.  Runs of
        whitespace in body and title are collapsed to single spaces.

    Raises:
        IndexError: If a record has fewer blank lines before its ``DOI: ``
            line than are needed to delimit the abstract paragraph.
        ValueError: If the token after ``PMID:`` is not an integer.
    """
    # Read everything and close the file right away: the parser needs random
    # access to lines before and after the current one.
    with open(file_name) as f:
        lines = f.readlines()

    n_lines = len(lines)
    docs = []
    extract = {}
    empties = []           # indices of blank lines seen since the last record
    title_idx = None       # index of the current record's first title line
    has_copyright = False  # a copyright/publisher line precedes 'DOI: '

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)

        elif ' doi: ' in ln:
            # The title starts right after the next blank line; the search
            # is bounded so it cannot run past the end of the file.
            for j in range(i, min(i + 10, n_lines)):
                if not lines[j].strip():
                    title_idx = j + 1
                    break

        elif ('Copyright ' in ln or 'Publish' in ln
              or u'\N{COPYRIGHT SIGN}' in ln):
            has_copyright = True

        elif 'DOI: ' in ln:
            # The PMID sits on the next line, or one line further down when
            # a PMCID line comes first.
            next_ln = lines[i + 1] if i + 1 < n_lines else ''
            if 'PMCID: ' in next_ln:
                pmid_ln = lines[i + 2] if i + 2 < n_lines else ''
            elif 'PMID: ' in next_ln:
                pmid_ln = next_ln
            else:
                pmid_ln = ''
            pmid_fields = pmid_ln.split()
            if len(pmid_fields) > 1:
                extract['pmid'] = int(pmid_fields[1])

            if has_copyright:
                # Skip the copyright paragraph that follows the abstract.
                get = slice(empties[-3], empties[-2])
                has_copyright = False
            else:
                get = slice(empties[-2], empties[-1])
            extract['body'] = " ".join(" ".join(lines[get]).split())

            title = []
            if title_idx is not None:
                for j in range(title_idx, min(title_idx + 5, n_lines)):
                    if not lines[j].strip():
                        break
                    title.append(lines[j])
            extract['title'] = " ".join(" ".join(title).split())

            extract['topic'] = label
            docs.append(extract)

            # Start the next record from a clean state so that nothing
            # (in particular the title position) leaks across records.
            empties = []
            extract = {}
            title_idx = None

    return docs
70 +
71 +
# Command-line driver: either train the TF-IDF -> SVD -> SVC pipeline from two
# labelled abstract files, or load the saved pipeline and split the abstracts
# of an input file into one output file per predicted class.
parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do "
                "not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                                     "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                                     "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                                  "(default='./filter_output')",
                    default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained svc model "
                                       "(default='./model/svm_model.pkl')",
                    default="model/svm_model.pkl")

args = parser.parse_args()

# Integer class id used by the classifier -> name used for the output file.
labels = {0: 'useless', 1: 'useful'}

if args.classA and args.classB and not args.input:
    # ------------------------------------------------------------ training
    # The vectorizer is only built here: prediction mode loads a fitted one
    # from disk, so constructing it up front would be wasted work.
    vectorizer = TfidfVectorizer(binary=True)
    print(vectorizer)

    # model_params.conf is a CSV file: one column per SVC parameter, one
    # candidate value per row.
    with open("model_params.conf") as f0:
        params = list(csv.DictReader(f0))
    if not params:
        raise SystemExit("model_params.conf contains no parameter rows")

    model_params = {}
    for name in params[0]:
        if name is None:
            # Surplus cells of an over-long row; not a parameter column.
            continue
        values = []
        for row in params:
            raw = row.get(name)
            if raw is None or not raw.strip():
                # Short rows / empty cells carry no candidate value.
                continue
            try:
                values.append(float(raw))
            except ValueError:
                values.append(raw)
        # Drop duplicate candidates so the grid is not searched twice.
        model_params[name] = list(set(values))

    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
    bodies = [x['body'] for x in abstracs]

    tfidf_model = vectorizer.fit(bodies)
    X = tfidf_model.transform(bodies)
    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
    svd_model = svd.fit(X)
    X = svd_model.transform(X)
    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]

    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    # Report the number of parameter combinations that were actually tried.
    print("GridSearch took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start),
                                    len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)

    # Make sure the target directory exists before dumping, so the result of
    # the search is not lost to a missing-directory error.
    if not os.path.exists('model'):
        os.makedirs('model')
    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
    joblib.dump(svd_model, 'model/svd_model.pkl')

elif args.input:
    # ---------------------------------------------------------- prediction
    # NOTE: joblib.load unpickles the files; only load models you trust.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.pkl')
    svd = joblib.load('model/svd_model.pkl')
    abstracs = get_abstracts(file_name=args.input, label='unknown')
    X = vectorizer.transform([x['body'] for x in abstracs])
    X = svd.transform(X)
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Writing predictions to output files
    with open(os.path.join(args.out, labels[0] + ".out"), 'w') as f0, \
            open(os.path.join(args.out, labels[1] + ".out"), 'w') as f1:
        for c, a in zip(classes, abstracs):
            # '%s' renders an int PMID exactly like '%d' and leaves the id
            # column empty for records that carry no 'pmid' key.
            row = "%s\t%s\n" % (a.get('pmid', ''), a['body'])
            if c == 0:
                f0.write(row)
            elif c == 1:
                f1.write(row)

else:
    parser.error("give --input to classify abstracts, or both --classA and "
                 "--classB to train a model")
This diff could not be displayed because it is too large.
...@@ -30,15 +30,14 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" ...@@ -30,15 +30,14 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
30 "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl") 30 "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl")
31 31
32 args = parser.parse_args() 32 args = parser.parse_args()
33 +labels = {0: 'useless', 1: 'useful'}
33 34
34 -data=load_files(container_path=args.traind, encoding=None, 35 +if args.traind and not args.input:
36 + data=load_files(container_path=args.traind, encoding=None,
35 decode_error='replace') 37 decode_error='replace')
36 -labels = data.target_names 38 + labels = data.target_names
37 - 39 + vectorizer = TfidfVectorizer(binary=True)
38 -vectorizer = TfidfVectorizer(binary=True) 40 + print(vectorizer)
39 -print(vectorizer)
40 -
41 -if args.train and not args.input:
42 f0 = open("model_params.conf") 41 f0 = open("model_params.conf")
43 n_iter_search = 10 42 n_iter_search = 10
44 params = [p for p in csv.DictReader(f0)] 43 params = [p for p in csv.DictReader(f0)]
...@@ -56,10 +55,9 @@ if args.train and not args.input: ...@@ -56,10 +55,9 @@ if args.train and not args.input:
56 model_params[n].append(d[k]) 55 model_params[n].append(d[k])
57 56
58 model_params = {k: list(set(model_params[k])) for k in model_params} 57 model_params = {k: list(set(model_params[k])) for k in model_params}
59 - papers = data.data
60 58
61 - tfidf_model = vectorizer.fit(papers) 59 + tfidf_model = vectorizer.fit(data.data)
62 - X = vectorizer.transform(papers) 60 + X = vectorizer.transform(data.data)
63 #y = [x['topic'] for x in abstracs] 61 #y = [x['topic'] for x in abstracs]
64 y = data.target 62 y = data.target
65 63
...@@ -87,15 +85,15 @@ if args.train and not args.input: ...@@ -87,15 +85,15 @@ if args.train and not args.input:
87 joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl') 85 joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
88 86
89 else: 87 else:
90 - 88 + from pdb import set_trace as st
91 data=load_files(container_path=args.input, encoding=None, 89 data=load_files(container_path=args.input, encoding=None,
92 decode_error='replace') 90 decode_error='replace')
93 clf = joblib.load(args.svcmodel) 91 clf = joblib.load(args.svcmodel)
94 vectorizer = joblib.load('model/tfidf_model.paper.pkl') 92 vectorizer = joblib.load('model/tfidf_model.paper.pkl')
95 - papers = data.data 93 + X = vectorizer.transform(data.data)
96 - X = vectorizer.transform(papers)
97 - classes = clf.predict(X)
98 94
95 + classes = clf.predict(X)
96 + st()
99 if not os.path.exists(args.out): 97 if not os.path.exists(args.out):
100 os.makedirs(args.out) 98 os.makedirs(args.out)
101 # Writing predictions to output files 99 # Writing predictions to output files
......
No preview for this file type
No preview for this file type
No preview for this file type
No preview for this file type
This diff is collapsed. Click to expand it.
File mode changed