bioNLP-UNAM / useless
Authored by iarroyof, 2018-03-30 19:20:33 -0600
Commit d4ced3342229375e6708bb982d8353261895632a (d4ced334)
1 parent: cbc7767d

modified readme
Showing 2 changed files with 111 additions and 3 deletions:

- README.md
- filter_papers.py
README.md (view file @ d4ced33)

````diff
@@ -18,15 +18,15 @@ The main method follows the next pipeline:
 # Usage
-For filtering unknown anstracts run
+For filtering unknown abstracts run
 ```bash
 $ python filter_abstracts.py --input data/ecoli_abstracts/useful_abstracts.txt
 ```
 The predictions will be stored by default at `filter_output/`, unless a different directory is specified by means of the `--out` option. The default files containing the predictions are
-- filter_output/useful.out
-- filter_output/useless.out
+- `filter_output/useful.out`
+- `filter_output/useless.out`
 The format of each file is:
 ...
````
filter_papers.py (new file: 0 → 100644; view file @ d4ced33)
```python
# from pdb import set_trace as st
# NOTE: sklearn.cross_validation is deprecated; train_test_split lives in
# sklearn.model_selection (it is only used in a commented-out line below).
from sklearn.model_selection import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVC
import numpy as np
import argparse
import csv
import os
from sklearn.externals import joblib
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import expon
from sklearn.preprocessing import label_binarize
from sklearn.datasets import load_files

parser = argparse.ArgumentParser(description="This script separates biomedical papers that "
                                             "report data from biomedical experiments "
                                             "from those that do not.")
parser.add_argument("--input", help="Input directory containing the papers "
                                    "to be predicted.")
parser.add_argument("--traind", help="Input directory containing the papers of "
                                     "two classes to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                                  "(default='./filter_output').",
                    default="filter_output")
parser.add_argument("--svcmodel", help="Path to custom pretrained SVC model "
                                       "(default='./model/svm_model.paper.pkl').",
                    default="model/svm_model.paper.pkl")
args = parser.parse_args()

# The training directory is loaded in both modes: its subdirectory names
# provide the class labels that name the output files.
data = load_files(container_path=args.traind, encoding='utf-8',
                  decode_error='replace')  # decode to str (decode_error is a no-op with encoding=None)
labels = data.target_names
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.traind and not args.input:  # training mode (was `args.train`, which the parser never defines)
    # Read the hyperparameter grid from a CSV file whose header row holds the
    # SVC parameter names and whose rows hold candidate values.
    f0 = open("model_params.conf")
    n_iter_search = 10
    params = [p for p in csv.DictReader(f0)]
    f0.close()
    names = list(params[0].keys())
    model_params = {n: [] for n in names}
    for n in names:
        for d in params:
            for k in d:
                if k == n:
                    try:
                        model_params[n].append(float(d[k]))
                    except ValueError:
                        model_params[n].append(d[k])
    # Deduplicate the candidate values of each parameter.
    model_params = {k: list(set(model_params[k])) for k in model_params}

    papers = data.data
    tfidf_model = vectorizer.fit(papers)
    X = vectorizer.transform(papers)
    # y = [x['topic'] for x in abstracts]
    y = data.target
    # X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
    clf = SVC()  # kernel='linear', C=100.0, gamma=0.0001)  # degree=11, coef0=0.9)
    clf = GridSearchCV(clf, cv=3, param_grid=model_params,
                       # clf = RandomizedSearchCV(clf, param_distributions=model_params,
                       #                          cv=5, n_iter=n_iter_search,
                       n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)
    # clf.fit(X_train, y_train)
    # (The original printed n_iter_search, which only applies to RandomizedSearchCV.)
    print("GridSearch took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(clf.cv_results_['params'])))
    print(clf.best_estimator_)
    print()
    print(clf.best_score_)
    # print(metrics.f1_score(clf.predict(X_test), y_test))
    # joblib.dump(clf, 'model/svm_model.pkl')
    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
else:  # prediction mode
    data = load_files(container_path=args.input, encoding='utf-8',
                      decode_error='replace')
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
    papers = data.data
    X = vectorizer.transform(papers)
    classes = clf.predict(X)
    if not os.path.exists(args.out):
        os.makedirs(args.out)
    # Write each paper into the output file named after its predicted class.
    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
        for c, a in zip(classes, papers):
            # load_files yields plain document strings, so write the text
            # directly (the original's `a['title']`/`a['body']` indexing
            # fails on strings).
            if c == 0:
                f0.write("%s\n" % a)
            elif c == 1:
                f1.write("%s\n" % a)
```
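The training branch expects a `model_params.conf` file that is not part of this commit. From the way the script parses it (`csv.DictReader`, collecting each column into a list of candidate values for `GridSearchCV`), a plausible minimal sketch is the following; the parameter names and values here are assumptions for illustration, and any keyword accepted by `SVC` should work:

```
kernel,C,gamma
linear,1.0,0.001
rbf,10.0,0.0001
```

This would produce a grid equivalent to `{'kernel': ['linear', 'rbf'], 'C': [1.0, 10.0], 'gamma': [0.001, 0.0001]}` (up to ordering, since the values are deduplicated through a set). For reference, a hedged usage sketch of both modes; the directory layout is hypothetical apart from the `data/ecoli_abstracts` path mentioned in the README, and note that `--traind` is needed even at prediction time because the class labels that name the output files come from its subdirectories:

```bash
# Training mode: grid-search an SVC on the labeled corpus and save the best model
$ python filter_papers.py --traind data/ecoli_abstracts

# Prediction mode: classify unseen papers and write one output file per class
$ python filter_papers.py --traind data/ecoli_abstracts --input data/unlabeled_papers --out filter_output
```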