bioNLP-UNAM / useless
Commit 6d3377a94bb01f2ab9f9257cc94ca5223deeb4a8 (1 parent: 27818a9a)
Authored by Carlos-Francisco Méndez-Cruz, 2018-04-03 20:21:45 -0500

Final version binClass for papers
Showing 1 changed file with 34 additions and 12 deletions

filter_papers.py (view file @ 6d3377a)
@@ -3,7 +3,8 @@ from sklearn.cross_validation import train_test_split as splitt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import GridSearchCV
-from sklearn import metrics
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import recall_score, precision_score, f1_score, classification_report
 from sklearn.svm import SVC
 import numpy as np
 import argparse
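Note that the hunk context still shows "from sklearn.cross_validation import train_test_split as splitt"; that module was deprecated in scikit-learn 0.18 and removed in 0.20, and sklearn.model_selection is its replacement. A minimal compatibility sketch (not part of the commit) if the script has to run on both old and new scikit-learn:

# Hypothetical shim: prefer the modern module, fall back to the pre-0.18 name.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split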
@@ -16,18 +17,23 @@ from scipy.stats import expon
 from sklearn.preprocessing import label_binarize
+from sklearn.datasets import load_files
+# CMC: Run example
+# python3.4 filter_papers.py --traind /home/cmendezc/gitlab_repositories/useless/data/TEXT_FILES
 parser = argparse.ArgumentParser(description="This script separates biomedical papers that" "report data from biomedical experiments from those that do not.")
-parser.add_argument("--input", help="Input file containing the to" "be predited.")
+parser.add_argument("--input", help="Input directory containing the papers to" "be predited.")
+parser.add_argument("--traind", help="Input directory containing the papers of" "two classes to be learned.")
 parser.add_argument("--out", help="Path to the output directory " "(default='./filter_output')", default="filter_output")
-parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl")
+parser.add_argument("--svcmodel", help="Path to custom pretrained svc model" "(default='./model_binClass/svm_model.paper.pkl')", default="model_binClass/svm_model.paper.pkl")
+parser.add_argument("--split", default=False, action="store_true", dest="split", help="Automatic split training/test of input data ")
 args = parser.parse_args()
 labels = {0: 'useless', 1: 'useful'}
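The new --traind option is consumed by sklearn.datasets.load_files, which expects one subdirectory per class under the given path, with the folder names serving as class labels. A minimal sketch of the expected layout (the TEXT_FILES path comes from the run example above; the file names are illustrative):

# TEXT_FILES/
#   useless/   paper_001.txt, paper_002.txt, ...
#   useful/    paper_101.txt, paper_102.txt, ...
from sklearn.datasets import load_files

data = load_files(container_path="TEXT_FILES", encoding="utf-8", decode_error="replace")
print(data.target_names)   # folder names, sorted alphabetically
print(len(data.data))      # raw document texts
print(data.target[:5])     # integer class ids, indices into target_names

One caveat: load_files assigns integer targets in alphabetical order of the folder names, so folders named "useful" and "useless" would get ids 0 and 1 respectively, the reverse of the script's labels = {0: 'useless', 1: 'useful'} mapping.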
@@ -56,11 +62,20 @@ if args.traind and not args.input:
     model_params = {k: list(set(model_params[k])) for k in model_params}
+    # CMC: separate in training - validation datasets
+    if args.split:
+        X_train, X_test, y_train, y_test = train_test_split(data.data, labels, test_size=0.25, random_state=42)
+        tfidf_model = vectorizer.fit(X_train)
+        X = vectorizer.transform(X_train)
+        y = y_train
+    else:
+        #y = [x['topic'] for x in abstracs]
+        # Original Nacho:
         tfidf_model = vectorizer.fit(data.data)
         X = vectorizer.transform(data.data)
         #y = [x['topic'] for x in abstracs]
         y = data.target
     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
     clf = SVC() #kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
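In the new split branch, train_test_split is called with labels, the module-level {0: 'useless', 1: 'useful'} name dict, rather than with the per-document targets; since the dict has two entries and data.data has one entry per paper, scikit-learn raises "Found input variables with inconsistent numbers of samples". A sketch of the presumably intended call (a hypothetical fix, not what the commit does), which also keeps the TF-IDF fit restricted to the training split:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hypothetical fix: split the documents together with their integer targets.
X_train, X_test, y_train, y_test = train_test_split(
    data.data, data.target, test_size=0.25, random_state=42)

# Fitting TF-IDF on the training split only keeps test-set statistics
# out of the vocabulary and the idf weights.
vectorizer = TfidfVectorizer()
tfidf_model = vectorizer.fit(X_train)
X = tfidf_model.transform(X_train)
y = y_train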
@@ -81,15 +96,22 @@ if args.traind and not args.input:
     #print(metrics.f1_score(clf.predict(X_test), y_test))
     #joblib.dump(clf, 'model/svm_model.pkl')
-    joblib.dump(clf.best_estimator_, 'model/svm_model.paper.pkl')
-    joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
+    joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.paper.pkl')
+    joblib.dump(tfidf_model, 'model_binClass/tfidf_model.paper.pkl')
+    if args.split:
+        X = vectorizer.transform(X_test)
+        y_pred = clf.predict(X)
+        print(precision_score(y_test, y_pred))
+        print(recall_score(y_test, y_pred))
+        print(f1_score(y_test, y_pred))
+        print(classification_report(y_test, y_pred))
 else:
     from pdb import set_trace as st
     data = load_files(container_path=args.input, encoding=None, decode_error='replace')
     clf = joblib.load(args.svcmodel)
-    vectorizer = joblib.load('model/tfidf_model.paper.pkl')
+    vectorizer = joblib.load('model_binClass/tfidf_model.paper.pkl')
     X = vectorizer.transform(data.data)
     classes = clf.predict(X)
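The renamed model_binClass/ directory now holds two pickles that only make sense as a pair: the best SVC found by the search and the TF-IDF vectorizer it was trained on. Note that --svcmodel lets the user point at a different classifier while the vectorizer path stays hard-coded, so a custom model must still match the default vectorizer's feature space. A round-trip sketch (the joblib import is not shown in the diff; 2018-era scikit-learn bundled it as sklearn.externals.joblib, today it is the standalone joblib package):

import os
import joblib  # assumption; the script may use `from sklearn.externals import joblib`

# Save the classifier and its vectorizer together after training...
os.makedirs("model_binClass", exist_ok=True)
joblib.dump(clf.best_estimator_, "model_binClass/svm_model.paper.pkl")
joblib.dump(tfidf_model, "model_binClass/tfidf_model.paper.pkl")

# ...and restore both at prediction time, transforming with the SAME vectorizer.
svc = joblib.load("model_binClass/svm_model.paper.pkl")
vec = joblib.load("model_binClass/tfidf_model.paper.pkl")
classes = svc.predict(vec.transform(data.data))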
@@ -97,10 +119,10 @@ else:
     if not os.path.exists(args.out):
         os.makedirs(args.out)
     # Writing predictions to output files
-    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
-         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
-        for c, a in zip(classes, papers):
+    with open(args.out + "/" + labels[0] + "-binClass-paper.out", 'w') as f0, \
+         open(args.out + "/" + labels[1] + "-binClass-paper.out", 'w') as f1:
+        for c, a in zip(classes, data):
             if c == 0:
-                f0.write("%d\t%s\n" % (a['title'], a['body']))
+                f0.write("%d\n" % (a['title']))
             elif c == 1:
-                f1.write("%d\t%s\n" % (a['title'], a['body']))
+                f1.write("%d\n" % (a['title']))
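The rewritten output loop still looks like a leftover from an earlier abstracts-based version: zip(classes, data) iterates over the keys of the load_files Bunch ('data', 'target', ...), not over documents, so a['title'] and the "%d" format both fail on strings. A sketch of writing one file name per predicted class instead (hypothetical rewrite, not what the commit does), using the filenames array that load_files records:

with open(args.out + "/" + labels[0] + "-binClass-paper.out", 'w') as f0, \
     open(args.out + "/" + labels[1] + "-binClass-paper.out", 'w') as f1:
    # data.filenames[i] is the path of the i-th document, aligned with classes[i]
    for c, fname in zip(classes, data.filenames):
        (f0 if c == 0 else f1).write("%s\n" % fname)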