bioNLP-UNAM / useless

Commit 063a626d4f84d03e9389cb5ea7b89f02df7b04b3 (0 parents)
Authored by Ignacio Arroyo, 2018-03-26 00:03:33 -0600

grid search classification

Showing 1 changed file with 175 additions and 0 deletions.

get_abstracts.py  0 → 100644
#from pdb import set_trace as st
from sklearn.cross_validation import train_test_split as splitt
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
from sklearn.grid_search import GridSearchCV
import pandas as pd
from numpy import mean, std

#Classifier = KNeighborsClassifier # 0.6464
#Classifier = NearestCentroid # 0.5054
#Classifier = RandomForestClassifier # 0.49
#Classifier = LinearSVC # 0.5402
#Classifier = SGDClassifier # 0.664
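
# Note: sklearn.cross_validation and sklearn.grid_search are the old module
# paths, deprecated in scikit-learn 0.18 and removed in 0.20, so this script
# needs a pre-0.20 release. On a current scikit-learn the equivalents would
# be the following (the rest of the script would also need the cv_results_
# API; see the note after score_summary below):
#from sklearn.model_selection import train_test_split as splitt
#from sklearn.model_selection import GridSearchCV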

class EstimatorSelectionHelper:

    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s"
                             % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
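
    # The subset check above means every model key needs a matching grid.
    # A hypothetical failing call (not part of this commit):
    #   EstimatorSelectionHelper(models={'SVC': SVC()}, params={})
    # raises ValueError: Some estimators are missing parameters: ['SVC']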

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs
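
    # Note: refit defaults to False, so each GridSearchCV only records CV
    # scores and never refits a best estimator on the full data; the helper
    # selects models but cannot predict. A winner would be refitted by hand,
    # e.g. (the settings here are invented for illustration):
    #   best = SVC(kernel='rbf', C=100, gamma=0.001)
    #   best.fit(X, y)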

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                'estimator': key,
                'min_score': min(scores),
                'max_score': max(scores),
                'mean_score': mean(scores),
                'std_score': std(scores),
            }
            return pd.Series(dict(list(params.items()) + list(d.items())))

        rows = [row(k, gsc.cv_validation_scores, gsc.parameters)
                for k in self.keys
                for gsc in self.grid_searches[k].grid_scores_]
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score',
                   'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]
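
# score_summary() relies on gs.grid_scores_, which only exists on the old
# sklearn.grid_search.GridSearchCV; sklearn.model_selection replaced it with
# the cv_results_ dict. A sketch of rebuilding the same rows from cv_results_
# (assumes cv=3 folds and the row() helper lifted out of score_summary):
#
#def rows_from_cv_results(key, gs, cv=3):
#    res = gs.cv_results_  # one 'split{k}_test_score' array per fold
#    for cand, params in enumerate(res['params']):
#        scores = [res['split%d_test_score' % k][cand] for k in range(cv)]
#        yield row(key, scores, params)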

def get_abstracts(file_name, label):
    f = open(file_name)
    extract = {}
    docs = []
    empties = []
    lines = f.readlines()
    copyright = False
    for i, ln in enumerate(lines):
        if not ln.strip():
            # Remember blank-line indices; they delimit paragraphs.
            empties.append(i)
            continue
        elif ' doi: ' in ln:
            # Citation line; the title starts after the next blank line.
            for j in range(i, i + 10):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue
        elif 'Copyright ' in ln:
            copyright = True
        elif 'DOI: ' in ln:
            # The PMID is on the line after 'DOI: ', or one line further
            # down when a 'PMCID: ' line sits in between.
            if 'PMCID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
            elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 1].strip().split()[1])

            # The abstract is the last paragraph before this field block; a
            # copyright paragraph pushes it one paragraph further back.
            if copyright:
                get = slice(empties[-3], empties[-2])
                copyright = False
            else:
                get = slice(empties[-2], empties[-1])

            extract['body'] = " ".join(lines[get]).replace("\n", ' ').replace("  ", ' ')
            title = []
            for j in range(title_idx, title_idx + 5):
                if lines[j].strip():
                    title.append(lines[j])
                else:
                    break
            extract['title'] = " ".join(title).replace("\n", ' ').replace("  ", ' ')
            extract['topic'] = label
            docs.append(extract)
            empties = []
            extract = {}

    return docs
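
# get_abstracts() assumes a MEDLINE-style plain-text export: a citation line
# containing ' doi: ', the title as the first paragraph after it, the
# abstract as the last blank-line-delimited paragraph before the 'DOI: '
# field block (one paragraph earlier when a Copyright line intervenes), and
# the PMID on the first or second line after 'DOI: '. An invented fragment
# of the layout the parser expects:
#
#   1. J Bacteriol. 2017 Mar;199(5). doi: 10.1128/JB.00001-17.
#
#   Title of the abstract, possibly
#   wrapped over a few lines.
#
#   Abstract paragraph ...
#
#   DOI: 10.1128/JB.00001-17
#   PMID: 12345678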

filename = "../data/ecoli_abstracts/not_useful_abstracts.txt"
labels = ['useless', 'useful']

abstracs = get_abstracts(file_name=filename, label=labels[0])

filename = "../data/ecoli_abstracts/useful_abstracts.txt"
abstracs += get_abstracts(file_name=filename, label=labels[1])

X = [x['body'] for x in abstracs]
y = [1 if x['topic'] == 'useful' else 0 for x in abstracs]

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {'n_estimators': [16, 32],
                                   'learning_rate': [0.8, 1.0]},
    'SVC': [
        #{'kernel': ['linear'], 'C': [1, 10, 100, 150, 200, 300, 400]},
        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 400],
         'gamma': [0.001, 0.0001]},
        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 400],
         'degree': [2, 3, 4, 5, 6]},
        {'kernel': ['sigmoid'], 'C': [1, 10, 100, 150, 200, 300, 400],
         'gamma': [0.001, 0.0001]},
    ]
}
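
# Because the 'SVC' entry is a list of dicts, GridSearchCV searches each
# sub-grid independently: rbf 7*2 = 14, poly 7*5 = 35, sigmoid 7*2 = 14,
# i.e. 63 SVC parameter settings, and 189 fits at cv=3 for SVC alone.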

clf = EstimatorSelectionHelper(models1, params1)

vectorizer = TfidfVectorizer(binary=True)
#ngram_range=(1, 3)
#)
#vectorizer = HashingVectorizer(non_negative=True)
print(vectorizer)
#svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)

X = vectorizer.fit_transform(X)
#X = svd.fit_transform(X)
#X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
#from sklearn.feature_selection import chi2, SelectKBest
#ch2 = SelectKBest(chi2, k=200)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)

#clf = MultinomialNB(alpha=.01)
#clf = Classifier(n_jobs=-1, n_iter=100)
#st()

clf.fit(X, y, scoring='f1', n_jobs=-1)
#pred = clf.predict(X_test)
#print(metrics.f1_score(y_test, pred, average='macro'))

print(clf.score_summary(sort_by='min_score'))
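
# Caveat: the TfidfVectorizer above is fitted on the whole corpus before
# cross-validation, so document-frequency statistics are shared across CV
# folds. A common alternative (a sketch of a different setup, not what this
# commit does) is to cross-validate the vectorizer too via a Pipeline:
#
#from sklearn.pipeline import Pipeline
#pipe = Pipeline([('tfidf', TfidfVectorizer(binary=True)), ('svc', SVC())])
#gs = GridSearchCV(pipe,
#                  {'svc__kernel': ['rbf'], 'svc__C': [1, 10, 100],
#                   'svc__gamma': [0.001, 0.0001]},
#                  cv=3, scoring='f1')
#gs.fit([x['body'] for x in abstracs], y)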