Toggle navigation
Toggle navigation
This project
Loading...
Sign in
Carlos-Francisco Méndez-Cruz
/
lcg-bioinfoI-bionlp
Go to a project
Toggle navigation
Toggle navigation pinning
Projects
Groups
Snippets
Help
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Snippets
Network
Create a new issue
Builds
Commits
Issue Boards
Authored by
Carlos-Francisco Méndez-Cruz
2018-09-20 00:07:37 -0500
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Commit
846d51cce483177c396a08163d183338e6ca0017
846d51cc
1 parent
2d7bab8b
Training and testing binding thrombin dataset
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
74 additions
and
58 deletions
clasificacion-automatica/binding-thrombin-dataset/training-validation-binding-thrombin.py
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
clasificacion-automatica/binding-thrombin-dataset/training-validation-binding-thrombin.py
View file @
846d51c
...
...
@@ -8,9 +8,8 @@ from sklearn.svm import SVC
from
sklearn.neighbors
import
NearestCentroid
from
sklearn.metrics
import
accuracy_score
,
precision_score
,
recall_score
,
f1_score
,
confusion_matrix
,
\
classification_report
import
sys
from
sklearn.externals
import
joblib
from
scipy.sparse
import
csr_matrix
import
numpy
as
np
__author__
=
'CMendezC'
...
...
@@ -26,6 +25,7 @@ __author__ = 'CMendezC'
# 7) --outputReportPath Path to place evaluation report.
# 8) --outputReportFile File to place evaluation report.
# 9) --classifier Classifier: BernoulliNB, SVM, NearestCentroid.
# 10) --saveData Save matrices
# Output:
# 1) Classification model and evaluation report.
...
...
@@ -42,9 +42,10 @@ __author__ = 'CMendezC'
# --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports
# --outputReportFile SVM.txt
# --classifier SVM
# --saveData
# source activate python3
# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM
# python training-validation-binding-thrombin.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset --inputTrainingData thrombin.data --inputTestingData Thrombin.testset --inputTestingClasses Thrombin.testset.class --outputModelPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/models --outputModelFile SVM-model.mod --outputReportPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/clasificacion-automatica/binding-thrombin-dataset/reports --outputReportFile SVM.txt --classifier SVM
--saveData
###########################################################
# MAIN PROGRAM #
...
...
@@ -72,6 +73,8 @@ if __name__ == "__main__":
parser
.
add_argument
(
"--classifier"
,
dest
=
"classifier"
,
help
=
"Classifier"
,
metavar
=
"NAME"
,
choices
=
(
'BernoulliNB'
,
'SVM'
,
'NearestCentroid'
),
default
=
'SVM'
)
parser
.
add_argument
(
"--saveData"
,
dest
=
"saveData"
,
action
=
'store_true'
,
help
=
"Save matrices"
)
args
=
parser
.
parse_args
()
...
...
@@ -86,48 +89,63 @@ if __name__ == "__main__":
print
(
"Path to place evaluation report: "
+
str
(
args
.
outputReportPath
))
print
(
"File to place evaluation report: "
+
str
(
args
.
outputReportFile
))
print
(
"Classifier: "
+
str
(
args
.
classifier
))
print
(
"Save matrices: "
+
str
(
args
.
saveData
))
# Start time
t0
=
time
()
print
(
" Reading training data and true classes..."
)
trainingClasses
=
[]
trainingData
=
[]
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputTrainingData
),
encoding
=
'utf8'
,
mode
=
'r'
)
\
as
iFile
:
for
line
in
iFile
:
line
=
line
.
strip
(
'
\r\n
'
)
listLine
=
line
.
split
(
','
)
trainingClasses
.
append
(
listLine
[
0
])
trainingData
.
append
(
listLine
[
1
:])
# trainingMatrix = np.matrix(trainingData)
trainingMatrix
=
csr_matrix
(
trainingData
,
dtype
=
'double'
)
print
(
"Number of training classes: {}"
.
format
(
len
(
trainingClasses
)))
print
(
"Number of training class A: {}"
.
format
(
trainingClasses
.
count
(
'A'
)))
print
(
"Number of training class I: {}"
.
format
(
trainingClasses
.
count
(
'I'
)))
print
(
"Shape of training matrix: {}"
.
format
(
trainingMatrix
.
shape
))
print
(
" Reading testing data and true classes..."
)
testingClasses
=
[]
testingData
=
[]
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputTestingData
),
encoding
=
'utf8'
,
mode
=
'r'
)
\
as
iFile
:
for
line
in
iFile
:
line
=
line
.
strip
(
'
\r\n
'
)
listLine
=
line
.
split
(
','
)
testingData
.
append
(
listLine
[
1
:])
testingMatrix
=
csr_matrix
(
testingData
,
dtype
=
'double'
)
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputTestingClasses
),
encoding
=
'utf8'
,
mode
=
'r'
)
\
as
iFile
:
for
line
in
iFile
:
line
=
line
.
strip
(
'
\r\n
'
)
testingClasses
.
append
(
line
)
print
(
"Number of testing classes: {}"
.
format
(
len
(
testingClasses
)))
print
(
"Number of testing class A: {}"
.
format
(
trainingClasses
.
count
(
'A'
)))
print
(
"Number of testing class I: {}"
.
format
(
trainingClasses
.
count
(
'I'
)))
print
(
"Shape of testing matrix: {}"
.
format
(
testingMatrix
.
shape
))
print
(
"Reading training data and true classes..."
)
X_train
=
None
if
args
.
saveData
:
y_train
=
[]
trainingData
=
[]
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputTrainingData
),
encoding
=
'utf8'
,
mode
=
'r'
)
\
as
iFile
:
for
line
in
iFile
:
line
=
line
.
strip
(
'
\r\n
'
)
listLine
=
line
.
split
(
','
)
y_train
.
append
(
listLine
[
0
])
trainingData
.
append
(
listLine
[
1
:])
# X_train = np.matrix(trainingData)
X_train
=
csr_matrix
(
trainingData
,
dtype
=
'double'
)
joblib
.
dump
(
X_train
,
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTrainingData
+
'.jlb'
))
joblib
.
dump
(
y_train
,
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTrainingData
+
'.class.jlb'
))
else
:
X_train
=
joblib
.
load
(
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTrainingData
+
'.jlb'
))
y_train
=
joblib
.
load
(
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTrainingData
+
'.class.jlb'
))
print
(
" Number of training classes: {}"
.
format
(
len
(
y_train
)))
print
(
" Number of training class A: {}"
.
format
(
y_train
.
count
(
'A'
)))
print
(
" Number of training class I: {}"
.
format
(
y_train
.
count
(
'I'
)))
print
(
" Shape of training matrix: {}"
.
format
(
X_train
.
shape
))
print
(
"Reading testing data and true classes..."
)
X_test
=
None
if
args
.
saveData
:
y_test
=
[]
testingData
=
[]
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputTestingData
),
encoding
=
'utf8'
,
mode
=
'r'
)
\
as
iFile
:
for
line
in
iFile
:
line
=
line
.
strip
(
'
\r\n
'
)
listLine
=
line
.
split
(
','
)
testingData
.
append
(
listLine
[
1
:])
X_test
=
csr_matrix
(
testingData
,
dtype
=
'double'
)
with
open
(
os
.
path
.
join
(
args
.
inputPath
,
args
.
inputTestingClasses
),
encoding
=
'utf8'
,
mode
=
'r'
)
\
as
iFile
:
for
line
in
iFile
:
line
=
line
.
strip
(
'
\r\n
'
)
y_test
.
append
(
line
)
joblib
.
dump
(
X_test
,
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTestingData
+
'.jlb'
))
joblib
.
dump
(
y_test
,
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTestingClasses
+
'.class.jlb'
))
else
:
X_test
=
joblib
.
load
(
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTestingData
+
'.jlb'
))
y_test
=
joblib
.
load
(
os
.
path
.
join
(
args
.
outputModelPath
,
args
.
inputTestingClasses
+
'.class.jlb'
))
print
(
" Number of testing classes: {}"
.
format
(
len
(
y_test
)))
print
(
" Number of testing class A: {}"
.
format
(
y_test
.
count
(
'A'
)))
print
(
" Number of testing class I: {}"
.
format
(
y_test
.
count
(
'I'
)))
print
(
" Shape of testing matrix: {}"
.
format
(
X_test
.
shape
))
if
args
.
classifier
==
"MultinomialNB"
:
classifier
=
BernoulliNB
()
...
...
@@ -136,26 +154,26 @@ if __name__ == "__main__":
elif
args
.
classifier
==
"NearestCentroid"
:
classifier
=
NearestCentroid
()
print
(
"
Training..."
)
classifier
.
fit
(
trainingMatrix
,
trainingClasses
)
print
(
"
Done!"
)
print
(
"Training..."
)
classifier
.
fit
(
X_train
,
y_train
)
print
(
" Done!"
)
print
(
"
Testing (prediction in new data)..."
)
y_pred
=
classifier
.
predict
(
testingMatrix
)
print
(
"
Done!"
)
print
(
"Testing (prediction in new data)..."
)
y_pred
=
classifier
.
predict
(
X_test
)
print
(
" Done!"
)
print
(
"
Saving report..."
)
with
open
(
os
.
path
.
join
(
args
.
output
Path
,
args
.
outpu
tFile
),
mode
=
'w'
,
encoding
=
'utf8'
)
as
oFile
:
print
(
"Saving report..."
)
with
open
(
os
.
path
.
join
(
args
.
output
ReportPath
,
args
.
outputRepor
tFile
),
mode
=
'w'
,
encoding
=
'utf8'
)
as
oFile
:
oFile
.
write
(
'********** EVALUATION REPORT **********
\n
'
)
oFile
.
write
(
'Classifier: {}
\n
'
.
format
(
args
.
classifier
))
oFile
.
write
(
'Accuracy: {}
\n
'
.
format
(
accuracy_score
(
testingClasses
,
y_pred
)))
oFile
.
write
(
'Precision: {}
\n
'
.
format
(
precision_score
(
testingClasses
,
y_pred
,
average
=
'weighted'
)))
oFile
.
write
(
'Recall: {}
\n
'
.
format
(
recall_score
(
testingClasses
,
y_pred
,
average
=
'weighted'
)))
oFile
.
write
(
'F-score: {}
\n
'
.
format
(
f1_score
(
testingClasses
,
y_pred
,
average
=
'weighted'
)))
oFile
.
write
(
'Accuracy: {}
\n
'
.
format
(
accuracy_score
(
y_test
,
y_pred
)))
oFile
.
write
(
'Precision: {}
\n
'
.
format
(
precision_score
(
y_test
,
y_pred
,
average
=
'weighted'
)))
oFile
.
write
(
'Recall: {}
\n
'
.
format
(
recall_score
(
y_test
,
y_pred
,
average
=
'weighted'
)))
oFile
.
write
(
'F-score: {}
\n
'
.
format
(
f1_score
(
y_test
,
y_pred
,
average
=
'weighted'
)))
oFile
.
write
(
'Confusion matrix:
\n
'
)
oFile
.
write
(
str
(
confusion_matrix
(
testingClasses
,
y_pred
))
+
'
\n
'
)
oFile
.
write
(
str
(
confusion_matrix
(
y_test
,
y_pred
))
+
'
\n
'
)
oFile
.
write
(
'Classification report:
\n
'
)
oFile
.
write
(
classification_report
(
testingClasses
,
y_pred
)
+
'
\n
'
)
print
(
"
Done!"
)
oFile
.
write
(
classification_report
(
y_test
,
y_pred
)
+
'
\n
'
)
print
(
" Done!"
)
print
(
"Training and testing done in:
%
fs"
%
(
time
()
-
t0
))
...
...
representaciones-vectoriales/extraccion-caracteristicas-vectorizacion.py
View file @
846d51c
...
...
@@ -2,9 +2,7 @@
import
os
from
time
import
time
# from optparse import OptionParser
import
argparse
import
sys
from
sklearn.feature_extraction.text
import
TfidfVectorizer
,
CountVectorizer
from
scipy.sparse
import
csr_matrix
from
sklearn.metrics.pairwise
import
cosine_similarity
...
...
Please
register
or
login
to post a comment