Carlos-Francisco Méndez-Cruz / conditional-random-fields
Authored by Carlos-Francisco Méndez-Cruz, 2018-03-08 01:20:04 -0600
Commit e9aa86d0384ab69bee3d127a384cbe30a1db3c81 (1 parent 919edf2e)
Obtaining training and test data sets
Showing 4 changed files with 610 additions and 212 deletions
prepare-training-test.py
preparing-training-validation-test.py
training-validation-v1.py
training-validation.py
prepare-training-test.py 0 → 100644
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time

__author__ = 'CMendezC'

# Objective: Join transformed files for obtaining training and test data sets
# Parameters:
#   1) --inputPath      Path to read files.
#   2) --trainingFile   File name for training data.
#   3) --testFile       File name for test data.
#   4) --outputPath     Path to write files.
# Output:
#   1) Files created.
# Execution:
# python prepare-training-test.py
#   --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
#   --trainingFile training-data-set-70.txt
#   --testFile test-data-set-30.txt
#   --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# python prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File for training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File for test examples", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.",
                      metavar="PATH")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File for training examples", str(options.trainingFile))
    print("File for test examples", str(options.testFile))
    print("Path to write output files: " + str(options.outputPath))

    t0 = time()
    trainingDataset = []
    testDataset = []
    counter = 1
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            if counter <= 70:
                print("   Joining file {} to training data set".format(file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        trainingDataset.append(line)
            if counter > 70 and counter <= 100:
                print("   Joining file {} to test data set".format(file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        testDataset.append(line)
            # Move to the next file: the first 70 files feed training, the remaining files feed test
            counter += 1

    # Write the joined lines to the training and test output files
    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in trainingDataset:
            oFile.write("{}\n".format(line))
    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in testDataset:
            oFile.write("{}\n".format(line))
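A quick way to sanity-check the split produced by prepare-training-test.py is to count the joined lines in each output file. The snippet below is a minimal sketch, not part of the commit; the directory and file names are taken from the example invocation above and are assumptions.

# Hypothetical sanity check after running prepare-training-test.py with the
# example arguments shown above (paths and file names are assumed).
import os

dataPath = "/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets"
for name in ("training-data-set-70.txt", "test-data-set-30.txt"):
    with open(os.path.join(dataPath, name), encoding="utf-8") as f:
        # Each line is one transformed sentence joined into the data set
        print(name, sum(1 for _ in f), "lines")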
preparing-training-validation-test.py deleted 100644 → 0
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time
import json
from nltk.corpus import stopwords

__author__ = 'CMendezC'

# Objective: Take transformed file with format word|lemma|tag,
#   for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
#   and create a file with an additional tagging for CRF training. For example:
#   the|the|dt N-terminal|N-terminal|NN domain|domain|NN -->
#   the|the|dt|O N-terminal|N-terminal|DPOS domain|NN|O
# Additionally, we aTag frequent words belonging to each aspect with the corresponding aspect aTag (DOM or RP).
# We expect these words to have one aTag in some contexts and a different one in others.
# The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here.
# In the output file we only keep the lemma and the tag, or the word and the tag.
# This additional tagging gives us clues for aspect classification.
# Parameters:
#   1) --inputPath          Path to read files.
#   2) --trainingFile       File name with training data.
#   3) --testFile           File name with test data.
#   4) --outputPath         Path to write files. File names are concatenated with feature name.
#   5) ELIMINATED --feature Type of feature to extract and create file: lemma
#   6) --termPath           Path to read term files.
#   7) --termFiles          JSON file with term files and tags.
#   8) --termPath           Path to read JSON file with information about frequent words files.
#   9) --inputFileFreq      JSON file with information about frequent words.
#  10) --skip=N             Skip N words to form skip mentions.
#  11) --filterStopWords    Filtering stop words.
#  12) --filterPunctMarks   Filtering punctuation marks.
# Output:
#   1) Files created. Name of feature is concatenated.
# Execution:
# ASPECTS
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# SENTENCES
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
# ¿? --SKIP

def getSkipMentions(aList, aSkip):
    hashTemp = {}
    for j in range(0, aSkip):
        listTemp = []
        for i in range(0, len(aList), aSkip + 1):
            listTemp.append(aList[i + j])
        hashTemp[j] = listTemp
    return hashTemp

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test examples", metavar="FILE")
    parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
                      help="File with training classes", metavar="FILE")
    parser.add_option("--testClassesFile", dest="testClassesFile",
                      help="File with test classes", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.",
                      metavar="PATH")
    parser.add_option("--termPath", dest="termPath",
                      help="Path to read term files", metavar="PATH")
    parser.add_option("--termFiles", dest="termFiles",
                      help="JSON file with terms files and tags", metavar="PATH")
    parser.add_option("--inputFileFreq", dest="inputFileFreq",
                      help="JSON file with information about frequent words", metavar="PATH")
    parser.add_option("--skip", type="int", dest="skip", default=0,
                      help="Skip mentions", metavar="N")
    parser.add_option("--filterStopWords", default=False, action="store_true",
                      dest="filterStopWords", help="Filtering stop words")
    parser.add_option("--filterPunctMarks", default=False, action="store_true",
                      dest="filterPunctMarks", help="Filtering punctuation marks")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File with training examples", str(options.trainingFile))
    print("File with test examples", str(options.testFile))
    print("File with training classes", str(options.trainingClassesFile))
    print("File with test classes", str(options.testClassesFile))
    print("Path to write output files: " + str(options.outputPath))
    print("JSON file with information about frequent words: " + str(options.inputFileFreq))
    print("Skip mentions: " + str(options.skip))
    print("Filtering stop words: " + str(options.filterStopWords))
    punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
    print("Filtering punctuation marks " + str(punctMarks) + ': ' + str(options.filterPunctMarks))

    filesRead = 0
    t0 = time()

    print('Loading biological term files...')
    with open(os.path.join(options.termPath, options.termFiles)) as data_file:
        hashes = json.load(data_file)
    print('   Loading biological term files... done')
    hashTagAspect = hashes["hashTagAspect"]

    print('Loading frequent words...')
    with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
        hashAspectFreqWords = json.load(data_file)
    print('   Loading frequent words... done')

    listFiles = [options.trainingFile, options.testFile]
    listClassesFiles = [options.trainingClassesFile, options.testClassesFile]
    for iFile, cFile in zip(listFiles, listClassesFiles):
        with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
            print("Reading file..." + iFile)
            lines = [l.strip('\n') for l in tFile.readlines()]
        filesRead += 1
        with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
            print("Reading file..." + cFile)
            classes = [c.strip('\n') for c in clFile.readlines()]

        listLines = []
        print("Processing files... ")
        for line, c in zip(lines, classes):
            # print("class: ", c)
            listTokenLine = []
            # listLemmaLine = []
            for tok in line.split():
                tokList = tok.split("|")
                word = tokList[0]
                lemma = tokList[1]
                tag = tokList[2]
                # Filtering stopwords
                if options.filterStopWords:
                    if lemma in stopwords.words('english'):
                        continue
                if options.filterPunctMarks:
                    if lemma in punctMarks:
                        continue
                # if tag in hashTagAspect:
                # We change the tag for the aspect tag only when the aspect tag coincides with the class.
                # We want the CRF to learn when to change a term tag to an aspect tag in the correct context.
                if tag in hashTagAspect:
                    if hashTagAspect[tag] == c:
                        aTag = hashTagAspect[tag]
                    else:
                        aTag = 'O'
                else:
                    if c in hashAspectFreqWords:
                        # print("class: ", c)
                        hashFreqWords = hashAspectFreqWords[c]
                        # We verify whether the word or lemma is in the frequent words.
                        # These frequent words are word-forms (tokens).
                        if word.lower() in hashFreqWords or lemma in hashFreqWords:
                            aTag = c
                        else:
                            aTag = 'O'
                    else:
                        aTag = 'O'
                listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
            # if feature == "word":
            listLines.append(listTokenLine)
            # if feature == "lemma":
            #     listLines = listLemmaLine.strip() + '\n'

        if options.skip > 0:
            t0 = time()
            skipTemp = options.skip
            for i in range(1, options.skip):
                hashTemp = getSkipMentions(listLines, skipTemp)
                # skipTemp -= 1
                for key in hashTemp:
                    listLines = hashTemp[key]
                    with open(os.path.join(options.outputPath,
                                           iFile.replace('.txt', '.StopWords_' + str(options.filterStopWords) +
                                                         '.FilterPunctMarks_' + str(options.filterPunctMarks) +
                                                         '.Skip_' + str(skipTemp) + '.txt')),
                              "w", encoding="utf-8") as oFile:
                        for line in listLines:
                            oFile.write(line)
            print("Skip mention done in: %fs" % (time() - t0))
        else:
            with open(os.path.join(options.outputPath,
                                   iFile.replace('.txt', '.StopWords_' + str(options.filterStopWords) +
                                                 '.FilterPunctMarks_' + str(options.filterPunctMarks) +
                                                 '.Skip_' + str(options.skip) + '.txt')),
                      "w", encoding="utf-8") as oFile:
                for line in listLines:
                    for token in line:
                        oFile.write(token + ' ')
                    oFile.write('\n')

    print("Files processed: " + str(filesRead))
\ No newline at end of file
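To see the aspect-tagging rule of preparing-training-validation-test.py in isolation, here is a small self-contained sketch that mimics the core loop on one made-up sentence. hashTagAspect, hashAspectFreqWords, the sample tokens, and the sentence class are hypothetical stand-ins for the JSON resources the script loads; this is not code from the commit.

# Hypothetical miniature of the aspect-tagging rule (all data below is invented):
# a token gets aTag 'O' unless its term tag maps to the sentence class, or the
# word/lemma appears among that class's frequent words.
hashTagAspect = {"DPOS": "DOM"}                        # term tag -> aspect tag (assumed)
hashAspectFreqWords = {"DOM": {"domain", "terminal"}}  # class -> frequent words (assumed)

sentence = "the|the|dt N-terminal|N-terminal|NN domain|domain|DPOS".split()
sentenceClass = "DOM"
freq = hashAspectFreqWords.get(sentenceClass, set())

tagged = []
for tok in sentence:
    word, lemma, tag = tok.split("|")
    if tag in hashTagAspect:
        # Keep the aspect tag only when it coincides with the sentence class
        aTag = hashTagAspect[tag] if hashTagAspect[tag] == sentenceClass else "O"
    elif word.lower() in freq or lemma in freq:
        aTag = sentenceClass
    else:
        aTag = "O"
    tagged.append("|".join((word, lemma, tag, aTag)))

print(" ".join(tagged))
# the|the|dt|O N-terminal|N-terminal|NN|O domain|domain|DPOS|DOM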
training-validation-v1.py 0 → 100644
# -*- coding: UTF-8 -*-

import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter

import nltk
import sklearn
import scipy.stats
import sys

from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords

# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH       Path of training and test data set
# --trainingFile         File with training data set
# --testFile             File with test data set
# --outputPath=PATH      Output path to place output files
# --filterStopWords      Filtering stop words
# --filterSymbols        Filtering punctuation marks
# Output
# 1) Best model
# Examples
# Sentences
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt
# Aspects
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt

#################################
#           FUNCTIONS           #
#################################

def wordSize(text):
    lWord = len(text)
    if lWord == 1:
        return '1'
    elif lWord == 2:
        return '2'
    elif lWord == 3:
        return '3'
    elif lWord == 4:
        return '4'
    elif lWord == 5:
        return '5'
    elif 6 <= lWord <= 10:
        return '6-10'
    elif 11 <= lWord <= 15:
        return '11-15'
    elif 16 <= lWord <= 20:
        return '16-20'
    elif 21 <= lWord <= 30:
        return '21-30'
    else:
        return '>30'

def hasUpperLower(text):
    has = False
    if len(text) < 3:
        return False
    regexUp = nltk.re.compile('[A-Z]')
    regexLo = nltk.re.compile('[a-z]')
    if (regexUp.search(text) != None) and (regexLo.search(text) != None):
        has = True
    return has

def hasDigit(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile('[0-9]')
    if myRegex.search(text) != None:
        has = True
    return has

def hasNonAlphaNum(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile('\W')
    if myRegex.search(text) != None:
        has = True
    return has

def word2features(sent, i):
    # print "i: " + str(i)
    # print "sent[i]" + sent[i]
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]
    features = {
        # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.hasDigit()': hasDigit(word),
        'word.hasNonAlphaNum': hasNonAlphaNum(word),
        # 'word.hasUpperLower': hasUpperLower(word),
        # 'wordSize': wordSize(word),
        # 'word.isdigit()': word.isdigit(),
        'word': word,
        'lemma': lemma,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'postag': postag,
        # Prefixes
        'postag[:2]': postag[:2],
        'postag[:1]': postag[:1],
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.hasDigit()': hasDigit(word1),
            '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '-1:word.hasUpperLower': hasUpperLower(word1),
            '-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['BOS'] = True
    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.hasDigit()': hasDigit(word1),
            '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '+1:word.hasUpperLower': hasUpperLower(word1),
            '+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['EOS'] = True
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.hasDigit()': hasDigit(word2),
            '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '-2:word.hasUpperLower': hasUpperLower(word2),
            '-2:word': word2,
            '-2:lemma': lemma2,
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-2:postag[:1]': postag2[:1],
        })
    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.hasDigit()': hasDigit(word2),
            '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '+2:word.hasUpperLower': hasUpperLower(word2),
            '+2:word': word2,
            '+2:lemma': lemma2,
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+2:postag[:1]': postag2[:1],
        })
    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word.lower()': word3.lower(),
                '-3:word.istitle()': word3.istitle(),
                '-3:word.isupper()': word3.isupper(),
                '-3:word.hasDigit()': hasDigit(word3),
                '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
                # '-3:word.hasUpperLower': hasUpperLower(word3),
                '-3:word': word3,
                '-3:lemma': lemma3,
                '-3:postag': postag3,
                '-3:postag[:2]': postag3[:2],
                '-3:postag[:1]': postag3[:1],
            })
        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word.lower()': word3.lower(),
                '+3:word.istitle()': word3.istitle(),
                '+3:word.isupper()': word3.isupper(),
                '+3:word.hasDigit()': hasDigit(word3),
                '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
                # '+3:word.hasUpperLower': hasUpperLower(word3),
                '+3:word': word3,
                '+3:lemma': lemma3,
                '+3:postag': postag3,
                '+3:postag[:2]': postag3[:2],
                '+3:postag[:1]': postag3[:1],
            })
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [elem.split('|')[3] for elem in sent]
    # return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

def print_transitions(trans_features, f):
    for (label_from, label_to), weight in trans_features:
        # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
        # f.write("label_from :" + label_from)
        # f.write("label_to :" + label_to)
        # f.write("label_weight :" + weight)
        # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))

def print_state_features(state_features, f):
    for (attr, label), weight in state_features:
        # f.write("%0.6f %-8s %s\n" % (weight, label, attr))
        # f.write(attr.encode("utf-8"))
        # '{:06.2f}'.format(3.141592653589793)
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))

__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    parser.add_option("--filterStopWords", default=False, action="store_true",
                      dest="filterStopWords", help="Filtering stop words")
    parser.add_option("--filterSymbols", default=False, action="store_true",
                      dest="filterSymbols", help="Filtering punctuation marks")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No arguments expected.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of training data set: " + options.inputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\',
               '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
    t0 = time()

    sentencesTrainingData = []
    sentencesTestData = []

    stopwords = [word.decode('utf-8') for word in stopwords.words('english')]

    with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
        # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.decode("utf-8")
            for token in line.strip('\n').split():
                if options.filterStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    # Original: if lemma in stopwords.words('english'):
                    #   trainingTesting_Sklearn_crfsuite.py:269:
                    #   UnicodeWarning: Unicode equal comparison failed to
                    #   convert both arguments to Unicode -
                    #   interpreting them as being unequal
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        # if lemma == ',':
                        #     print "Comma , identified"
                        continue
                listLine.append(token)
            sentencesTrainingData.append(listLine)
    print "   Sentences training data: " + str(len(sentencesTrainingData))
    # print sentencesTrainingData[0]

    with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
        # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.decode("utf-8")
            for token in line.strip('\n').split():
                if options.filterStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    # Original: if lemma in stopwords.words('english'):
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        # if lemma == ',':
                        #     print "Comma , identified"
                        continue
                listLine.append(token)
            sentencesTestData.append(listLine)
    print "   Sentences test data: " + str(len(sentencesTestData))
    # print sentencesTestData[0]

    print("Reading corpus done in: %fs" % (time() - t0))

    print(sent2features(sentencesTrainingData[0])[0])
    print(sent2features(sentencesTestData[0])[0])
    # print(sent2labels(sentencesTrainingData[0]))
    # print(sent2labels(sentencesTestData[0]))

    t0 = time()

    X_train = [sent2features(s) for s in sentencesTrainingData]
    y_train = [sent2labels(s) for s in sentencesTrainingData]

    X_test = [sent2features(s) for s in sentencesTestData]
    # print X_test
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Fixed parameters
    # crf = sklearn_crfsuite.CRF(
    #     algorithm='lbfgs',
    #     c1=0.1,
    #     c2=0.1,
    #     max_iterations=100,
    #     all_possible_transitions=True
    # )

    # Hyperparameter Optimization
    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        max_iterations=100,
        all_possible_transitions=True
    )
    params_space = {
        'c1': scipy.stats.expon(scale=0.5),
        'c2': scipy.stats.expon(scale=0.05),
    }

    # Original: labels = list(crf.classes_)
    # Original: labels.remove('O')
    labels = list(['GENE'])

    # use the same metric for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            # n_iter=50,
                            scoring=f1_scorer)
    rs.fit(X_train, y_train)

    # Fixed parameters
    # crf.fit(X_train, y_train)

    # Best hyperparameters
    # crf = rs.best_estimator_
    nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                              '.fSymbols_' + str(options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
        oFile.write('\n')
        oFile.write('best params:' + str(rs.best_params_) + '\n')
        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))

    print("Training done in: %fs" % (time() - t0))
    t0 = time()

    # Update best crf
    crf = rs.best_estimator_

    # Saving model
    print("   Saving training model...")
    t1 = time()
    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                             '.fSymbols_' + str(options.filterSymbols) + '.mod')
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print("   Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                        '.fSymbols_' + str(options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')

    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) +
                                        '.fSymbols_' + str(options.filterSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')

    print("Prediction done in: %fs" % (time() - t0))

    # labels = list(crf.classes_)
    # labels.remove('O')

    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
        oFile.write('\n')
        oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
        oFile.write('\n')
        # labels = list(crf.classes_)
        sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
        oFile.write(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))
        oFile.write('\n')
        oFile.write("\nTop likely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
        oFile.write('\n')
        oFile.write("\nTop unlikely transitions:\n")
        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
        oFile.write('\n')
        oFile.write("\nTop positive:\n")
        print_state_features(Counter(crf.state_features_).most_common(200), oFile)
        oFile.write('\n')
        oFile.write("\nTop negative:\n")
        print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
        oFile.write('\n')
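The pipeline in training-validation-v1.py boils down to: turn every word|lemma|postag|label token into a feature dictionary, build X/y per sentence, then fit a CRF. The following is a stripped-down sketch of that flow on two invented sentences with label set {'GENE', 'O'}; it uses fixed c1/c2 values instead of the randomized search above, and the tokens, feature names, and data are assumptions made for illustration only.

# Toy end-to-end sketch of the CRF training flow (invented data, fixed
# hyperparameters instead of RandomizedSearchCV). Requires sklearn-crfsuite.
import sklearn_crfsuite
from sklearn_crfsuite import metrics

def token2features(sent, i):
    # Minimal subset of the features built by word2features above
    word, lemma, postag, _ = sent[i].split('|')
    feats = {'word': word, 'lemma': lemma, 'postag': postag,
             'word[-3:]': word[-3:], 'word.isupper()': word.isupper()}
    if i > 0:
        pword, plemma, ppostag, _ = sent[i - 1].split('|')
        feats.update({'-1:word': pword, '-1:postag': ppostag})
    return feats

# Two invented sentences in the word|lemma|postag|label format the scripts expect
sents = [
    "araC|arac|NN|GENE regulates|regulate|VBZ|O transcription|transcription|NN|O".split(),
    "the|the|DT|O marR|marr|NN|GENE repressor|repressor|NN|O".split(),
]
X = [[token2features(s, i) for i in range(len(s))] for s in sents]
y = [[tok.split('|')[3] for tok in s] for s in sents]

crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                           max_iterations=50, all_possible_transitions=True)
crf.fit(X, y)
# Evaluate on the training toy data just to show the metric call
print(metrics.flat_f1_score(y, crf.predict(X), average='weighted', labels=['GENE']))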
training-validation.py
...
@@ -428,7 +428,7 @@ if __name__ == "__main__":
     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-    labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
+    labels = list(['GENE'])
     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
...
@@ -436,7 +436,7 @@ if __name__ == "__main__":
     # search
     rs = RandomizedSearchCV(crf, params_space,
-                            cv=3,
+                            cv=10,
                             verbose=3,
                             n_jobs=-1,
                             n_iter=20,
...