Carlos-Francisco Méndez-Cruz / conditional-random-fields
Authored by Carlos-Francisco Méndez-Cruz, 2018-03-08 04:32:37 -0600
Commit 1393a83fd8c57a347fb5014a9f49209fbb573d71, 1 parent 4791fb14
Obtaining training and test data sets
Showing 1 changed file with 30 additions and 174 deletions
training-validation-v1.py
...
...
@@ -32,7 +32,7 @@ from nltk.corpus import stopwords
# --testFile File with test data set
# --outputPath=PATH Output path to place output files
# --filteringStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# --excludeSymbols Filtering punctuation marks
# Output
# 1) Best model
...
...
@@ -42,116 +42,44 @@ from nltk.corpus import stopwords
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports
# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/reports
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields
# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields
#################################
# FUNCTIONS #
#################################
def wordSize(text):
    lWord = len(text)
    if lWord == 1:
        return '1'
    elif lWord == 2:
        return '2'
    elif lWord == 3:
        return '3'
    elif lWord == 4:
        return '4'
    elif lWord == 5:
        return '5'
    elif 6 <= lWord <= 10:
        return '6-10'
    elif 11 <= lWord <= 15:
        return '11-15'
    elif 16 <= lWord <= 20:
        return '16-20'
    elif 21 <= lWord <= 30:
        return '21-30'
    else:
        return '>30'
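wordSize buckets token length into coarse ranges rather than exposing the raw integer, which keeps this feature categorical. A quick sanity check of the binning, using hypothetical tokens that are not taken from the data set:

    # Hypothetical tokens, only to illustrate the length bins
    assert wordSize('p53') == '3'
    assert wordSize('promoter') == '6-10'            # 8 characters
    assert wordSize('transcriptional') == '11-15'    # 15 characters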
def hasUpperLower(text):
    has = False
    if len(text) < 3:
        return False
    regexUp = nltk.re.compile('[A-Z]')
    regexLo = nltk.re.compile('[a-z]')
    if (regexUp.search(text) != None) and (regexLo.search(text) != None):
        has = True
    return has

def hasDigit(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile('[0-9]')
    if myRegex.search(text) != None:
        has = True
    return has

def hasNonAlphaNum(text):
    has = False
    if len(text) < 3:
        return False
    myRegex = nltk.re.compile('\W')
    if myRegex.search(text) != None:
        has = True
    return has
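Note that nltk.re here appears to be nothing more than the standard library re module, which nltk imports at package level, so these helpers could equally be written against re directly. A minimal equivalent sketch (hasDigit_alt is an illustrative name, not part of the script):

    import re

    def hasDigit_alt(text):
        # Same behavior as hasDigit above, using the standard re module directly
        if len(text) < 3:
            return False
        return re.search('[0-9]', text) is not None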
def word2features(sent, i):
    # print "i: " + str(i)
    # print "sent[i]" + sent[i]
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]
    features = {
        # Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
        # Suffixes
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word[-1:]': word[-1:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.hasDigit()': hasDigit(word),
        'word.hasNonAlphaNum': hasNonAlphaNum(word),
        # 'word.hasUpperLower': hasUpperLower(word),
        # 'wordSize': wordSize(word),
        # 'word.isdigit()': word.isdigit(),
        # 'word.isupper()': word.isupper(),
        'word': word,
        'lemma': lemma,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        'postag': postag,
        # Prefixes
        'postag[:2]': postag[:2],
        'postag[:1]': postag[:1],
        'word[:3]': word[:3],
        'word[:2]': word[:2],
        'word[:1]': word[:1],
    }
    '''
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.hasDigit()': hasDigit(word1),
            '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '-1:word.hasUpperLower': hasUpperLower(word1),
            '-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['BOS'] = True
    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
...
...
@@ -159,37 +87,18 @@ def word2features(sent, i):
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:word.hasDigit()': hasDigit(word1),
            '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
            # '+1:word.hasUpperLower': hasUpperLower(word1),
            '+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:postag[:1]': postag1[:1],
        })
    # else:
    #     features['EOS'] = True
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word.lower()': word2.lower(),
            '-2:word.istitle()': word2.istitle(),
            '-2:word.isupper()': word2.isupper(),
            '-2:word.hasDigit()': hasDigit(word2),
            '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '-2:word.hasUpperLower': hasUpperLower(word2),
            '-2:word': word2,
            '-2:lemma': lemma2,
            '-2:postag': postag2,
            '-2:postag[:2]': postag2[:2],
            '-2:postag[:1]': postag2[:1],
        })
    if i < len(sent) - 2:
...
...
@@ -198,17 +107,8 @@ def word2features(sent, i):
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word.lower()': word2.lower(),
            '+2:word.istitle()': word2.istitle(),
            '+2:word.isupper()': word2.isupper(),
            '+2:word.hasDigit()': hasDigit(word2),
            '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
            # '+2:word.hasUpperLower': hasUpperLower(word2),
            '+2:word': word2,
            '+2:lemma': lemma2,
            '+2:postag': postag2,
            '+2:postag[:2]': postag2[:2],
            '+2:postag[:1]': postag2[:1],
        })
    trigrams = False
...
...
@@ -219,17 +119,8 @@ def word2features(sent, i):
        lemma3 = listElem[1]
        postag3 = listElem[2]
        features.update({
            '-3:word.lower()': word3.lower(),
            '-3:word.istitle()': word3.istitle(),
            '-3:word.isupper()': word3.isupper(),
            '-3:word.hasDigit()': hasDigit(word3),
            '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
            # '-3:word.hasUpperLower': hasUpperLower(word3),
            '-3:word': word3,
            '-3:lemma': lemma3,
            '-3:postag': postag3,
            '-3:postag[:2]': postag3[:2],
            '-3:postag[:1]': postag3[:1],
        })
    if i < len(sent) - 3:
...
...
@@ -238,19 +129,10 @@ def word2features(sent, i):
        lemma3 = listElem[1]
        postag3 = listElem[2]
        features.update({
            '+3:word.lower()': word3.lower(),
            '+3:word.istitle()': word3.istitle(),
            '+3:word.isupper()': word3.isupper(),
            '+3:word.hasDigit()': hasDigit(word3),
            '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
            # '+3:word.hasUpperLower': hasUpperLower(word3),
            '+3:word': word3,
            '+3:lemma': lemma3,
            '+3:postag': postag3,
            '+3:postag[:2]': postag3[:2],
            '+3:postag[:1]': postag3[:1],
        })
    '''
    return features
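The companion sent2features (its body is collapsed in this diff) presumably just maps word2features over every token position, which is consistent with how it is called later on the sentence lists. A minimal sketch under that assumption:

    def sent2features(sent):
        # Build one feature dict per token position in the sentence
        return [word2features(sent, i) for i in range(len(sent))]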
...
...
@@ -260,7 +142,6 @@ def sent2features(sent):
def sent2labels(sent):
    return [elem.split('|')[3] for elem in sent]
    # return [label for token, postag, label in sent]

def sent2tokens(sent):
...
...
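Each token in the corpus files is a pipe-delimited tuple of the form word|lemma|postag|label, which is why sent2labels takes index 3. A hypothetical token for illustration only (the tag values are not taken from the data set):

    # Hypothetical pipe-delimited token: word|lemma|postag|label
    token = 'Fur|fur|NNP|O'
    word, lemma, postag, label = token.split('|')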
@@ -269,19 +150,11 @@ def sent2tokens(sent):
def print_transitions(trans_features, f):
    for (label_from, label_to), weight in trans_features:
        # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
        # f.write("label_from :" + label_from)
        # f.write("label_to :" + label_to)
        # f.write("label_weight :" + weight)
        # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
        f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))

def print_state_features(state_features, f):
    for (attr, label), weight in state_features:
        # f.write("%0.6f %-8s %s\n" % (weight, label, attr))
        # f.write(attr.encode("utf-8"))
        # '{:06.2f}'.format(3.141592653589793)
        f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
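These two helpers are typically fed the most informative transition and state features of a trained sklearn-crfsuite model. A sketch of how they would be called, assuming crf is the fitted model and oFile an already opened report file:

    from collections import Counter

    # crf.transition_features_ and crf.state_features_ are dicts exposed by sklearn_crfsuite.CRF
    print_transitions(Counter(crf.transition_features_).most_common(20), oFile)
    print_state_features(Counter(crf.state_features_).most_common(30), oFile)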
...
...
@@ -303,12 +176,12 @@ if __name__ == "__main__":
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
    parser.add_option("--filterStopWords", default=False, action="store_true", dest="filterStopWords", help="Filtering stop words")
    parser.add_option("--filterSymbols", default=False, action="store_true", dest="filterSymbols", help="Filtering punctuation marks")
    parser.add_option("--excludeStopWords", default=False, action="store_true", dest="excludeStopWords", help="Exclude stop words")
    parser.add_option("--excludeSymbols", default=False, action="store_true", dest="excludeSymbols", help="Exclude punctuation marks")
    (options, args) = parser.parse_args()
    if len(args) > 0:
...
...
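With the options renamed from --filter* to --exclude* in this commit, an invocation matching the usage comment at the top of the file would look like this (paths are the ones already given in that comment):

# python3.4 training-validation-v1.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields --excludeStopWords --excludeSymbols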
@@ -320,10 +193,10 @@ if __name__ == "__main__":
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.inputPath)
    print("File with test data set: " + str(options.testFile))
    print("Filtering stop words: " + str(options.filterStopWords))
    print("Exclude stop words: " + str(options.excludeStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\',
               '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
    print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
    print('-------------------------------- PROCESSING --------------------------------')
    print('Reading corpus...')
...
...
@@ -332,67 +205,50 @@ if __name__ == "__main__":
    sentencesTrainingData = []
    sentencesTestData = []

    # Original: stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
    stopwords = [word for word in stopwords.words('english')]

    with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
    # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.strip('\n')
            for token in line.split():
                if options.filterStopWords:
                if options.excludeStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    # Original: if lemma in stopwords.words('english'):
                    # trainingTesting_Sklearn_crfsuite.py:269:
                    # UnicodeWarning: Unicode equal comparison failed to
                    # convert both arguments to Unicode -
                    # interpreting them as being unequal
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                if options.excludeSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        # if lemma == ',':
                        #     print "Coma , identificada"
                        continue
                listLine.append(token)
            sentencesTrainingData.append(listLine)
        print(" Sentences training data: " + str(len(sentencesTrainingData)))
        # print sentencesTrainingData[0]

    with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
    # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
        for line in iFile.readlines():
            listLine = []
            line = line.strip('\n')
            for token in line.split():
                if options.filterStopWords:
                if options.excludeStopWords:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    # Original if lemma in stopwords.words('english'):
                    if lemma in stopwords:
                        continue
                if options.filterSymbols:
                if options.excludeSymbols:
                    listToken = token.split('|')
                    lemma = listToken[1]
                    if lemma in symbols:
                        # if lemma == ',':
                        #     print "Coma , identificada"
                        continue
                listLine.append(token)
            sentencesTestData.append(listLine)
        print(" Sentences test data: " + str(len(sentencesTestData)))
        # print sentencesTestData[0]

    print("Reading corpus done in: %fs" % (time() - t0))
    print(sent2features(sentencesTrainingData[0])[0])
    print(sent2features(sentencesTestData[0])[0])
    # print(sent2labels(sentencesTrainingData[0]))
    # print(sent2labels(sentencesTestData[0]))

    t0 = time()

    X_train = [sent2features(s) for s in sentencesTrainingData]
...
...
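The training step itself is collapsed in this diff; given the later calls to crf.predict, joblib.dump, and the "# crf = rs.best_estimator_" comment, it presumably builds the remaining feature and label lists the same way and fits a sklearn-crfsuite model. A minimal sketch under that assumption (the hyperparameter values are illustrative, not the repository's):

    import sklearn_crfsuite

    y_train = [sent2labels(s) for s in sentencesTrainingData]
    X_test = [sent2features(s) for s in sentencesTestData]
    y_test = [sent2labels(s) for s in sentencesTestData]

    # Illustrative hyperparameters; the actual values live in the collapsed lines
    crf = sklearn_crfsuite.CRF(algorithm='lbfgs', c1=0.1, c2=0.1,
                               max_iterations=100, all_possible_transitions=True)
    crf.fit(X_train, y_train)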
@@ -445,8 +301,8 @@ if __name__ == "__main__":
# Best hiperparameters
# crf = rs.best_estimator_
    nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(options.filterSymbols) + '.txt')
    nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(options.excludeSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
        oFile.write("********** TRAINING AND TESTING REPORT **********\n")
        oFile.write("Training file: " + options.trainingFile + '\n')
...
...
@@ -464,23 +320,23 @@ if __name__ == "__main__":
    # Saving model
    print(" Saving training model...")
    t1 = time()
    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(options.filterSymbols) + '.mod')
    nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(options.excludeSymbols) + '.mod')
    joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
    print(" Saving training model done in: %fs" % (time() - t1))

    # Evaluation against test data
    y_pred = crf.predict(X_test)
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(options.filterSymbols) + '.txt')
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(options.excludeSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
        for y in y_pred:
            oFile.write(str(y) + '\n')
    print("*********************************")
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(options.filterSymbols) + '.txt')
    name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(options.excludeSymbols) + '.txt')
    with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
        for y in y_test:
            oFile.write(str(y) + '\n')
...
...
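The report opened above typically also carries per-label precision and recall for the predicted versus gold sequences; with sklearn-crfsuite that is usually produced along these lines (a sketch, not the exact code from the collapsed section):

    from sklearn_crfsuite import metrics

    # Flat per-label classification report over predicted vs. gold label sequences
    labels = list(crf.classes_)
    report = metrics.flat_classification_report(y_test, y_pred, labels=labels)
    oFile.write(report)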