Carlos-Francisco Méndez-Cruz / automatic-extraction-growth-conditions
Authored by Estefani Gaytan Nunez, 2019-11-05 16:06:44 -0600
Commit 5934b0d0db239ca28c47db032d5eee4454e80998 (5934b0d0), 1 parent 515e01d4

Commit message: upload
Showing 6 changed files with 48 additions and 35 deletions:
CRF/bin/grid_v13.sh
CRF/bin/label-split_training_test_v3.py → CRF/bin/label-split_training_test_v4.py
CRF/bin/training_validation_v10.py
CRF/bin/training_validation_v11.py
CRF/bin/training_validation_v12.py
CRF/bin/training_validation_v13.py
CRF/bin/grid_v13.sh
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1_v13.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2_v13.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3_v13.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4_v13.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5_v13.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6_v13.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7_v13.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8_v13.txt
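The eight commands above run the same 70/30 training/test split while toggling every on/off combination of the --S1, --S2 and --S3 feature levels; the only change to this file is that the redirected report names now carry the _v13 suffix. As a hedged aside, here is a minimal Python sketch, not part of the repository, showing how those eight command lines could be enumerated programmatically (the grid_commands helper is hypothetical):

# Illustrative sketch only: enumerate the same eight --S1/--S2/--S3
# combinations that grid_v13.sh spells out by hand.
from itertools import product

BASE = ("python3 training_validation_v13.py"
        " --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets"
        " --trainingFile training-data-set-70.txt"
        " --testFile test-data-set-30.txt"
        " --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/")

def grid_commands(version="_v13"):
    """Hypothetical helper: yield one command per S1/S2/S3 on/off combination."""
    for run, (s3, s2, s1) in enumerate(product((False, True), repeat=3), start=1):
        parts = [BASE, "--Gridname", "Run_%d" % run, "--version", version]
        parts += [flag for flag, on in (("--S1", s1), ("--S2", s2), ("--S3", s3)) if on]
        yield " ".join(parts) + " > ../outputs/Run_%d%s.txt" % (run, version)

for cmd in grid_commands():
    print(cmd)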
CRF/bin/label-split_training_test_v3.py → CRF/bin/label-split_training_test_v4.py
@@ -25,7 +25,7 @@ import random
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
 #
 #
-# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
+# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
 ##########################################
@@ -35,17 +35,12 @@ import random
 if __name__ == "__main__":
     # Defining parameters
     parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
-    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
-    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
-    parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
+    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
+    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
+    parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
+    parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
+    parser.add_option("--index", dest="index", help="Select a limit CoreNLP output column", metavar='N', type=int)
     (options, args) = parser.parse_args()
     if len(args) > 0:
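The new --index option above is declared with optparse's type=int, so the value arrives as an integer ready to be used as a slice limit further down. A minimal, self-contained sketch of that parsing (a standalone example, not the repository script):

# Illustrative sketch: parsing the new --index option with optparse,
# outside the repository script.
from optparse import OptionParser

parser = OptionParser()
parser.add_option("--index", dest="index",
                  help="Select a limit CoreNLP output column", metavar='N', type=int)
(options, args) = parser.parse_args(["--index", "5"])
print(options.index, type(options.index))   # 5 <class 'int'>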
@@ -59,6 +54,7 @@ if __name__ == "__main__":
     print("File with training data set: " + str(options.trainingFile))
     print("Path of test data set: " + options.outputPath)
     print("File with test data set: " + str(options.testFile))
+    print("CoreNLP output choosen colums: 1-" + str(options.index))
     print('-------------------------------- PROCESSING --------------------------------')
     ## begin of tagging
     in_labels = {
@@ -127,7 +123,7 @@ if __name__ == "__main__":
                 sentence = ''
             elif w not in old_labels.keys():
                 #Building and save tagging sentence
-                sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')
+                sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]) + '|' + flag + ' ')
     print("Number of sentences with at least one tag: " + str(len(lista)))
     print("Number of sentences from CoreNLP: " + str(n))
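The hunk above is the point of the v3 → v4 rename: the hard-coded column slice [1:4] becomes [1:options.index], so the number of CoreNLP columns folded into each '|'-joined token is now configurable from the command line. A small sketch of that slicing on a made-up tab-separated row; the column layout shown is an assumption for illustration, not the repository's actual CoreNLP output:

# Illustrative sketch: how '|'.join(line.split('\t')[1:index]) behaves.
# The column layout (index, word, lemma, POS, NER) is a made-up example.
def tag_token(line, index, flag="O"):
    """Join CoreNLP columns 1..index-1 with '|' and append a tag flag."""
    return "|".join(line.split("\t")[1:index]) + "|" + flag

conll_line = "3\tglucose\tglucose\tNN\tSupp"   # hypothetical CoreNLP row
print(tag_token(conll_line, 4))   # old behaviour ([1:4]):  glucose|glucose|NN|O
print(tag_token(conll_line, 5))   # --index 5   ([1:5]):    glucose|glucose|NN|Supp|O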
CRF/bin/training_validation_v10.py
@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
 # --outputPath=PATH Output path to place output files
 # --nameGrid Number of run
 # --version Version Report
+# --nrules Number of crf transitions
 # Output
 # 1) Best model
@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
 # --testFile test-data-set-30.txt
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
+# --nrules 50
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
 ##################################################################
 # FEATURES #
@@ -273,6 +275,8 @@ if __name__ == "__main__":
     parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords", help="Exclude stop words", action="store_true", default=False)
     parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
+    parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
     (options, args) = parser.parse_args()
@@ -288,6 +292,7 @@ if __name__ == "__main__":
     print("Exclude stop words: " + str(options.excludeStopWords))
     print("Levels: " + str(options.S1) + " " + str(options.S2))
     print("Report file: " + str(options.version))
+    print("Number of rules on report file: " + str(options.nrules))
     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
@@ -451,11 +456,11 @@ if __name__ == "__main__":
     oFile.write('\n')
     oFile.write("\nTop likely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+    print_transitions(Counter(crf.transition_features_).most_common(options.nrules()), oFile)
     oFile.write('\n')
     oFile.write("\nTop unlikely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+    print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules():], oFile)
     oFile.write('\n')
     oFile.write("\nTop positive:\n")
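In training_validation_v10.py, and in the three sibling scripts below, the hard-coded 50 in the transition report is replaced by the new --nrules value: Counter.most_common(n) yields the n strongest transition weights and most_common()[-n:] the n weakest. A self-contained sketch of that selection on a toy Counter; the label pairs, weights and the stand-in print_transitions are illustrative assumptions, not code from the repository:

# Illustrative sketch: how --nrules bounds the transition report.
# The Counter stands in for crf.transition_features_ and print_transitions
# is a minimal stand-in, both made up for this example.
import sys
from collections import Counter

transition_features = Counter({
    ("Temp", "Temp"): 2.1,      # hypothetical (from_label, to_label) weights
    ("O", "Supp"): 1.4,
    ("Supp", "O"): 0.6,
    ("O", "Temp"): -0.9,
    ("Temp", "Supp"): -1.7,
})

def print_transitions(trans_features, out):
    for (label_from, label_to), weight in trans_features:
        out.write("%-6s -> %-6s %6.3f\n" % (label_from, label_to, weight))

nrules = 2                                                                   # value --nrules would supply
print_transitions(transition_features.most_common(nrules), sys.stdout)       # top 2 transitions
print_transitions(transition_features.most_common()[-nrules:], sys.stdout)   # bottom 2 transitions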
CRF/bin/training_validation_v11.py
@@ -35,6 +35,8 @@ from nltk.corpus import stopwords
 # --outputPath=PATH Output path to place output files
 # --nameGrid Number of run
 # --version Version Report
+# --nrules Number of crf transitions
 # Output
 # 1) Best model
@@ -47,7 +49,9 @@ from nltk.corpus import stopwords
 # --testFile test-data-set-30.txt
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
+# --nrules 50
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
 ##################################################################
 # FEATURES #
@@ -273,7 +277,7 @@ if __name__ == "__main__":
     parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords", help="Exclude stop words", action="store_true", default=False)
     parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
+    parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -452,11 +456,11 @@ if __name__ == "__main__":
     oFile.write('\n')
     oFile.write("\nTop likely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+    print_transitions(Counter(crf.transition_features_).most_common(), oFile)
     oFile.write('\n')
     oFile.write("\nTop unlikely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+    print_transitions(Counter(crf.transition_features_).most_common()[-option.nrules:], oFile)
     oFile.write('\n')
     oFile.write("\nTop positive:\n")
CRF/bin/training_validation_v12.py
@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
 # --outputPath=PATH Output path to place output files
 # --nameGrid Number of run
 # --version Version Report
+# --nrules Number of crf transitions
 # Output
 # 1) Best model
@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
 # --testFile test-data-set-30.txt
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
+# --nrules 50
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
 ##################################################################
 # FEATURES #
@@ -271,7 +273,7 @@ if __name__ == "__main__":
     parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords", help="Exclude stop words", action="store_true", default=False)
    parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
+    parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -448,11 +450,11 @@ if __name__ == "__main__":
     oFile.write('\n')
     oFile.write("\nTop likely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+    print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
     oFile.write('\n')
     oFile.write("\nTop unlikely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+    print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
     oFile.write('\n')
     oFile.write("\nTop positive:\n")
CRF/bin/training_validation_v13.py
@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
 # --outputPath=PATH Output path to place output files
 # --nameGrid Number of run
 # --version Version Report
+# --nrules Number of crf transitions
 # Output
 # 1) Best model
@@ -47,7 +48,9 @@ from nltk.corpus import stopwords
 # --testFile test-data-set-30.txt
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
+# --nrules 50
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
 ##################################################################
 # FEATURES #
@@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3):
         if len(word)>1:
             features['word[:2]']= word[:2]
     '''
+    #lemma and postag firstChar
     features['lemma[:1]']= lemma[:1]
     #features['postag[:1]']= postag[:1]
+    #lemma and postag secondChar
     if len(lemma)>1:
         features['lemma[:2]']= lemma[:2]
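The word2features hunk above builds character-prefix features from the lemma: lemma[:1] is always added, lemma[:2] only when the lemma has more than one character, while the word-based prefixes stay inside the commented-out block. A minimal sketch of the resulting feature entries for one token; the helper and the example lemma are illustrative, not code from the repository:

# Illustrative sketch (not repository code): the lemma prefix features
# produced in word2features, isolated into a tiny helper.
def lemma_prefix_features(lemma):
    features = {}
    # lemma firstChar
    features['lemma[:1]'] = lemma[:1]
    # lemma secondChar, only for lemmas longer than one character
    if len(lemma) > 1:
        features['lemma[:2]'] = lemma[:2]
    return features

print(lemma_prefix_features("aerobic"))   # {'lemma[:1]': 'a', 'lemma[:2]': 'ae'}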
@@ -275,6 +280,7 @@ if __name__ == "__main__":
     parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords", help="Exclude stop words", action="store_true", default=False)
     parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
+    parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
     (options, args) = parser.parse_args()
@@ -452,11 +458,11 @@ if __name__ == "__main__":
     oFile.write('\n')
     oFile.write("\nTop likely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+    print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
     oFile.write('\n')
     oFile.write("\nTop unlikely transitions:\n")
-    print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+    print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
     oFile.write('\n')
     oFile.write("\nTop positive:\n")