Estefani Gaytan Nunez

upload

python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1_v13.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2_v13.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3_v13.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4_v13.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5_v13.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6_v13.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7_v13.txt
python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8_v13.txt
......
......@@ -25,7 +25,7 @@ import random
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
#
#
# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
##########################################
......@@ -35,17 +35,12 @@ import random
if __name__ == "__main__":
# Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path of output from CoreNLP", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path to place output files",
metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="File with CoreNLP-tagging sentences", metavar="FILE")
parser.add_option("--trainingFile", dest="trainingFile",
help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile",
help="File with test data set", metavar="FILE")
parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile",help="File with test data set", metavar="FILE")
parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int)
(options, args) = parser.parse_args()
if len(args) > 0:
......@@ -59,6 +54,7 @@ if __name__ == "__main__":
print("File with training data set: " + str(options.trainingFile))
print("Path of test data set: " + options.outputPath)
print("File with test data set: " + str(options.testFile))
print("CoreNLP output chosen columns: 1-" + str(options.index))
print('-------------------------------- PROCESSING --------------------------------')
## begin of tagging
in_labels = {
......@@ -127,7 +123,7 @@ if __name__ == "__main__":
sentence = ''
elif w not in old_labels.keys():
#Building and save tagging sentence
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index])+'|'+flag+' ')
print("Number of sentences with at least one tag: " + str(len(lista)))
print("Number of sentences from CoreNLP: " + str(n))
......
......@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
# --outputPath=PATH Output path to place output files
# --nameGrid Number of run
# --version Version Report
# --nrules Number of crf transitions
# Output
# 1) Best model
......@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --version _v1
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
# --nrules 50
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
##################################################################
# FEATURES #
......@@ -273,6 +275,8 @@ if __name__ == "__main__":
parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
(options, args) = parser.parse_args()
......@@ -288,6 +292,7 @@ if __name__ == "__main__":
print("Exclude stop words: " + str(options.excludeStopWords))
print("Levels: " + str(options.S1) + " " + str(options.S2))
print("Report file: " + str(options.version))
print("Number of rules on report file: " + str(options.nrules))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
......@@ -451,11 +456,11 @@ if __name__ == "__main__":
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
......
......@@ -35,6 +35,8 @@ from nltk.corpus import stopwords
# --outputPath=PATH Output path to place output files
# --nameGrid Number of run
# --version Version Report
# --nrules Number of crf transitions
# Output
# 1) Best model
......@@ -47,7 +49,9 @@ from nltk.corpus import stopwords
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --version _v1
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
# --nrules 50
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
##################################################################
# FEATURES #
......@@ -273,7 +277,7 @@ if __name__ == "__main__":
parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
(options, args) = parser.parse_args()
if len(args) > 0:
......@@ -452,11 +456,11 @@ if __name__ == "__main__":
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
......
......@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
# --outputPath=PATH Output path to place output files
# --nameGrid Number of run
# --version Version Report
# --nrules Number of crf transitions
# Output
# 1) Best model
......@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --version _v1
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
# --nrules 50
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
##################################################################
# FEATURES #
......@@ -271,7 +273,7 @@ if __name__ == "__main__":
parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
(options, args) = parser.parse_args()
if len(args) > 0:
......@@ -448,11 +450,11 @@ if __name__ == "__main__":
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
......
......@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
# --outputPath=PATH Output path to place output files
# --nameGrid Number of run
# --version Version Report
# --nrules Number of crf transitions
# Output
# 1) Best model
......@@ -47,7 +48,9 @@ from nltk.corpus import stopwords
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
# --version _v1
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3
# --nrules 50
# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
##################################################################
# FEATURES #
......@@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3):
if len(word)>1:
features['word[:2]']= word[:2]
'''
#lemma and postag firstChar
features['lemma[:1]']= lemma[:1]
#features['postag[:1]']= postag[:1]
#lemma and postag secondChar
if len(lemma)>1:
features['lemma[:2]']= lemma[:2]
......@@ -275,6 +280,7 @@ if __name__ == "__main__":
parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int")
(options, args) = parser.parse_args()
......@@ -452,11 +458,11 @@ if __name__ == "__main__":
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
......