Showing
6 changed files
with
49 additions
and
36 deletions
1 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1.txt | 1 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1_v13.txt |
2 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2.txt | 2 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2_v13.txt |
3 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3.txt | 3 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3_v13.txt |
4 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4.txt | 4 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4_v13.txt |
5 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5.txt | 5 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5_v13.txt |
6 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6.txt | 6 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6_v13.txt |
7 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7.txt | 7 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7_v13.txt |
8 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8.txt | 8 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8_v13.txt |
9 | 9 | ||
10 | 10 | ... | ... |
... | @@ -25,7 +25,7 @@ import random | ... | @@ -25,7 +25,7 @@ import random |
25 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets | 25 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets |
26 | # | 26 | # |
27 | # | 27 | # |
28 | -# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets | 28 | +# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5 |
29 | 29 | ||
30 | 30 | ||
31 | ########################################## | 31 | ########################################## |
... | @@ -35,17 +35,12 @@ import random | ... | @@ -35,17 +35,12 @@ import random |
35 | if __name__ == "__main__": | 35 | if __name__ == "__main__": |
36 | # Defining parameters | 36 | # Defining parameters |
37 | parser = OptionParser() | 37 | parser = OptionParser() |
38 | - parser.add_option("--inputPath", dest="inputPath", | 38 | + parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH") |
39 | - help="Path of output from CoreNLP", metavar="PATH") | 39 | + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") |
40 | - parser.add_option("--outputPath", dest="outputPath", | 40 | + parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE") |
41 | - help="Output path to place output files", | 41 | + parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE") |
42 | - metavar="PATH") | 42 | + parser.add_option("--testFile", dest="testFile",help="File with test data set", metavar="FILE") |
43 | - parser.add_option("--inputFile", dest="inputFile", | 43 | + parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int) |
44 | - help="File with CoreNLP-tagging sentences", metavar="FILE") | ||
45 | - parser.add_option("--trainingFile", dest="trainingFile", | ||
46 | - help="File with training data set", metavar="FILE") | ||
47 | - parser.add_option("--testFile", dest="testFile", | ||
48 | - help="File with test data set", metavar="FILE") | ||
49 | 44 | ||
50 | (options, args) = parser.parse_args() | 45 | (options, args) = parser.parse_args() |
51 | if len(args) > 0: | 46 | if len(args) > 0: |
... | @@ -59,6 +54,7 @@ if __name__ == "__main__": | ... | @@ -59,6 +54,7 @@ if __name__ == "__main__": |
59 | print("File with training data set: " + str(options.trainingFile)) | 54 | print("File with training data set: " + str(options.trainingFile)) |
60 | print("Path of test data set: " + options.outputPath) | 55 | print("Path of test data set: " + options.outputPath) |
61 | print("File with test data set: " + str(options.testFile)) | 56 | print("File with test data set: " + str(options.testFile)) |
57 | + print("CoreNLP output choosen colums: 1-" + str(options.index)) | ||
62 | print('-------------------------------- PROCESSING --------------------------------') | 58 | print('-------------------------------- PROCESSING --------------------------------') |
63 | ## begin of tagging | 59 | ## begin of tagging |
64 | in_labels = { | 60 | in_labels = { |
... | @@ -127,7 +123,7 @@ if __name__ == "__main__": | ... | @@ -127,7 +123,7 @@ if __name__ == "__main__": |
127 | sentence = '' | 123 | sentence = '' |
128 | elif w not in old_labels.keys(): | 124 | elif w not in old_labels.keys(): |
129 | #Building and save tagging sentence | 125 | #Building and save tagging sentence |
130 | - sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ') | 126 | + sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index])+'|'+flag+' ') |
131 | 127 | ||
132 | print("Number of sentences with at least one tag: " + str(len(lista))) | 128 | print("Number of sentences with at least one tag: " + str(len(lista))) |
133 | print("Number of sentences from CoreNLP: " + str(n)) | 129 | print("Number of sentences from CoreNLP: " + str(n)) | ... | ... |
... | @@ -34,7 +34,8 @@ from nltk.corpus import stopwords | ... | @@ -34,7 +34,8 @@ from nltk.corpus import stopwords |
34 | # --testFile File with test data set | 34 | # --testFile File with test data set |
35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
37 | -# --version Version Report | 37 | +# --version Version Report |
38 | +# --nrules Number of crf transitions | ||
38 | 39 | ||
39 | # Output | 40 | # Output |
40 | # 1) Best model | 41 | # 1) Best model |
... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords |
47 | # --testFile test-data-set-30.txt | 48 | # --testFile test-data-set-30.txt |
48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 49 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
49 | # --version _v1 | 50 | # --version _v1 |
50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 51 | +# --nrules 50 |
52 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
51 | 53 | ||
52 | ################################################################## | 54 | ################################################################## |
53 | # FEATURES # | 55 | # FEATURES # |
... | @@ -273,6 +275,8 @@ if __name__ == "__main__": | ... | @@ -273,6 +275,8 @@ if __name__ == "__main__": |
273 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 275 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
274 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 276 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
275 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 277 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
278 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") | ||
279 | + | ||
276 | 280 | ||
277 | 281 | ||
278 | (options, args) = parser.parse_args() | 282 | (options, args) = parser.parse_args() |
... | @@ -288,6 +292,7 @@ if __name__ == "__main__": | ... | @@ -288,6 +292,7 @@ if __name__ == "__main__": |
288 | print("Exclude stop words: " + str(options.excludeStopWords)) | 292 | print("Exclude stop words: " + str(options.excludeStopWords)) |
289 | print("Levels: " + str(options.S1) + " " + str(options.S2)) | 293 | print("Levels: " + str(options.S1) + " " + str(options.S2)) |
290 | print("Report file: " + str(options.version)) | 294 | print("Report file: " + str(options.version)) |
295 | + print("Number of rules on report file: " + str(options.nrules)) | ||
291 | 296 | ||
292 | 297 | ||
293 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | 298 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', |
... | @@ -451,11 +456,11 @@ if __name__ == "__main__": | ... | @@ -451,11 +456,11 @@ if __name__ == "__main__": |
451 | oFile.write('\n') | 456 | oFile.write('\n') |
452 | 457 | ||
453 | oFile.write("\nTop likely transitions:\n") | 458 | oFile.write("\nTop likely transitions:\n") |
454 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 459 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules()), oFile) |
455 | oFile.write('\n') | 460 | oFile.write('\n') |
456 | 461 | ||
457 | oFile.write("\nTop unlikely transitions:\n") | 462 | oFile.write("\nTop unlikely transitions:\n") |
458 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 463 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules():], oFile) |
459 | oFile.write('\n') | 464 | oFile.write('\n') |
460 | 465 | ||
461 | oFile.write("\nTop positive:\n") | 466 | oFile.write("\nTop positive:\n") | ... | ... |
... | @@ -35,6 +35,8 @@ from nltk.corpus import stopwords | ... | @@ -35,6 +35,8 @@ from nltk.corpus import stopwords |
35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
37 | # --version Version Report | 37 | # --version Version Report |
38 | +# --nrules Number of crf transitions | ||
39 | + | ||
38 | 40 | ||
39 | # Output | 41 | # Output |
40 | # 1) Best model | 42 | # 1) Best model |
... | @@ -47,7 +49,9 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +49,9 @@ from nltk.corpus import stopwords |
47 | # --testFile test-data-set-30.txt | 49 | # --testFile test-data-set-30.txt |
48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 50 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
49 | # --version _v1 | 51 | # --version _v1 |
50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 52 | +# --nrules 50 |
53 | + | ||
54 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
51 | 55 | ||
52 | ################################################################## | 56 | ################################################################## |
53 | # FEATURES # | 57 | # FEATURES # |
... | @@ -273,7 +277,7 @@ if __name__ == "__main__": | ... | @@ -273,7 +277,7 @@ if __name__ == "__main__": |
273 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 277 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
274 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 278 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
275 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 279 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
276 | - | 280 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") |
277 | 281 | ||
278 | (options, args) = parser.parse_args() | 282 | (options, args) = parser.parse_args() |
279 | if len(args) > 0: | 283 | if len(args) > 0: |
... | @@ -452,11 +456,11 @@ if __name__ == "__main__": | ... | @@ -452,11 +456,11 @@ if __name__ == "__main__": |
452 | oFile.write('\n') | 456 | oFile.write('\n') |
453 | 457 | ||
454 | oFile.write("\nTop likely transitions:\n") | 458 | oFile.write("\nTop likely transitions:\n") |
455 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 459 | + print_transitions(Counter(crf.transition_features_).most_common(), oFile) |
456 | oFile.write('\n') | 460 | oFile.write('\n') |
457 | 461 | ||
458 | oFile.write("\nTop unlikely transitions:\n") | 462 | oFile.write("\nTop unlikely transitions:\n") |
459 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 463 | + print_transitions(Counter(crf.transition_features_).most_common()[-option.nrules:], oFile) |
460 | oFile.write('\n') | 464 | oFile.write('\n') |
461 | 465 | ||
462 | oFile.write("\nTop positive:\n") | 466 | oFile.write("\nTop positive:\n") | ... | ... |
... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords | ... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords |
35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
37 | # --version Version Report | 37 | # --version Version Report |
38 | +# --nrules Number of crf transitions | ||
38 | 39 | ||
39 | # Output | 40 | # Output |
40 | # 1) Best model | 41 | # 1) Best model |
... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords |
47 | # --testFile test-data-set-30.txt | 48 | # --testFile test-data-set-30.txt |
48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 49 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
49 | # --version _v1 | 50 | # --version _v1 |
50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 51 | +# --nrules 50 |
52 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
51 | 53 | ||
52 | ################################################################## | 54 | ################################################################## |
53 | # FEATURES # | 55 | # FEATURES # |
... | @@ -271,7 +273,7 @@ if __name__ == "__main__": | ... | @@ -271,7 +273,7 @@ if __name__ == "__main__": |
271 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 273 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
272 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 274 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
273 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 275 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
274 | - | 276 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") |
275 | 277 | ||
276 | (options, args) = parser.parse_args() | 278 | (options, args) = parser.parse_args() |
277 | if len(args) > 0: | 279 | if len(args) > 0: |
... | @@ -448,11 +450,11 @@ if __name__ == "__main__": | ... | @@ -448,11 +450,11 @@ if __name__ == "__main__": |
448 | oFile.write('\n') | 450 | oFile.write('\n') |
449 | 451 | ||
450 | oFile.write("\nTop likely transitions:\n") | 452 | oFile.write("\nTop likely transitions:\n") |
451 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 453 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile) |
452 | oFile.write('\n') | 454 | oFile.write('\n') |
453 | 455 | ||
454 | oFile.write("\nTop unlikely transitions:\n") | 456 | oFile.write("\nTop unlikely transitions:\n") |
455 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 457 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile) |
456 | oFile.write('\n') | 458 | oFile.write('\n') |
457 | 459 | ||
458 | oFile.write("\nTop positive:\n") | 460 | oFile.write("\nTop positive:\n") | ... | ... |
... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords | ... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords |
35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
37 | # --version Version Report | 37 | # --version Version Report |
38 | +# --nrules Number of crf transitions | ||
38 | 39 | ||
39 | # Output | 40 | # Output |
40 | # 1) Best model | 41 | # 1) Best model |
... | @@ -47,7 +48,9 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +48,9 @@ from nltk.corpus import stopwords |
47 | # --testFile test-data-set-30.txt | 48 | # --testFile test-data-set-30.txt |
48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 49 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
49 | # --version _v1 | 50 | # --version _v1 |
50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 51 | +# --nrules 50 |
52 | + | ||
53 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
51 | 54 | ||
52 | ################################################################## | 55 | ################################################################## |
53 | # FEATURES # | 56 | # FEATURES # |
... | @@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3): | ... | @@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3): |
172 | if len(word)>1: | 175 | if len(word)>1: |
173 | features['word[:2]']= word[:2] | 176 | features['word[:2]']= word[:2] |
174 | ''' | 177 | ''' |
178 | + | ||
175 | #lemma and postag firstChar | 179 | #lemma and postag firstChar |
176 | features['lemma[:1]']= lemma[:1] | 180 | features['lemma[:1]']= lemma[:1] |
177 | #features['postag[:1]']= postag[:1] | 181 | #features['postag[:1]']= postag[:1] |
182 | + | ||
178 | #lemma and postag secondChar | 183 | #lemma and postag secondChar |
179 | if len(lemma)>1: | 184 | if len(lemma)>1: |
180 | features['lemma[:2]']= lemma[:2] | 185 | features['lemma[:2]']= lemma[:2] |
... | @@ -275,6 +280,7 @@ if __name__ == "__main__": | ... | @@ -275,6 +280,7 @@ if __name__ == "__main__": |
275 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 280 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
276 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 281 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
277 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 282 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
283 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") | ||
278 | 284 | ||
279 | 285 | ||
280 | (options, args) = parser.parse_args() | 286 | (options, args) = parser.parse_args() |
... | @@ -452,11 +458,11 @@ if __name__ == "__main__": | ... | @@ -452,11 +458,11 @@ if __name__ == "__main__": |
452 | oFile.write('\n') | 458 | oFile.write('\n') |
453 | 459 | ||
454 | oFile.write("\nTop likely transitions:\n") | 460 | oFile.write("\nTop likely transitions:\n") |
455 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 461 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile) |
456 | oFile.write('\n') | 462 | oFile.write('\n') |
457 | 463 | ||
458 | oFile.write("\nTop unlikely transitions:\n") | 464 | oFile.write("\nTop unlikely transitions:\n") |
459 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 465 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile) |
460 | oFile.write('\n') | 466 | oFile.write('\n') |
461 | 467 | ||
462 | oFile.write("\nTop positive:\n") | 468 | oFile.write("\nTop positive:\n") | ... | ... |
-
Please register or login to post a comment