Showing
6 changed files
with
49 additions
and
36 deletions
| 1 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1.txt | 1 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1_v13.txt |
| 2 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2.txt | 2 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2_v13.txt |
| 3 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3.txt | 3 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3_v13.txt |
| 4 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4.txt | 4 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4_v13.txt |
| 5 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5.txt | 5 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5_v13.txt |
| 6 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6.txt | 6 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6_v13.txt |
| 7 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7.txt | 7 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7_v13.txt |
| 8 | -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8.txt | 8 | +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8_v13.txt |
| 9 | 9 | ||
| 10 | 10 | ... | ... |
| ... | @@ -25,7 +25,7 @@ import random | ... | @@ -25,7 +25,7 @@ import random |
| 25 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets | 25 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets |
| 26 | # | 26 | # |
| 27 | # | 27 | # |
| 28 | -# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets | 28 | +# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5 |
| 29 | 29 | ||
| 30 | 30 | ||
| 31 | ########################################## | 31 | ########################################## |
| ... | @@ -35,17 +35,12 @@ import random | ... | @@ -35,17 +35,12 @@ import random |
| 35 | if __name__ == "__main__": | 35 | if __name__ == "__main__": |
| 36 | # Defining parameters | 36 | # Defining parameters |
| 37 | parser = OptionParser() | 37 | parser = OptionParser() |
| 38 | - parser.add_option("--inputPath", dest="inputPath", | 38 | + parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH") |
| 39 | - help="Path of output from CoreNLP", metavar="PATH") | 39 | + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") |
| 40 | - parser.add_option("--outputPath", dest="outputPath", | 40 | + parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE") |
| 41 | - help="Output path to place output files", | 41 | + parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE") |
| 42 | - metavar="PATH") | 42 | + parser.add_option("--testFile", dest="testFile",help="File with test data set", metavar="FILE") |
| 43 | - parser.add_option("--inputFile", dest="inputFile", | 43 | + parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int) |
| 44 | - help="File with CoreNLP-tagging sentences", metavar="FILE") | ||
| 45 | - parser.add_option("--trainingFile", dest="trainingFile", | ||
| 46 | - help="File with training data set", metavar="FILE") | ||
| 47 | - parser.add_option("--testFile", dest="testFile", | ||
| 48 | - help="File with test data set", metavar="FILE") | ||
| 49 | 44 | ||
| 50 | (options, args) = parser.parse_args() | 45 | (options, args) = parser.parse_args() |
| 51 | if len(args) > 0: | 46 | if len(args) > 0: |
| ... | @@ -59,6 +54,7 @@ if __name__ == "__main__": | ... | @@ -59,6 +54,7 @@ if __name__ == "__main__": |
| 59 | print("File with training data set: " + str(options.trainingFile)) | 54 | print("File with training data set: " + str(options.trainingFile)) |
| 60 | print("Path of test data set: " + options.outputPath) | 55 | print("Path of test data set: " + options.outputPath) |
| 61 | print("File with test data set: " + str(options.testFile)) | 56 | print("File with test data set: " + str(options.testFile)) |
| 57 | + print("CoreNLP output chosen columns: 1-" + str(options.index)) | ||
| 62 | print('-------------------------------- PROCESSING --------------------------------') | 58 | print('-------------------------------- PROCESSING --------------------------------') |
| 63 | ## begin of tagging | 59 | ## begin of tagging |
| 64 | in_labels = { | 60 | in_labels = { |
| ... | @@ -127,7 +123,7 @@ if __name__ == "__main__": | ... | @@ -127,7 +123,7 @@ if __name__ == "__main__": |
| 127 | sentence = '' | 123 | sentence = '' |
| 128 | elif w not in old_labels.keys(): | 124 | elif w not in old_labels.keys(): |
| 129 | #Building and save tagging sentence | 125 | #Building and save tagging sentence |
| 130 | - sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ') | 126 | + sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index])+'|'+flag+' ') |
| 131 | 127 | ||
| 132 | print("Number of sentences with at least one tag: " + str(len(lista))) | 128 | print("Number of sentences with at least one tag: " + str(len(lista))) |
| 133 | print("Number of sentences from CoreNLP: " + str(n)) | 129 | print("Number of sentences from CoreNLP: " + str(n)) | ... | ... |
| ... | @@ -34,7 +34,8 @@ from nltk.corpus import stopwords | ... | @@ -34,7 +34,8 @@ from nltk.corpus import stopwords |
| 34 | # --testFile File with test data set | 34 | # --testFile File with test data set |
| 35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
| 36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
| 37 | -# --version Version Report | 37 | +# --version Version Report |
| 38 | +# --nrules Number of crf transitions | ||
| 38 | 39 | ||
| 39 | # Output | 40 | # Output |
| 40 | # 1) Best model | 41 | # 1) Best model |
| ... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords |
| 47 | # --testFile test-data-set-30.txt | 48 | # --testFile test-data-set-30.txt |
| 48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 49 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
| 49 | # --version _v1 | 50 | # --version _v1 |
| 50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 51 | +# --nrules 50 |
| 52 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
| 51 | 53 | ||
| 52 | ################################################################## | 54 | ################################################################## |
| 53 | # FEATURES # | 55 | # FEATURES # |
| ... | @@ -273,6 +275,8 @@ if __name__ == "__main__": | ... | @@ -273,6 +275,8 @@ if __name__ == "__main__": |
| 273 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 275 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
| 274 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 276 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
| 275 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 277 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
| 278 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") | ||
| 279 | + | ||
| 276 | 280 | ||
| 277 | 281 | ||
| 278 | (options, args) = parser.parse_args() | 282 | (options, args) = parser.parse_args() |
| ... | @@ -288,6 +292,7 @@ if __name__ == "__main__": | ... | @@ -288,6 +292,7 @@ if __name__ == "__main__": |
| 288 | print("Exclude stop words: " + str(options.excludeStopWords)) | 292 | print("Exclude stop words: " + str(options.excludeStopWords)) |
| 289 | print("Levels: " + str(options.S1) + " " + str(options.S2)) | 293 | print("Levels: " + str(options.S1) + " " + str(options.S2)) |
| 290 | print("Report file: " + str(options.version)) | 294 | print("Report file: " + str(options.version)) |
| 295 | + print("Number of rules on report file: " + str(options.nrules)) | ||
| 291 | 296 | ||
| 292 | 297 | ||
| 293 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | 298 | symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', |
| ... | @@ -451,11 +456,11 @@ if __name__ == "__main__": | ... | @@ -451,11 +456,11 @@ if __name__ == "__main__": |
| 451 | oFile.write('\n') | 456 | oFile.write('\n') |
| 452 | 457 | ||
| 453 | oFile.write("\nTop likely transitions:\n") | 458 | oFile.write("\nTop likely transitions:\n") |
| 454 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 459 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile) |
| 455 | oFile.write('\n') | 460 | oFile.write('\n') |
| 456 | 461 | ||
| 457 | oFile.write("\nTop unlikely transitions:\n") | 462 | oFile.write("\nTop unlikely transitions:\n") |
| 458 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 463 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile) |
| 459 | oFile.write('\n') | 464 | oFile.write('\n') |
| 460 | 465 | ||
| 461 | oFile.write("\nTop positive:\n") | 466 | oFile.write("\nTop positive:\n") | ... | ... |
| ... | @@ -35,6 +35,8 @@ from nltk.corpus import stopwords | ... | @@ -35,6 +35,8 @@ from nltk.corpus import stopwords |
| 35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
| 36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
| 37 | # --version Version Report | 37 | # --version Version Report |
| 38 | +# --nrules Number of crf transitions | ||
| 39 | + | ||
| 38 | 40 | ||
| 39 | # Output | 41 | # Output |
| 40 | # 1) Best model | 42 | # 1) Best model |
| ... | @@ -47,7 +49,9 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +49,9 @@ from nltk.corpus import stopwords |
| 47 | # --testFile test-data-set-30.txt | 49 | # --testFile test-data-set-30.txt |
| 48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 50 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
| 49 | # --version _v1 | 51 | # --version _v1 |
| 50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 52 | +# --nrules 50 |
| 53 | + | ||
| 54 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
| 51 | 55 | ||
| 52 | ################################################################## | 56 | ################################################################## |
| 53 | # FEATURES # | 57 | # FEATURES # |
| ... | @@ -273,7 +277,7 @@ if __name__ == "__main__": | ... | @@ -273,7 +277,7 @@ if __name__ == "__main__": |
| 273 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 277 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
| 274 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 278 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
| 275 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 279 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
| 276 | - | 280 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") |
| 277 | 281 | ||
| 278 | (options, args) = parser.parse_args() | 282 | (options, args) = parser.parse_args() |
| 279 | if len(args) > 0: | 283 | if len(args) > 0: |
| ... | @@ -452,11 +456,11 @@ if __name__ == "__main__": | ... | @@ -452,11 +456,11 @@ if __name__ == "__main__": |
| 452 | oFile.write('\n') | 456 | oFile.write('\n') |
| 453 | 457 | ||
| 454 | oFile.write("\nTop likely transitions:\n") | 458 | oFile.write("\nTop likely transitions:\n") |
| 455 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 459 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile) |
| 456 | oFile.write('\n') | 460 | oFile.write('\n') |
| 457 | 461 | ||
| 458 | oFile.write("\nTop unlikely transitions:\n") | 462 | oFile.write("\nTop unlikely transitions:\n") |
| 459 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 463 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile) |
| 460 | oFile.write('\n') | 464 | oFile.write('\n') |
| 461 | 465 | ||
| 462 | oFile.write("\nTop positive:\n") | 466 | oFile.write("\nTop positive:\n") | ... | ... |
| ... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords | ... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords |
| 35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
| 36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
| 37 | # --version Version Report | 37 | # --version Version Report |
| 38 | +# --nrules Number of crf transitions | ||
| 38 | 39 | ||
| 39 | # Output | 40 | # Output |
| 40 | # 1) Best model | 41 | # 1) Best model |
| ... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +48,8 @@ from nltk.corpus import stopwords |
| 47 | # --testFile test-data-set-30.txt | 48 | # --testFile test-data-set-30.txt |
| 48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 49 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
| 49 | # --version _v1 | 50 | # --version _v1 |
| 50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 51 | +# --nrules 50 |
| 52 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
| 51 | 53 | ||
| 52 | ################################################################## | 54 | ################################################################## |
| 53 | # FEATURES # | 55 | # FEATURES # |
| ... | @@ -271,7 +273,7 @@ if __name__ == "__main__": | ... | @@ -271,7 +273,7 @@ if __name__ == "__main__": |
| 271 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 273 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
| 272 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 274 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
| 273 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 275 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
| 274 | - | 276 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") |
| 275 | 277 | ||
| 276 | (options, args) = parser.parse_args() | 278 | (options, args) = parser.parse_args() |
| 277 | if len(args) > 0: | 279 | if len(args) > 0: |
| ... | @@ -448,11 +450,11 @@ if __name__ == "__main__": | ... | @@ -448,11 +450,11 @@ if __name__ == "__main__": |
| 448 | oFile.write('\n') | 450 | oFile.write('\n') |
| 449 | 451 | ||
| 450 | oFile.write("\nTop likely transitions:\n") | 452 | oFile.write("\nTop likely transitions:\n") |
| 451 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 453 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile) |
| 452 | oFile.write('\n') | 454 | oFile.write('\n') |
| 453 | 455 | ||
| 454 | oFile.write("\nTop unlikely transitions:\n") | 456 | oFile.write("\nTop unlikely transitions:\n") |
| 455 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 457 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile) |
| 456 | oFile.write('\n') | 458 | oFile.write('\n') |
| 457 | 459 | ||
| 458 | oFile.write("\nTop positive:\n") | 460 | oFile.write("\nTop positive:\n") | ... | ... |
| ... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords | ... | @@ -35,6 +35,7 @@ from nltk.corpus import stopwords |
| 35 | # --outputPath=PATH Output path to place output files | 35 | # --outputPath=PATH Output path to place output files |
| 36 | # --nameGrid Number of run | 36 | # --nameGrid Number of run |
| 37 | # --version Version Report | 37 | # --version Version Report |
| 38 | +# --nrules Number of crf transitions | ||
| 38 | 39 | ||
| 39 | # Output | 40 | # Output |
| 40 | # 1) Best model | 41 | # 1) Best model |
| ... | @@ -47,7 +48,9 @@ from nltk.corpus import stopwords | ... | @@ -47,7 +48,9 @@ from nltk.corpus import stopwords |
| 47 | # --testFile test-data-set-30.txt | 48 | # --testFile test-data-set-30.txt |
| 48 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | 49 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ |
| 49 | # --version _v1 | 50 | # --version _v1 |
| 50 | -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 | 51 | +# --nrules 50 |
| 52 | + | ||
| 53 | +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50 | ||
| 51 | 54 | ||
| 52 | ################################################################## | 55 | ################################################################## |
| 53 | # FEATURES # | 56 | # FEATURES # |
| ... | @@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3): | ... | @@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3): |
| 172 | if len(word)>1: | 175 | if len(word)>1: |
| 173 | features['word[:2]']= word[:2] | 176 | features['word[:2]']= word[:2] |
| 174 | ''' | 177 | ''' |
| 178 | + | ||
| 175 | #lemma and postag firstChar | 179 | #lemma and postag firstChar |
| 176 | features['lemma[:1]']= lemma[:1] | 180 | features['lemma[:1]']= lemma[:1] |
| 177 | #features['postag[:1]']= postag[:1] | 181 | #features['postag[:1]']= postag[:1] |
| 182 | + | ||
| 178 | #lemma and postag secondChar | 183 | #lemma and postag secondChar |
| 179 | if len(lemma)>1: | 184 | if len(lemma)>1: |
| 180 | features['lemma[:2]']= lemma[:2] | 185 | features['lemma[:2]']= lemma[:2] |
| ... | @@ -275,6 +280,7 @@ if __name__ == "__main__": | ... | @@ -275,6 +280,7 @@ if __name__ == "__main__": |
| 275 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) | 280 | parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) |
| 276 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | 281 | parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) |
| 277 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | 282 | parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) |
| 283 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") | ||
| 278 | 284 | ||
| 279 | 285 | ||
| 280 | (options, args) = parser.parse_args() | 286 | (options, args) = parser.parse_args() |
| ... | @@ -452,11 +458,11 @@ if __name__ == "__main__": | ... | @@ -452,11 +458,11 @@ if __name__ == "__main__": |
| 452 | oFile.write('\n') | 458 | oFile.write('\n') |
| 453 | 459 | ||
| 454 | oFile.write("\nTop likely transitions:\n") | 460 | oFile.write("\nTop likely transitions:\n") |
| 455 | - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) | 461 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile) |
| 456 | oFile.write('\n') | 462 | oFile.write('\n') |
| 457 | 463 | ||
| 458 | oFile.write("\nTop unlikely transitions:\n") | 464 | oFile.write("\nTop unlikely transitions:\n") |
| 459 | - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) | 465 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile) |
| 460 | oFile.write('\n') | 466 | oFile.write('\n') |
| 461 | 467 | ||
| 462 | oFile.write("\nTop positive:\n") | 468 | oFile.write("\nTop positive:\n") | ... | ... |
-
Please register or login to post a comment