Estefani Gaytan Nunez

upload

1 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1.txt 1 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1_v13.txt
2 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2.txt 2 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2_v13.txt
3 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3.txt 3 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3_v13.txt
4 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4.txt 4 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4_v13.txt
5 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5.txt 5 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5_v13.txt
6 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6.txt 6 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6_v13.txt
7 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7.txt 7 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7_v13.txt
8 -python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8.txt 8 +python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8_v13.txt
9 9
10 10
......
...@@ -25,7 +25,7 @@ import random ...@@ -25,7 +25,7 @@ import random
25 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets 25 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
26 # 26 #
27 # 27 #
28 -# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets 28 +# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
29 29
30 30
31 ########################################## 31 ##########################################
...@@ -35,17 +35,12 @@ import random ...@@ -35,17 +35,12 @@ import random
35 if __name__ == "__main__": 35 if __name__ == "__main__":
36 # Defining parameters 36 # Defining parameters
37 parser = OptionParser() 37 parser = OptionParser()
38 - parser.add_option("--inputPath", dest="inputPath", 38 + parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
39 - help="Path of output from CoreNLP", metavar="PATH") 39 + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
40 - parser.add_option("--outputPath", dest="outputPath", 40 + parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
41 - help="Output path to place output files", 41 + parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
42 - metavar="PATH") 42 + parser.add_option("--testFile", dest="testFile",help="File with test data set", metavar="FILE")
43 - parser.add_option("--inputFile", dest="inputFile", 43 + parser.add_option("--index", dest="index",help="Exclusive upper bound of the CoreNLP output columns to keep", metavar='N', type=int, default=4)
44 - help="File with CoreNLP-tagging sentences", metavar="FILE")
45 - parser.add_option("--trainingFile", dest="trainingFile",
46 - help="File with training data set", metavar="FILE")
47 - parser.add_option("--testFile", dest="testFile",
48 - help="File with test data set", metavar="FILE")
49 44
50 (options, args) = parser.parse_args() 45 (options, args) = parser.parse_args()
51 if len(args) > 0: 46 if len(args) > 0:
...@@ -59,6 +54,7 @@ if __name__ == "__main__": ...@@ -59,6 +54,7 @@ if __name__ == "__main__":
59 print("File with training data set: " + str(options.trainingFile)) 54 print("File with training data set: " + str(options.trainingFile))
60 print("Path of test data set: " + options.outputPath) 55 print("Path of test data set: " + options.outputPath)
61 print("File with test data set: " + str(options.testFile)) 56 print("File with test data set: " + str(options.testFile))
57 + print("CoreNLP output chosen columns: 1-" + str(options.index))
62 print('-------------------------------- PROCESSING --------------------------------') 58 print('-------------------------------- PROCESSING --------------------------------')
63 ## begin of tagging 59 ## begin of tagging
64 in_labels = { 60 in_labels = {
...@@ -127,7 +123,7 @@ if __name__ == "__main__": ...@@ -127,7 +123,7 @@ if __name__ == "__main__":
127 sentence = '' 123 sentence = ''
128 elif w not in old_labels.keys(): 124 elif w not in old_labels.keys():
129 #Building and save tagging sentence 125 #Building and save tagging sentence
130 - sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ') 126 + sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index])+'|'+flag+' ')
131 127
132 print("Number of sentences with at least one tag: " + str(len(lista))) 128 print("Number of sentences with at least one tag: " + str(len(lista)))
133 print("Number of sentences from CoreNLP: " + str(n)) 129 print("Number of sentences from CoreNLP: " + str(n))
......
...@@ -35,6 +35,7 @@ from nltk.corpus import stopwords ...@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
35 # --outputPath=PATH Output path to place output files 35 # --outputPath=PATH Output path to place output files
36 # --nameGrid Number of run 36 # --nameGrid Number of run
37 # --version Version Report 37 # --version Version Report
38 +# --nrules Number of crf transitions
38 39
39 # Output 40 # Output
40 # 1) Best model 41 # 1) Best model
...@@ -47,7 +48,8 @@ from nltk.corpus import stopwords ...@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
47 # --testFile test-data-set-30.txt 48 # --testFile test-data-set-30.txt
48 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ 49 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
49 # --version _v1 50 # --version _v1
50 -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 51 +# --nrules 50
52 +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
51 53
52 ################################################################## 54 ##################################################################
53 # FEATURES # 55 # FEATURES #
...@@ -273,6 +275,8 @@ if __name__ == "__main__": ...@@ -273,6 +275,8 @@ if __name__ == "__main__":
273 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) 275 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
274 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) 276 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
275 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) 277 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
278 + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int", default=50)
279 +
276 280
277 281
278 (options, args) = parser.parse_args() 282 (options, args) = parser.parse_args()
...@@ -288,6 +292,7 @@ if __name__ == "__main__": ...@@ -288,6 +292,7 @@ if __name__ == "__main__":
288 print("Exclude stop words: " + str(options.excludeStopWords)) 292 print("Exclude stop words: " + str(options.excludeStopWords))
289 print("Levels: " + str(options.S1) + " " + str(options.S2)) 293 print("Levels: " + str(options.S1) + " " + str(options.S2))
290 print("Report file: " + str(options.version)) 294 print("Report file: " + str(options.version))
295 + print("Number of rules on report file: " + str(options.nrules))
291 296
292 297
293 symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', 298 symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
...@@ -451,11 +456,11 @@ if __name__ == "__main__": ...@@ -451,11 +456,11 @@ if __name__ == "__main__":
451 oFile.write('\n') 456 oFile.write('\n')
452 457
453 oFile.write("\nTop likely transitions:\n") 458 oFile.write("\nTop likely transitions:\n")
454 - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) 459 + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
455 oFile.write('\n') 460 oFile.write('\n')
456 461
457 oFile.write("\nTop unlikely transitions:\n") 462 oFile.write("\nTop unlikely transitions:\n")
458 - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) 463 + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
459 oFile.write('\n') 464 oFile.write('\n')
460 465
461 oFile.write("\nTop positive:\n") 466 oFile.write("\nTop positive:\n")
......
...@@ -35,6 +35,8 @@ from nltk.corpus import stopwords ...@@ -35,6 +35,8 @@ from nltk.corpus import stopwords
35 # --outputPath=PATH Output path to place output files 35 # --outputPath=PATH Output path to place output files
36 # --nameGrid Number of run 36 # --nameGrid Number of run
37 # --version Version Report 37 # --version Version Report
38 +# --nrules Number of crf transitions
39 +
38 40
39 # Output 41 # Output
40 # 1) Best model 42 # 1) Best model
...@@ -47,7 +49,9 @@ from nltk.corpus import stopwords ...@@ -47,7 +49,9 @@ from nltk.corpus import stopwords
47 # --testFile test-data-set-30.txt 49 # --testFile test-data-set-30.txt
48 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ 50 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
49 # --version _v1 51 # --version _v1
50 -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 52 +# --nrules 50
53 +
54 +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
51 55
52 ################################################################## 56 ##################################################################
53 # FEATURES # 57 # FEATURES #
...@@ -273,7 +277,7 @@ if __name__ == "__main__": ...@@ -273,7 +277,7 @@ if __name__ == "__main__":
273 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) 277 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
274 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) 278 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
275 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) 279 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
276 - 280 + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int", default=50)
277 281
278 (options, args) = parser.parse_args() 282 (options, args) = parser.parse_args()
279 if len(args) > 0: 283 if len(args) > 0:
...@@ -452,11 +456,11 @@ if __name__ == "__main__": ...@@ -452,11 +456,11 @@ if __name__ == "__main__":
452 oFile.write('\n') 456 oFile.write('\n')
453 457
454 oFile.write("\nTop likely transitions:\n") 458 oFile.write("\nTop likely transitions:\n")
455 - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) 459 + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
456 oFile.write('\n') 460 oFile.write('\n')
457 461
458 oFile.write("\nTop unlikely transitions:\n") 462 oFile.write("\nTop unlikely transitions:\n")
459 - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) 463 + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
460 oFile.write('\n') 464 oFile.write('\n')
461 465
462 oFile.write("\nTop positive:\n") 466 oFile.write("\nTop positive:\n")
......
...@@ -35,6 +35,7 @@ from nltk.corpus import stopwords ...@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
35 # --outputPath=PATH Output path to place output files 35 # --outputPath=PATH Output path to place output files
36 # --nameGrid Number of run 36 # --nameGrid Number of run
37 # --version Version Report 37 # --version Version Report
38 +# --nrules Number of crf transitions
38 39
39 # Output 40 # Output
40 # 1) Best model 41 # 1) Best model
...@@ -47,7 +48,8 @@ from nltk.corpus import stopwords ...@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
47 # --testFile test-data-set-30.txt 48 # --testFile test-data-set-30.txt
48 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ 49 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
49 # --version _v1 50 # --version _v1
50 -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 51 +# --nrules 50
52 +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
51 53
52 ################################################################## 54 ##################################################################
53 # FEATURES # 55 # FEATURES #
...@@ -271,7 +273,7 @@ if __name__ == "__main__": ...@@ -271,7 +273,7 @@ if __name__ == "__main__":
271 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) 273 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
272 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) 274 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
273 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) 275 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
274 - 276 + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int", default=50)
275 277
276 (options, args) = parser.parse_args() 278 (options, args) = parser.parse_args()
277 if len(args) > 0: 279 if len(args) > 0:
...@@ -448,11 +450,11 @@ if __name__ == "__main__": ...@@ -448,11 +450,11 @@ if __name__ == "__main__":
448 oFile.write('\n') 450 oFile.write('\n')
449 451
450 oFile.write("\nTop likely transitions:\n") 452 oFile.write("\nTop likely transitions:\n")
451 - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) 453 + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
452 oFile.write('\n') 454 oFile.write('\n')
453 455
454 oFile.write("\nTop unlikely transitions:\n") 456 oFile.write("\nTop unlikely transitions:\n")
455 - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) 457 + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
456 oFile.write('\n') 458 oFile.write('\n')
457 459
458 oFile.write("\nTop positive:\n") 460 oFile.write("\nTop positive:\n")
......
...@@ -35,6 +35,7 @@ from nltk.corpus import stopwords ...@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
35 # --outputPath=PATH Output path to place output files 35 # --outputPath=PATH Output path to place output files
36 # --nameGrid Number of run 36 # --nameGrid Number of run
37 # --version Version Report 37 # --version Version Report
38 +# --nrules Number of crf transitions
38 39
39 # Output 40 # Output
40 # 1) Best model 41 # 1) Best model
...@@ -47,7 +48,9 @@ from nltk.corpus import stopwords ...@@ -47,7 +48,9 @@ from nltk.corpus import stopwords
47 # --testFile test-data-set-30.txt 48 # --testFile test-data-set-30.txt
48 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ 49 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/
49 # --version _v1 50 # --version _v1
50 -# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 51 +# --nrules 50
52 +
53 +# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
51 54
52 ################################################################## 55 ##################################################################
53 # FEATURES # 56 # FEATURES #
...@@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3): ...@@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3):
172 if len(word)>1: 175 if len(word)>1:
173 features['word[:2]']= word[:2] 176 features['word[:2]']= word[:2]
174 ''' 177 '''
178 +
175 #lemma and postag firstChar 179 #lemma and postag firstChar
176 features['lemma[:1]']= lemma[:1] 180 features['lemma[:1]']= lemma[:1]
177 #features['postag[:1]']= postag[:1] 181 #features['postag[:1]']= postag[:1]
182 +
178 #lemma and postag secondChar 183 #lemma and postag secondChar
179 if len(lemma)>1: 184 if len(lemma)>1:
180 features['lemma[:2]']= lemma[:2] 185 features['lemma[:2]']= lemma[:2]
...@@ -275,6 +280,7 @@ if __name__ == "__main__": ...@@ -275,6 +280,7 @@ if __name__ == "__main__":
275 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False) 280 parser.add_option("--S3", dest="S3", help="Future Type", action="store_true", default=False)
276 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) 281 parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False)
277 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) 282 parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False)
283 + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int", default=50)
278 284
279 285
280 (options, args) = parser.parse_args() 286 (options, args) = parser.parse_args()
...@@ -452,11 +458,11 @@ if __name__ == "__main__": ...@@ -452,11 +458,11 @@ if __name__ == "__main__":
452 oFile.write('\n') 458 oFile.write('\n')
453 459
454 oFile.write("\nTop likely transitions:\n") 460 oFile.write("\nTop likely transitions:\n")
455 - print_transitions(Counter(crf.transition_features_).most_common(50), oFile) 461 + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
456 oFile.write('\n') 462 oFile.write('\n')
457 463
458 oFile.write("\nTop unlikely transitions:\n") 464 oFile.write("\nTop unlikely transitions:\n")
459 - print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile) 465 + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
460 oFile.write('\n') 466 oFile.write('\n')
461 467
462 oFile.write("\nTop positive:\n") 468 oFile.write("\nTop positive:\n")
......