upload

Estefani Gaytan Nunez
Commit 5934b0d0db239ca28c47db032d5eee4454e80998 5934b0d0 1 parent 515e01d4
Showing 6 changed files with 48 additions and 35 deletions
CRF/bin/grid_v13.sh
CRF/bin/label-split_training_test_v3.py → CRF/bin/label-split_training_test_v4.py
CRF/bin/training_validation_v10.py
CRF/bin/training_validation_v11.py
CRF/bin/training_validation_v12.py
CRF/bin/training_validation_v13.py
--- a/CRF/bin/grid_v13.sh
View file @5934b0d
+++ b/CRF/bin/grid_v13.sh
View file @5934b0d
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_1 --version _v13 > ../outputs/Run_1_v13.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_2 --version _v13 --S1 > ../outputs/Run_2_v13.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_3 --version _v13 --S2 > ../outputs/Run_3_v13.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_4 --version _v13 --S1 --S2 > ../outputs/Run_4_v13.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_5 --version _v13 --S3 > ../outputs/Run_5_v13.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_6 --version _v13 --S1 --S3 > ../outputs/Run_6_v13.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_7 --version _v13 --S2 --S3 > ../outputs/Run_7_v13.txt
-python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8.txt
+python3 training_validation_v13.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --Gridname Run_8 --version _v13 --S1 --S2 --S3 > ../outputs/Run_8_v13.txt
--- a/CRF/bin/label-split_training_test_v3.py → CRF/bin/label-split_training_test_v4.py
View file @5934b0d
+++ b/CRF/bin/label-split_training_test_v3.py → CRF/bin/label-split_training_test_v4.py
View file @5934b0d
@@ -25,7 +25,7 @@ import random
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
 #
 # 
-# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
+# python label-split_training_test_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/ --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
 ##########################################
@@ -35,17 +35,12 @@ import random
 if __name__ == "__main__":
     # Defining parameters
     parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
+    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
-                      help="Path of output from CoreNLP", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
-    parser.add_option("--outputPath", dest="outputPath",
+    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
-                      help="Output path to place output files",
+    parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
-                      metavar="PATH")
+    parser.add_option("--testFile", dest="testFile",help="File with test data set", metavar="FILE")
-    parser.add_option("--inputFile", dest="inputFile",
+    parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int)
-                      help="File with CoreNLP-tagging sentences", metavar="FILE")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File with training data set", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File with test data set", metavar="FILE")
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -59,6 +54,7 @@ if __name__ == "__main__":
     print("File with training data set: " + str(options.trainingFile))
     print("Path of test data set: " + options.outputPath)
     print("File with test data set: " + str(options.testFile))
+    print("CoreNLP output choosen colums: 1-" + str(options.index))
     print('-------------------------------- PROCESSING --------------------------------')
     ## begin of tagging
     in_labels = {
@@ -127,7 +123,7 @@ if __name__ == "__main__":
 					sentence = ''				    
 				    elif w not in old_labels.keys():
                         		#Building and save tagging sentence
-					sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
+					sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index])+'|'+flag+' ')
     print("Number of sentences with at least one tag: " + str(len(lista)))		
     print("Number of sentences from CoreNLP: " + str(n))
--- a/CRF/bin/training_validation_v10.py
View file @5934b0d
+++ b/CRF/bin/training_validation_v10.py
View file @5934b0d
@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
 # --outputPath=PATH    	Output path to place output files
 # --nameGrid            Number of run
 # --version    		Version Report
+# --nrules		Number of crf transitions
 # Output
 # 1) Best model
@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
 # --testFile			test-data-set-30.txt
 # --outputPath 			/home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version    		    _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 
+# --nrules			50
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
 ##################################################################
 #                             FEATURES                           # 
@@ -273,6 +275,8 @@ if __name__ == "__main__":
     parser.add_option("--S3",               dest="S3",              help="Future Type",                 action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words",          action="store_true", default=False)
     parser.add_option("--excludeSymbols",   dest="excludeSymbols",  help="Exclude punctuation marks",   action="store_true", default=False)
+    parser.add_option("--nrules",           dest="nrules",          help="Number of crf rules on report", type="int", default=50)
+
     (options, args) = parser.parse_args()
@@ -288,6 +292,7 @@ if __name__ == "__main__":
     print("Exclude stop words: " + str(options.excludeStopWords))
     print("Levels: " + str(options.S1) + " " + str(options.S2))
     print("Report file: " + str(options.version))
+    print("Number of rules on report file: " + str(options.nrules))
     symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
@@ -451,11 +456,11 @@ if __name__ == "__main__":
         oFile.write('\n')
         oFile.write("\nTop likely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+        print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
         oFile.write('\n')
         oFile.write("\nTop unlikely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+        print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
         oFile.write('\n')
         oFile.write("\nTop positive:\n")
--- a/CRF/bin/training_validation_v11.py
View file @5934b0d
+++ b/CRF/bin/training_validation_v11.py
View file @5934b0d
@@ -35,6 +35,8 @@ from nltk.corpus import stopwords
 # --outputPath=PATH    	Output path to place output files
 # --nameGrid            Number of run
 # --version    		    Version Report
+# --nrules              Number of crf transitions
+
 # Output
 # 1) Best model
@@ -47,7 +49,9 @@ from nltk.corpus import stopwords
 # --testFile			test-data-set-30.txt
 # --outputPath 			/home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version    		    _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 
+# --nrules                      50
+
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3  --nrules 50
 ##################################################################
 #                             FEATURES                           # 
@@ -273,7 +277,7 @@ if __name__ == "__main__":
     parser.add_option("--S3",               dest="S3",              help="Future Type",                 action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words",          action="store_true", default=False)
     parser.add_option("--excludeSymbols",   dest="excludeSymbols",  help="Exclude punctuation marks",   action="store_true", default=False)
-
+    parser.add_option("--nrules",           dest="nrules",          help="Number of crf rules on report", type="int", default=50)
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -452,11 +456,11 @@ if __name__ == "__main__":
         oFile.write('\n')
         oFile.write("\nTop likely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+        print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
         oFile.write('\n')
         oFile.write("\nTop unlikely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+        print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
         oFile.write('\n')
         oFile.write("\nTop positive:\n")
--- a/CRF/bin/training_validation_v12.py
View file @5934b0d
+++ b/CRF/bin/training_validation_v12.py
View file @5934b0d
@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
 # --outputPath=PATH    	Output path to place output files
 # --nameGrid            Number of run
 # --version    		    Version Report
+# --nrules              Number of crf transitions
 # Output
 # 1) Best model
@@ -47,7 +48,8 @@ from nltk.corpus import stopwords
 # --testFile			test-data-set-30.txt
 # --outputPath 			/home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version    		    _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 
+# --nrules                      50
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
 ##################################################################
 #                             FEATURES                           # 
@@ -271,7 +273,7 @@ if __name__ == "__main__":
     parser.add_option("--S3",               dest="S3",              help="Future Type",                 action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words",          action="store_true", default=False)
     parser.add_option("--excludeSymbols",   dest="excludeSymbols",  help="Exclude punctuation marks",   action="store_true", default=False)
-
+    parser.add_option("--nrules",           dest="nrules",          help="Number of crf rules on report", type="int", default=50)
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -448,11 +450,11 @@ if __name__ == "__main__":
         oFile.write('\n')
         oFile.write("\nTop likely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+        print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
         oFile.write('\n')
         oFile.write("\nTop unlikely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+        print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
         oFile.write('\n')
         oFile.write("\nTop positive:\n")
--- a/CRF/bin/training_validation_v13.py
View file @5934b0d
+++ b/CRF/bin/training_validation_v13.py
View file @5934b0d
@@ -35,6 +35,7 @@ from nltk.corpus import stopwords
 # --outputPath=PATH    	Output path to place output files
 # --nameGrid            Number of run
 # --version    		    Version Report
+# --nrules              Number of crf transitions
 # Output
 # 1) Best model
@@ -47,7 +48,9 @@ from nltk.corpus import stopwords
 # --testFile			test-data-set-30.txt
 # --outputPath 			/home/egaytan/automatic-extraction-growth-conditions/CRF/
 # --version    		    _v1
-# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 
+# --nrules                      50
+
+# python3 training_validation_v9.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nameGrid Run1 --version _v1 --S1 --S2 --S3 --nrules 50
 ##################################################################
 #                             FEATURES                           # 
@@ -172,9 +175,11 @@ def word2features(sent, i, S1, S2, S3):
         if len(word)>1:
             features['word[:2]']= word[:2]        
         '''
+
         #lemma and postag firstChar
         features['lemma[:1]']= lemma[:1]
         #features['postag[:1]']= postag[:1]
+
         #lemma and postag secondChar
         if len(lemma)>1:
             features['lemma[:2]']= lemma[:2]       
@@ -275,6 +280,7 @@ if __name__ == "__main__":
     parser.add_option("--S3",               dest="S3",              help="Future Type",                 action="store_true", default=False)
     parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words",          action="store_true", default=False)
     parser.add_option("--excludeSymbols",   dest="excludeSymbols",  help="Exclude punctuation marks",   action="store_true", default=False)
+    parser.add_option("--nrules",           dest="nrules",          help="Number of crf rules on report", type="int", default=50)
     (options, args) = parser.parse_args()
@@ -452,11 +458,11 @@ if __name__ == "__main__":
         oFile.write('\n')
         oFile.write("\nTop likely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
+        print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile)
         oFile.write('\n')
         oFile.write("\nTop unlikely transitions:\n")
-        print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
+        print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile)
         oFile.write('\n')
         oFile.write("\nTop positive:\n")