Estefani Gaytan Nunez

scripts

1 +#!/bin/python3
2 +from optparse import OptionParser
3 +import re
4 +import os
5 +import random
6 +
7 +
8 +# Objective
9 +# Label tokens separated by '|' and split sentences 70/30 into training and test files from CoreNLP-tagging
10 +# make data sets using only sentences with at least one true-tag
11 +#
12 +# Input parameters
13 +# --inputPath=PATH Path of inputfile
14 +# --outputPath=PATH Path to place output files
15 +# --trainingFile=testFile Output training data set
16 +# --testFile=testFile Output test data set
17 +#
18 +# Output
19 +# training and test data set
20 +#
21 +# Examples
22 +# python label-split_training_test_v2.py
23 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
24 +# --inputFile sentences.tsv_pakal_.conll
25 +# --trainingFile training-data-set-70.txt
26 +# --testFile test-data-set-30.txt
27 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
28 +#
29 +#
30 +# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
31 +
32 +
33 +##########################################
34 +# MAIN PROGRAM #
35 +##########################################
36 +
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Convert CoreNLP CoNLL output into '|'-joined labelled sentences and
    # write a 70/30 training/test split, keeping only sentences that
    # contain at least one true (non-'O') tag.
    # ------------------------------------------------------------------

    # Defining command-line parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2, so
        # no follow-up exit call is needed (the original sys.exit(1) was
        # unreachable and `sys` was never imported).
        parser.error("Unexpected positional arguments given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    # Opening tags that switch the running label to a true class.
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti'
    }
    # Tags that reset the running label to the outside label 'O'.
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '<Agit>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Anti>': 'O',
        '<Vess>': 'O',
        '</Vess>': 'O'}

    # Running label attached to each token; 'O' means outside any true tag.
    flag = 'O'
    # Sentences kept (those containing at least one true tag).
    lista = []
    # Tokens of the sentence currently being assembled.
    sentence = ''
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            columns = line.split('\t')
            if len(columns) > 1:
                w = columns[1]
                if w in in_labels or w in out_labels:
                    # A tag token: update the running label (the two dicts
                    # have disjoint keys, so exactly one branch applies).
                    if w in in_labels:
                        flag = in_labels[w]
                    else:
                        flag = out_labels[w]
                elif w == "PGCGROWTHCONDITIONS":
                    # End-of-sentence sentinel: keep the sentence only when
                    # at least one token carries a true tag.
                    words = sentence.split(' ')
                    tags = [tag for tag in words
                            if tag.split('|')[-1] in in_labels.values()]
                    if len(tags) > 0:
                        lista.append(sentence)
                    # Always start a fresh sentence, even when the previous
                    # one was discarded; resetting only inside the branch
                    # above would leak discarded tokens into the next one.
                    sentence = ''
                else:
                    # Regular token: append "word|lemma|pos|label".
                    sentence = sentence + ' ' + ('|'.join(columns[1:4]) + '|' + flag + ' ')

    print("Number of sentences: " + str(len(lista)))

    # Split 70/30 into training and test indices. Sample over the full
    # index range: range(len(lista)-1) would silently exclude the last
    # sentence from both partitions.
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
1 +#!/bin/python3
2 +from optparse import OptionParser
3 +import re
4 +import os
5 +import random
6 +
7 +
8 +# Objective
9 +# Label tokens separated by '|' and split sentences 70/30 into training and test files from CoreNLP-tagging
10 +# make data sets using only sentences with at least one true-tag
11 +#
12 +# Input parameters
13 +# --inputPath=PATH Path of inputfile
14 +# --outputPath=PATH Path to place output files
15 +# --trainingFile=testFile Output training data set
16 +# --testFile=testFile Output test data set
17 +#
18 +# Output
19 +# training and test data set
20 +#
21 +# Examples
22 +# python label-split_training_test_v2.py
23 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
24 +# --inputFile sentences.tsv_pakal_.conll
25 +# --trainingFile training-data-set-70.txt
26 +# --testFile test-data-set-30.txt
27 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
28 +#
29 +#
30 +# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
31 +
32 +
33 +##########################################
34 +# MAIN PROGRAM #
35 +##########################################
36 +
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Convert CoreNLP CoNLL output into '|'-joined labelled sentences and
    # write a 70/30 training/test split, keeping only sentences that
    # contain at least one true (non-'O') tag.
    # ------------------------------------------------------------------

    # Defining command-line parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2, so
        # no follow-up exit call is needed (the original sys.exit(1) was
        # unreachable and `sys` was never imported).
        parser.error("Unexpected positional arguments given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    # Opening tags that switch the running label to a true class.
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Vess>': 'Vess'
    }
    # Tags that reset the running label to the outside label 'O'.
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Vess>': 'O'}

    # Running label attached to each token; 'O' means outside any true tag.
    flag = 'O'
    # Total sentences seen (kept or discarded).
    n = 0
    # Sentences kept (those containing at least one true tag).
    lista = []
    # Tokens of the sentence currently being assembled.
    sentence = ''
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            columns = line.split('\t')
            if len(columns) > 1:
                w = columns[1]
                if w in in_labels or w in out_labels:
                    # A tag token: update the running label (the two dicts
                    # have disjoint keys, so exactly one branch applies).
                    if w in in_labels:
                        flag = in_labels[w]
                    else:
                        flag = out_labels[w]
                elif w == "PGCGROWTHCONDITIONS":
                    # End-of-sentence sentinel: keep the sentence only when
                    # at least one token carries a true tag. Filter on the
                    # loop variable `tag` (the original referenced an
                    # undefined name `word`, raising NameError here).
                    words = sentence.split(' ')
                    tags = [tag for tag in words
                            if tag.split('|')[-1] in in_labels.values()]
                    if len(tags) > 0:
                        lista.append(sentence)
                    # Always start a fresh sentence, even when the previous
                    # one was discarded; resetting only inside the branch
                    # above would leak discarded tokens into the next one.
                    sentence = ''
                    n = n + 1
                else:
                    # Building the tagged sentence: append "word|lemma|pos|label".
                    sentence = sentence + ' ' + ('|'.join(columns[1:4]) + '|' + flag + ' ')

    # Report both counts separately (the original concatenated the two
    # numbers into one unreadable string and added a spurious +1).
    print("Total sentences: " + str(n) + "; sentences kept: " + str(len(lista)))

    # Split 70/30 into training and test indices. Sample over the full
    # index range: range(len(lista)-1) would silently exclude the last
    # sentence from both partitions.
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [i for i in range(len(lista)) if i not in trainingIndex]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
...@@ -299,7 +299,7 @@ if __name__ == "__main__": ...@@ -299,7 +299,7 @@ if __name__ == "__main__":
299 299
300 # Original: labels = list(crf.classes_) 300 # Original: labels = list(crf.classes_)
301 # Original: labels.remove('O') 301 # Original: labels.remove('O')
302 - labels = list(['Air', 'Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess']) 302 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
303 303
304 # use the same metric for evaluation 304 # use the same metric for evaluation
305 f1_scorer = make_scorer(metrics.flat_f1_score, 305 f1_scorer = make_scorer(metrics.flat_f1_score,
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.