Showing 5 changed files with 301 additions and 1 deletion
CRF/bin/label-split_training_test_v2.py
0 → 100644
#!/bin/python3
"""Label CoreNLP-tagged tokens and split sentences 70/30 into training/test sets.

Reads a CoNLL file produced by CoreNLP tagging, joins each token's
word/lemma/POS columns with '|' plus the current entity label, keeps only
sentences that contain at least one true (non-'O') tag, and writes a random
70% of them to a training file and the remaining 30% to a test file.

Input parameters
  --inputPath=PATH      Path of the input file
  --inputFile=FILE      File with CoreNLP-tagging sentences (CoNLL format)
  --outputPath=PATH     Path to place the output files
  --trainingFile=FILE   Output training data set
  --testFile=FILE       Output test data set

Example
  python label-split_training_test_v2.py \
      --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ \
      --inputFile raw-metadata-senteneces.txt.conll \
      --trainingFile training-data-set-70_v2.txt \
      --testFile test-data-set-30_v2.txt \
      --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
"""
from optparse import OptionParser
import os
import random

# Opening tags: tokens that follow them are labeled with the mapped entity.
IN_LABELS = {
    '<Gtype>': 'Gtype',
    '<Gversion>': 'Gversion',
    '<Med>': 'Med',
    '<Phase>': 'Phase',
    '<Supp>': 'Supp',
    '<Technique>': 'Technique',
    '<Temp>': 'Temp',
    '<OD>': 'OD',
    '<Anti>': 'Anti'
}
# Closing tags and non-entity tags: they reset the running label to 'O'.
OUT_LABELS = {
    '<Air>': 'O',
    '</Air>': 'O',
    '</Gtype>': 'O',
    '</Gversion>': 'O',
    '</Med>': 'O',
    '</Phase>': 'O',
    '<Sample>': 'O',
    '</Sample>': 'O',
    '<Serie>': 'O',
    '</Serie>': 'O',
    '<Strain>': 'O',
    '</Strain>': 'O',
    '<Substrain>': 'O',
    '</Substrain>': 'O',
    '</Supp>': 'O',
    '</Technique>': 'O',
    '</Temp>': 'O',
    '</OD>': 'O',
    '<Agit>': 'O',
    '</Agit>': 'O',
    '<Name>': 'O',
    '</Name>': 'O',
    '<Orgn>': 'O',
    '</Orgn>': 'O',
    '</Anti>': 'O',
    '<Vess>': 'O',
    '</Vess>': 'O'}

# Sentinel token that marks the end of a sentence in the CoNLL input.
SENTENCE_DELIMITER = "PGCGROWTHCONDITIONS"


def extract_tagged_sentences(lines, in_labels, out_labels):
    """Build '|'-joined, label-suffixed sentences from CoNLL lines.

    lines      -- iterable of tab-separated CoNLL lines (index, word, lemma,
                  POS, ...); lines with fewer than two columns are skipped.
    in_labels  -- opening tag -> entity label mapping.
    out_labels -- closing/ignored tag -> 'O' mapping.

    Returns the list of sentences that contain at least one true tag.  Each
    token is rendered as 'word|lemma|POS|label'; the exact (space-heavy)
    concatenation of the original script is preserved because downstream
    CRF scripts parse this format.
    """
    true_tags = set(in_labels.values())   # hoisted: O(1) membership per token
    flag = 'O'        # current label; note it deliberately persists across tokens
    sentences = []
    sentence = ''
    for line in lines:
        columns = line.split('\t')
        if len(columns) <= 1:             # blank / malformed line
            continue
        w = columns[1]
        if w in in_labels:
            flag = in_labels[w]           # opening tag: label the next tokens
        elif w in out_labels:
            flag = out_labels[w]          # closing tag: back to 'O'
        elif w == SENTENCE_DELIMITER:
            # End of sentence: keep it only if at least one token carries a
            # true (non-'O') tag.
            if any(token.split('|')[-1] in true_tags
                   for token in sentence.split(' ')):
                sentences.append(sentence)
            sentence = ''                 # start a new sentence
        else:
            sentence = sentence + ' ' + ('|'.join(columns[1:4]) + '|' + flag + ' ')
    return sentences


def split_indices(count, training_fraction=0.70):
    """Randomly partition range(count) into (training, test) index lists.

    Fixes the original off-by-one: sampling from range(count - 1) silently
    excluded the last sentence from BOTH data sets.
    """
    training = random.sample(range(count), int(count * training_fraction))
    chosen = set(training)                # O(1) lookups instead of list scans
    test = [i for i in range(count) if i not in chosen]
    return training, test


def main():
    """CLI entry point: parse options, extract sentences, write both splits."""
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if args:
        # parser.error() prints the message and exits with status 2; the
        # original's sys.exit(1) after it was unreachable — and 'sys' was
        # never imported, so it would have raised NameError anyway.
        parser.error("This script takes no positional arguments.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        sentences = extract_tagged_sentences(input_file, IN_LABELS, OUT_LABELS)

    print("Number of sentences: " + str(len(sentences)))

    # Split 70/30 into training and test sentences.
    training_index, test_index = split_indices(len(sentences))

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as ofile:
        ofile.write('\n'.join(sentences[i] for i in training_index))

    with open(os.path.join(options.outputPath, options.testFile), "w") as ofile:
        ofile.write('\n'.join(sentences[i] for i in test_index))

    print("==================================END===================================")


if __name__ == "__main__":
    main()
CRF/bin/label-split_training_test_v2.py.save
0 → 100644
#!/bin/python3
"""Label CoreNLP-tagged tokens and split sentences 70/30 into training/test sets.

(.save backup variant: also treats <Agit> and <Vess> as true entity tags.)

Reads a CoNLL file produced by CoreNLP tagging, joins each token's
word/lemma/POS columns with '|' plus the current entity label, keeps only
sentences that contain at least one true tag, and writes a random 70% of
them to a training file and the remaining 30% to a test file.

Input parameters
  --inputPath=PATH      Path of the input file
  --inputFile=FILE      File with CoreNLP-tagging sentences (CoNLL format)
  --outputPath=PATH     Path to place the output files
  --trainingFile=FILE   Output training data set
  --testFile=FILE       Output test data set

Example
  python label-split_training_test_v2.py --inputPath .../CoreNLP/output/ \
      --inputFile raw-metadata-senteneces.txt.conll \
      --trainingFile training-data-set-70_v2.txt \
      --testFile test-data-set-30_v2.txt --outputPath .../CRF/data-sets
"""
from optparse import OptionParser
import os
import random


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2; the
        # original's sys.exit(1) after it was unreachable — and 'sys' was
        # never imported, so it would have raised NameError anyway.
        parser.error("This script takes no positional arguments.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    # Opening tags: tokens that follow them are labeled with the mapped entity.
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Vess>': 'Vess'
    }
    # Closing tags and non-entity tags: they reset the running label to 'O'.
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Vess>': 'O'}

    # Hoisted: O(1) membership test per token instead of scanning .values().
    true_tags = set(in_labels.values())
    # Current label carried forward to plain tokens.
    flag = 'O'
    # Total sentences read (kept and discarded).
    n = 0
    # Sentences kept (at least one true tag).
    lista = []
    # First sentence accumulator.
    sentence = ''
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging: opening tag sets the entity, closing resets to 'O'.
                    if w in in_labels: flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence.
                        words = sentence.split(' ')
                        # BUG FIX: the original filtered on undefined name
                        # 'word' (NameError at runtime); it must be 'tag'.
                        tags = [tag for tag in words if tag.split('|')[-1] in true_tags]
                        # At least one true tag on the sentence.
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence.
                        sentence = ''
                        n = n + 1
                    else:
                        # Build the tagged sentence: word|lemma|POS|label.
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]) + '|' + flag + ' ')

    # BUG FIX: the original printed str(n) + str(len(lista)+1) — two numbers
    # fused together, the second off by one.
    print("Number of sentences read: " + str(n) + ", kept: " + str(len(lista)))

    # Split 70/30 into training and test sentences.
    # BUG FIX: range(len(lista) - 1) silently dropped the last sentence
    # from both data sets.
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    chosen = set(trainingIndex)   # O(1) lookups instead of O(n) list scans
    testIndex = [i for i in range(len(lista)) if i not in chosen]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
... | @@ -299,7 +299,7 @@ if __name__ == "__main__": | ... | @@ -299,7 +299,7 @@ if __name__ == "__main__": |
299 | 299 | ||
300 | # Original: labels = list(crf.classes_) | 300 | # Original: labels = list(crf.classes_) |
301 | # Original: labels.remove('O') | 301 | # Original: labels.remove('O') |
302 | - labels = list(['Air', 'Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess']) | 302 | + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti']) |
303 | 303 | ||
304 | # use the same metric for evaluation | 304 | # use the same metric for evaluation |
305 | f1_scorer = make_scorer(metrics.flat_f1_score, | 305 | f1_scorer = make_scorer(metrics.flat_f1_score, | ... | ... |
CRF/bin/training_validation_v4.py
0 → 100644
This diff is collapsed. Click to expand it.
CRF/bin/training_validation_v5.py
0 → 100644
This diff is collapsed. Click to expand it.
-
Please register or log in to post a comment