# label-split_training_test_v4.py
from optparse import OptionParser
import re
import os
import random


# Objective
# Label each token (label appended after '|') in the CoreNLP-tagged sentences and split the
# sentences 70/30 into training and test files
#
# Input parameters
# --inputPath=PATH    		Path of the input file
# --inputFile=FILE 		CoreNLP output file with tagged sentences
# --outputPath=PATH   		Path to place the output files
# --trainingFile=FILE  	Output training data set
# --testFile=FILE  	  	Output test data set
# --index=N			Limit CoreNLP output column to include
#
# Output
# training and test data set
#
# Examples
# python label-split_training_test_v4.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig
# --inputFile raw-metadata-senteneces_v2.txt.conll
# --trainingFile training-data-set-70_v4.txt
# --testFile test-data-set-30_v4.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
# --index 5
# 
# python label-split_training_test_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
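#
# Expected line formats (illustrative; the exact CoreNLP column layout depends on the
# annotators used to produce the .conll file, and the column names below are only an example):
#   input : tab-separated CoNLL lines with the token in the second column; the literal
#           token PGCGROWTHCONDITIONS marks the end of each sentence
#   output: one sentence per line, each token written as its selected CoreNLP columns
#           joined by '|' followed by its entity label, e.g. fructose|NN|fructose|Supp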


##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",help="File with test data set", metavar="FILE")
    parser.add_option("--index", dest="index",help="Select a limit CoreNLP output column", metavar='N', type=int)

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("Unexpected positional arguments given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print("CoreNLP output choosen colums: 1-" + str(options.index))
    print('-------------------------------- PROCESSING --------------------------------')
    ## begin of tagging
    in_labels = {
     '<Gtype>': 'Gtype',
     '<Gversion>': 'Gversion',
     '<Med>': 'Med',
     '<Phase>': 'Phase',        
     '<Substrain>': 'Substrain',
     '<Supp>': 'Supp',
     '<Strain>': 'Strain',
     '<Technique>': 'Technique',
     '<Temp>': 'Temp',
     '<OD>': 'OD',
     '<Anti>': 'Anti',
     '<Agit>': 'Agit',
     '<Air>': 'Air',    
     '<Vess>': 'Vess',
     '<pH>': 'pH'
    }
    ## End of tagging
    out_labels = {
     '</Gtype>': 'O',
     '</Gversion>': 'O',
     '</Med>': 'O',
     '</Phase>': 'O',
     '</Substrain>': 'O',
     '</Supp>': 'O',
     '</Strain>': 'O',
     '</Technique>': 'O',
     '</Temp>': 'O',
     '</OD>': 'O',
     '</Anti>': 'O',
     '</Agit>': 'O',
     '</Air>': 'O',
     '</Vess>': 'O',
     '</pH>': 'O'}
    old_labels = {
     '<Orgn>': 'O',
     '</Orgn>': 'O'
     }
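    # The three dictionaries above drive the labelling: in_labels maps each opening tag token
    # to the entity label applied to the tokens that follow it; out_labels maps the matching
    # closing tags back to the default label 'O'; the deprecated <Orgn> tag tokens in
    # old_labels are recognised but excluded from the output sentences.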
    
    # Default label for tokens outside any tag
    flag = 'O'
    # Collected tagged sentences
    lista = []
    # Current sentence being built
    sentence = ''
    # Total number of sentences read from the CoreNLP output
    n = 0
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                if w in in_labels or w in out_labels:
                    # Tagging: an opening tag sets the current label, a closing tag resets it to 'O'
                    if w in in_labels: flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        # End of sentence
                        n = n + 1
                        words = sentence.split(' ')
                        # Keep the sentence only if it contains at least one true tag
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        if len(tags) > 0:
                            lista.append(sentence)
                        # New sentence
                        sentence = ''
                    elif w not in old_labels:
                        # Build the tagged sentence: selected CoreNLP columns joined by '|', followed by the label
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]) + '|' + flag + ' ')

    print("Number of sentences with at least one tag: " + str(len(lista)))		
    print("Number of sentences from CoreNLP: " + str(n))


    # Randomly split the sentences 70/30 into training and test sets
    trainingIndex = random.sample(range(len(lista)), int(len(lista)*.70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
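    # Note: random.sample is not seeded here, so the 70/30 partition differs between runs;
    # calling random.seed(<some constant>) before the sample would make the split reproducible.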
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
      Data = [lista[i]  for i in trainingIndex]
      oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
      Data = [lista[i]  for i in testIndex]
      oFile.write('\n'.join(Data))

    print("==================================END===================================")