# label-split_training_test_v2.py 5.38 KB
#!/bin/python3    
from optparse import OptionParser
import re
import os
import random


# Objective
# Labeled tokens are separated by '|'; split sentences 70/30 into training and test files from CoreNLP tagging
# make data sets using only sentences with at least one true-tag
#
# Input parameters
# --inputPath=PATH    		Path of inputfile
# --outputPath=PATH   		Path to place output files
# --trainingFile=trainingFile  	Output training data set
# --testFile=testFile  	  	Output test data set
#
# Output
# training and test data set
#
# Examples
# python label-split_training_test_v2.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile sentences.tsv_pakal_.conll
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
#
# 
# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets


##########################################
#               MAIN PROGRAM             #
##########################################

def extract_sentences(conll_lines, in_labels, out_labels):
    """Convert CoreNLP CoNLL-style lines into '|'-joined labelled sentences.

    Each token line is tab-separated; columns 1-3 (word, lemma, POS) are
    joined with '|' and suffixed with the current entity flag.  A line whose
    word column equals "PGCGROWTHCONDITIONS" ends the current sentence; the
    sentence is kept only if at least one of its tokens carries a true
    (non-'O') tag.

    :param conll_lines: iterable of tab-separated CoNLL lines
    :param in_labels:   mapping of opening XML-like tags to entity flags
    :param out_labels:  mapping of closing/ignored tags that reset the flag to 'O'
    :return: list of labelled sentence strings
    """
    flag = 'O'                           # current entity flag ('O' = outside any span)
    sentences = []
    sentence = ''
    true_tags = set(in_labels.values())  # flags that count as a "true" tag
    for line in conll_lines:
        columns = line.split('\t')
        if len(columns) <= 1:
            # Not a token line (blank / metadata): skip.
            continue
        w = columns[1]
        if w in in_labels:
            flag = in_labels[w]
        elif w in out_labels:
            flag = out_labels[w]
        elif w == "PGCGROWTHCONDITIONS":
            # End of sentence: keep it only if at least one token has a true tag.
            if any(token.split('|')[-1] in true_tags
                   for token in sentence.split(' ')):
                sentences.append(sentence)
            # Start a new sentence (flag intentionally carries over; closing
            # tags are what reset it to 'O').
            sentence = ''
        else:
            # Append "word|lemma|POS|flag", space-padded as in the original format.
            sentence = sentence + ' ' + ('|'.join(columns[1:4]) + '|' + flag + ' ')
    return sentences


if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if args:
        # parser.error() prints the message and exits with status 2, so no
        # explicit sys.exit() is needed (the original called an un-imported
        # `sys`, which would have raised NameError had it been reached).
        parser.error("No positional arguments are expected.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    # Opening tags that begin an entity span, mapped to the flag used for
    # every token until the span is closed.
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti'
    }
    # Closing tags (and tags of ignored span types): all reset the flag to 'O'.
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '<Agit>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Anti>': 'O',
        '<Vess>': 'O',
        '</Vess>': 'O'}

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = extract_sentences(input_file, in_labels, out_labels)

    print("Number of sentences: " + str(len(lista)))

    # Split 70/30 into training and test sentences.
    # range(len(lista)) -- not len(lista)-1 -- so the last sentence is also
    # eligible; the original excluded it from both sets.
    all_indices = range(len(lista))
    trainingIndex = random.sample(all_indices, int(len(lista) * .70))
    chosen = set(trainingIndex)  # set for O(1) membership tests
    testIndex = [n for n in all_indices if n not in chosen]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in trainingIndex))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in testIndex))

    print("==================================END===================================")