built_bg_sentences.py
4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/python3
import os
from optparse import OptionParser
# Objective
# Labeled sentences, separated by '|', from CoreNLP-tagging files
#
# Input parameters
# --inputPath=PATH Path of inputfile
# --outputPath=PATH Path to place output files
# --outputFile=File Output data set
# --minWordLen Minimum word length
# --minSenLen Minimum sentence length
# --index Select a limit CoreNLP output column
#
# Output
# Tagged sentences reconstruction
#
# Examples
# python label-split_training_test_v1.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
# --inputFile bg_sentences_v2.txt.ner
# --outputFile annot-input_bg.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
# --minWordLen 2
# --minSenLen 1
# --index 5
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
##########################################
# MAIN PROGRAM #
##########################################
def build_tagged_sentences(tagged_lines, min_sen_len, min_word_len, index=None):
    """Rebuild '|'-joined tagged sentences from CoreNLP tab-separated lines.

    Each input line is tab-separated CoreNLP output; columns 1..index-1 are
    joined with '|' to form one token, and tokens are accumulated until a
    line whose second column is the PGCGROWTHCONDITIONS sentinel ends the
    current sentence.

    Parameters:
        tagged_lines -- iterable of tab-separated CoreNLP output lines
        min_sen_len  -- a finished sentence is excluded when it has at most
                        this many tokens AND its first bare word is short
        min_word_len -- maximum first-word length used by the exclusion test
        index        -- exclusive upper bound of CoreNLP columns to keep;
                        None keeps every column after the token id

    Returns the list of reconstructed (kept) sentences. Excluded sentences
    are reported on stdout, mirroring the original script's behavior.
    """
    kept = []
    seen = 0          # running count of sentence sentinels, used in the EXCLUDE report
    sentence = ''
    for line in tagged_lines:
        fields = line.split('\t')
        if len(fields) <= 1:
            continue  # blank/degenerate line: not part of any token row
        if fields[1] == "PGCGROWTHCONDITIONS":
            seen += 1
            # Hoisted: the original recomputed sentence.lstrip().split(' ') three times.
            words = sentence.lstrip().split(' ')
            # Exclude only when the sentence is short AND its first bare word is short.
            if len(words) <= min_sen_len and len(words[0].split('|')[0]) <= min_word_len:
                print("EXCLUDE: " + str(seen) + "line" + sentence.lstrip())
            else:
                kept.append(sentence.lstrip())
            # Start a new sentence after every sentinel, kept or excluded.
            sentence = ''
        else:
            # Join the selected CoreNLP columns into a single 'word|tag|...' token.
            sentence = sentence + ' ' + ('|'.join(fields[1:index]))
    return kept


if __name__ == "__main__":
    # Command-line interface: reconstruct '|'-tagged sentences from a CoreNLP file.
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
    parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
    # Fixed copy-pasted help text (was "Minimum word length").
    parser.add_option("--minSenLen", dest="sL", help="Minimum sentence length", type="int")
    parser.add_option("--index", dest="index", help="Select a limit CoreNLP output column", metavar='N', type="int")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the process itself;
        # the original follow-up sys.exit(1) was unreachable and referenced an
        # unimported 'sys' module, so it has been removed.
        parser.error("Any parameter given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging bg-sentences: " + str(options.inputFile))
    print("Path to save data set: " + str(options.outputPath))
    print("File to save recontrsucted bg-sentences: " + str(options.outputFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = build_tagged_sentences(input_file, options.sL, options.wL, options.index)

    # len(lista) equals the original running counter 'n' (one increment per kept sentence).
    print("Number of sentences: " + str(len(lista)))
    with open(os.path.join(options.outputPath, options.outputFile), "w") as oFile:
        oFile.write('\n'.join(lista))
    print("==================================END===================================")