built_bg_sentences.py
3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/python3
import os
from optparse import OptionParser
# Objective
# Rebuild labeled sentences from CoreNLP tagging files; the fields of each token are joined by '|'
#
# Input parameters
# --inputPath=PATH    		Path of the input file
# --inputFile=FILE    		File with CoreNLP-tagged sentences
# --outputPath=PATH   		Path to place output files
# --outputFile=FILE        	Output file with the reconstructed sentences
#
# Output
# Tagged sentences reconstruction
#
# Examples
# python built_bg_sentences.py
# --inputPath               /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile               annot-input_bg.txt
# --outputFile              input_bg_sentences.txt
# --outputPath              /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output --outputFile annot-input_bg.txt
##########################################
#                 HELPERS                #
##########################################
def build_sentences(lines, end_marker="PGCGROWTHCONDITIONS"):
    """Rebuild tagged sentences from tab-separated tagging lines.

    Each line is split on tabs. Lines with fewer than two columns are
    ignored. For every other line the second column is inspected: when it
    equals ``end_marker`` the sentence accumulated so far is closed,
    otherwise columns 2-4 are joined with ``'|'`` and appended to the
    current sentence, preceded by a single space.

    Args:
        lines: Iterable of text lines (for example an open file).
        end_marker: Value of the second column that closes a sentence.

    Returns:
        List of reconstructed sentences, in input order. Every non-empty
        sentence starts with a space, as in the original output format.
        Tokens that follow the last marker are not returned.
    """
    sentences = []
    tokens = []
    for line in lines:
        # Strip the line terminator before splitting so that it never ends
        # up inside the last joined column or hides a marker that happens
        # to be the final column of its row.
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            continue
        if fields[1] == end_marker:
            # End of sentence: store it and start a new one.
            sentences.append(''.join(tokens))
            tokens = []
        else:
            tokens.append(' ' + '|'.join(fields[1:4]))
    return sentences


def main():
    """Parse the command line, rebuild the sentences and write them out.

    Reads ``--inputPath/--inputFile``, rebuilds the sentences with
    ``build_sentences`` and writes them, separated by newlines, to
    ``--outputPath/--outputFile``. Exits through ``parser.error`` when
    positional arguments are given or a required option is missing.
    """
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath",  dest="inputPath",  help="Path of output from CoreNLP",         metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files",   metavar="PATH")
    parser.add_option("--inputFile",  dest="inputFile",  help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--outputFile", dest="outputFile", help="File with training data set",         metavar="FILE")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the program,
        # so no explicit exit call is needed afterwards.
        parser.error("Any parameter given.")

    # All four options are needed to build the input and output paths;
    # fail with a usage message instead of a TypeError further down.
    missing = [
        name
        for name in ("inputPath", "inputFile", "outputPath", "outputFile")
        if getattr(options, name) is None
    ]
    if missing:
        parser.error("Missing required option(s): " + ", ".join("--" + name for name in missing))

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + str(options.inputPath))
    print("File with CoreNLP-tagging bg-sentences: " + str(options.inputFile))
    print("Path to save data set: " + str(options.outputPath))
    print("File to save recontrsucted bg-sentences: " + str(options.outputFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = build_sentences(input_file)

    # One sentence is counted per end-of-sentence marker found.
    print("Number of sentences: " + str(len(lista)))
    with open(os.path.join(options.outputPath, options.outputFile), "w") as oFile:
        oFile.write('\n'.join(lista))
    print("==================================END===================================")


##########################################
#               MAIN PROGRAM             #
##########################################
if __name__ == "__main__":
    main()