built_bg_sentences.py
3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/bin/python3
import os
from optparse import OptionParser
# Objective
# Rebuild sentences from CoreNLP tagging files; the fields of each token are joined by '|'
#
# Input parameters
# --inputPath=PATH Path of the input file
# --inputFile=FILE File with CoreNLP-tagged sentences
# --outputPath=PATH Path to place output files
# --outputFile=FILE Output data set
#
# Output
# Tagged sentences reconstruction
#
# Examples
# python built_bg_sentences.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile annot-input_bg.txt
# --outputFile input_bg_sentences.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output --outputFile annot-input_bg.txt
##########################################
# MAIN PROGRAM #
##########################################
def rebuild_sentences(lines, end_marker="PGCGROWTHCONDITIONS"):
    """Rebuild tagged sentences from tab-separated token lines.

    Each input line is split on tabs. Lines with fewer than two fields are
    ignored. For every other line, the second field is inspected:

    * if it equals ``end_marker``, the sentence accumulated so far is closed
      and stored (even when it is empty);
    * otherwise fields 2 to 4 (indexes 1..3) are joined with ``'|'`` and
      appended to the current sentence.

    Every token is preceded by a single space, so a non-empty sentence
    starts with a leading space.

    Args:
        lines: Iterable of text lines (for example an open text file).
        end_marker: Value of the second field that marks the end of a
            sentence.

    Returns:
        list[str]: The rebuilt sentences, in input order.
    """
    sentences = []
    tokens = []
    for line in lines:
        # Strip the line terminator first: otherwise a marker that happens to
        # be the last field is never recognised, and a short token line would
        # leak its '\n' into the rebuilt sentence.
        fields = line.rstrip("\n").split("\t")
        if len(fields) < 2:
            continue
        if fields[1] == end_marker:
            # End of sentence: store it and start a new one.
            sentences.append("".join(" " + token for token in tokens))
            tokens = []
        else:
            tokens.append("|".join(fields[1:4]))
    # NOTE(review): tokens found after the last end marker are discarded,
    # exactly as before; confirm whether the input always ends with a marker.
    return sentences


if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
    (options, args) = parser.parse_args()

    # parser.error() prints the usage message and exits with status 2, so no
    # explicit sys.exit() is needed afterwards.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))

    # All four options are needed to build the input and output paths; report
    # a usage error instead of failing later with a TypeError on None.
    missing = [
        "--" + name
        for name in ("inputPath", "inputFile", "outputPath", "outputFile")
        if getattr(options, name) is None
    ]
    if missing:
        parser.error("Missing required options: " + ", ".join(missing))

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging bg-sentences: " + str(options.inputFile))
    print("Path to save data set: " + str(options.outputPath))
    print("File to save recontrsucted bg-sentences: " + str(options.outputFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = rebuild_sentences(input_file)

    # One stored sentence per end marker seen in the input.
    print("Number of sentences: " + str(len(lista)))

    # Sentences are written one per line, without a trailing newline.
    with open(os.path.join(options.outputPath, options.outputFile), "w") as oFile:
        oFile.write('\n'.join(lista))
    print("==================================END===================================")