built_bg_sentences.py
4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/bin/python3
import os
from optparse import OptionParser
# Objective
# Labeled sentences, separated by '|', from CoreNLP-tagging files
#
# Input parameters
# --inputPath=PATH Path of inputfile
# --outputPath=PATH Path to place output files
# --outputFile=File Output data set
# --minWordLen Minimum word length
# --minSenLen Minimum sentence length
# --index Select a limit CoreNLP output column
#
# Output
# Tagged sentences reconstruction
#
# Examples
# python label-split_training_test_v1.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation
# --inputFile bg_sentences_v2.txt.ner
# --outputFile annot-input_bg.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
# --minWordLen 2
# --minSenLen 1
# --index 5
#
#python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1
##########################################
# MAIN PROGRAM #
##########################################
def build_tagged_sentences(tagged_lines, min_sen_len, min_word_len, index=None):
    """Rebuild '|'-joined tagged sentences from CoreNLP tab-separated lines.

    Each input line is tab-separated CoreNLP output; columns 1..index-1 are
    joined with '|' to form one token, and tokens are accumulated until a
    line whose second column is the PGCGROWTHCONDITIONS sentinel ends the
    current sentence.

    Parameters:
        tagged_lines -- iterable of tab-separated CoreNLP output lines
        min_sen_len  -- a finished sentence is excluded when it has at most
                        this many tokens AND its first bare word is short
        min_word_len -- maximum first-word length used by the exclusion test
        index        -- exclusive upper bound of CoreNLP columns to keep;
                        None keeps every column after the token id

    Returns the list of reconstructed (kept) sentences. Excluded sentences
    are reported on stdout, mirroring the original script's behavior.
    """
    kept = []
    seen = 0          # running count of sentence sentinels, used in the EXCLUDE report
    sentence = ''
    for line in tagged_lines:
        fields = line.split('\t')
        if len(fields) <= 1:
            continue  # blank/degenerate line: not part of any token row
        if fields[1] == "PGCGROWTHCONDITIONS":
            seen += 1
            # Hoisted: the original recomputed sentence.lstrip().split(' ') three times.
            words = sentence.lstrip().split(' ')
            # Exclude only when the sentence is short AND its first bare word is short.
            if len(words) <= min_sen_len and len(words[0].split('|')[0]) <= min_word_len:
                print("EXCLUDE: " + str(seen) + "line" + sentence.lstrip())
            else:
                kept.append(sentence.lstrip())
            # Start a new sentence after every sentinel, kept or excluded.
            sentence = ''
        else:
            # Join the selected CoreNLP columns into a single 'word|tag|...' token.
            sentence = sentence + ' ' + ('|'.join(fields[1:index]))
    return kept


if __name__ == "__main__":
    # Command-line interface: reconstruct '|'-tagged sentences from a CoreNLP file.
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE")
    parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
    # Fixed copy-pasted help text (was "Minimum word length").
    parser.add_option("--minSenLen", dest="sL", help="Minimum sentence length", type="int")
    parser.add_option("--index", dest="index", help="Select a limit CoreNLP output column", metavar='N', type="int")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the process itself;
        # the original follow-up sys.exit(1) was unreachable and referenced an
        # unimported 'sys' module, so it has been removed.
        parser.error("Any parameter given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging bg-sentences: " + str(options.inputFile))
    print("Path to save data set: " + str(options.outputPath))
    print("File to save recontrsucted bg-sentences: " + str(options.outputFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = build_tagged_sentences(input_file, options.sL, options.wL, options.index)

    # len(lista) equals the original running counter 'n' (one increment per kept sentence).
    print("Number of sentences: " + str(len(lista)))
    with open(os.path.join(options.outputPath, options.outputFile), "w") as oFile:
        oFile.write('\n'.join(lista))
    print("==================================END===================================")