label-split_training_test_v2.py
5.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
#!/bin/python3
from optparse import OptionParser
import re
import os
import random
# Objective
# Label each token (fields separated by '|') and split the sentences 70/30 into training and test files from the CoreNLP tagging
# Build the data sets using only sentences with at least one true tag
#
# Input parameters
# --inputPath=PATH Path of the input file
# --inputFile=FILE File with CoreNLP-tagged sentences
# --outputPath=PATH Path to place output files
# --trainingFile=FILE Output training data set
# --testFile=FILE Output test data set
#
# Output
# training and test data set
#
# Examples
# python label-split_training_test_v2.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile sentences.tsv_pakal_.conll
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
#
#
# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
##########################################
# MAIN PROGRAM #
##########################################
# Opening tags: every token that follows one of these is labeled with the
# mapped value until another tag changes the current label.
_IN_LABELS = {
    '<Gtype>': 'Gtype',
    '<Gversion>': 'Gversion',
    '<Med>': 'Med',
    '<Phase>': 'Phase',
    '<Supp>': 'Supp',
    '<Technique>': 'Technique',
    '<Temp>': 'Temp',
    '<OD>': 'OD',
    '<Anti>': 'Anti',
}

# Closing tags, plus the opening/closing tags that are not kept as labels:
# all of them switch the current label back to 'O' (other).
_OUT_LABELS = {
    '<Air>': 'O',
    '</Air>': 'O',
    '</Gtype>': 'O',
    '</Gversion>': 'O',
    '</Med>': 'O',
    '</Phase>': 'O',
    '<Sample>': 'O',
    '</Sample>': 'O',
    '<Serie>': 'O',
    '</Serie>': 'O',
    '<Strain>': 'O',
    '</Strain>': 'O',
    '<Substrain>': 'O',
    '</Substrain>': 'O',
    '</Supp>': 'O',
    '</Technique>': 'O',
    '</Temp>': 'O',
    '</OD>': 'O',
    '<Agit>': 'O',
    '</Agit>': 'O',
    '<Name>': 'O',
    '</Name>': 'O',
    '<Orgn>': 'O',
    '</Orgn>': 'O',
    '</Anti>': 'O',
    '<Vess>': 'O',
    '</Vess>': 'O',
}

# Token in the second column that marks the end of a sentence.
_SENTENCE_END = "PGCGROWTHCONDITIONS"


def _read_labeled_sentences(lines):
    """Build labeled sentences from tab-separated tagging lines.

    Each line with at least two tab-separated columns is inspected through
    its second column.  Tag tokens only update the current label, the
    sentence-end marker closes the sentence, and every other token is
    appended as ``columns[1:4]`` joined by '|' followed by the current label.

    Only sentences containing at least one token labeled with a value of
    ``_IN_LABELS`` are returned.  Tokens after the last sentence-end marker
    are discarded, and the current label is not reset between sentences.
    """
    tag_flags = dict(_IN_LABELS)
    tag_flags.update(_OUT_LABELS)
    true_labels = set(_IN_LABELS.values())

    flag = 'O'
    sentences = []
    tokens = []
    has_true_tag = False
    for line in lines:
        # Strip the line ending so it cannot leak into a token when the row
        # has fewer than five columns.
        columns = line.rstrip('\n').split('\t')
        if len(columns) < 2:
            continue
        word = columns[1]
        if word in tag_flags:
            flag = tag_flags[word]
        elif word == _SENTENCE_END:
            if has_true_tag:
                sentences.append(''.join(tokens))
            tokens = []
            has_true_tag = False
        else:
            # Same layout as before: each token is wrapped in single spaces.
            tokens.append(' ' + '|'.join(columns[1:4]) + '|' + flag + ' ')
            if flag in true_labels:
                has_true_tag = True
    return sentences


def _split_indices(total, training_fraction=0.70):
    """Return ``(training, test)`` index lists covering ``range(total)``.

    ``int(total * training_fraction)`` indices are drawn at random for
    training; every remaining index, in ascending order, goes to test.
    """
    # Sample over the full range: the previous range(total - 1) silently
    # dropped the last sentence from both data sets.
    training = random.sample(range(total), int(total * training_fraction))
    chosen = set(training)
    test = [index for index in range(total) if index not in chosen]
    return training, test


def main(argv=None):
    """Label the CoreNLP output and write the 70/30 training/test files.

    ``argv`` is the list of command-line arguments; when it is None the
    arguments are taken from the process command line by ``OptionParser``.
    Exits through ``parser.error`` when a positional argument is given or a
    required option is missing.
    """
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    (options, args) = parser.parse_args(argv)
    if len(args) > 0:
        # parser.error prints the usage message and exits by itself.
        parser.error("Any parameter given.")
    for name in ("inputPath", "inputFile", "outputPath",
                 "trainingFile", "testFile"):
        if getattr(options, name) is None:
            parser.error("Missing required option --" + name)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = _read_labeled_sentences(input_file)
    print("Number of sentences: " + str(len(lista)))

    # Split 70 30 training and test sentences
    trainingIndex, testIndex = _split_indices(len(lista))
    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in trainingIndex))
    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in testIndex))
    print("==================================END===================================")


if __name__ == "__main__":
    main()