label-split_training_test_v4.py
5.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from optparse import OptionParser
import re
import os
import random
# Objective
# Label tokens (fields joined by '|') and split the sentences 70/30 into training and test files from CoreNLP tagging output
#
# Input parameters
# --inputPath=PATH Path of inputfile
# --inputFile Output CoreNLP file with tagging sentences
# --outputPath=PATH Path to place output files
# --trainingFile=FILE Output training data set
# --testFile=testFile Output test data set
# --index Select a limit CoreNLP output column
#
# Output
# training and test data set
#
# Examples
# python label-split_training_test_v1.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig
# --inputFile raw-metadata-senteneces_v2.txt.conll
# --trainingFile training-data-set-70_v4.txt
# --testFile test-data-set-30_v4.txt
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
# --index 5
#
# python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
##########################################
# MAIN PROGRAM #
##########################################
# Opening tags: every following token is labelled with the entity name
# until the matching closing tag is seen.
IN_LABELS = {
    '<Gtype>': 'Gtype',
    '<Gversion>': 'Gversion',
    '<Med>': 'Med',
    '<Phase>': 'Phase',
    '<Substrain>': 'Substrain',
    '<Supp>': 'Supp',
    '<Strain>': 'Strain',
    '<Technique>': 'Technique',
    '<Temp>': 'Temp',
    '<OD>': 'OD',
    '<Anti>': 'Anti',
    '<Agit>': 'Agit',
    '<Air>': 'Air',
    '<Vess>': 'Vess',
    '<pH>': 'pH',
}

# Closing tags: the current label falls back to 'O' (outside any entity).
OUT_LABELS = {
    '</Gtype>': 'O',
    '</Gversion>': 'O',
    '</Med>': 'O',
    '</Phase>': 'O',
    '</Substrain>': 'O',
    '</Supp>': 'O',
    '</Strain>': 'O',
    '</Technique>': 'O',
    '</Temp>': 'O',
    '</OD>': 'O',
    '</Anti>': 'O',
    '</Agit>': 'O',
    '</Air>': 'O',
    '</Vess>': 'O',
    '</pH>': 'O',
}

# Tags that are dropped from the output and do not change the current label.
OLD_LABELS = {
    '<Orgn>': 'O',
    '</Orgn>': 'O',
}

# Labels that count as a "true" tag when deciding whether to keep a sentence.
ENTITY_LABELS = frozenset(IN_LABELS.values())

# Token (second column) that marks the end of a sentence in the input.
SENTENCE_END = "PGCGROWTHCONDITIONS"


def label_sentences(lines, index=None):
    """Build '|'-labelled sentences from tab-separated tagging lines.

    Each line is split on tabs; lines with fewer than two fields are ignored.
    The second field is the token. Opening/closing tags switch the current
    label, the sentence-end marker closes the sentence being built, tags in
    OLD_LABELS are skipped, and any other token is emitted as
    ``' ' + '|'.join(fields[1:index]) + '|' + label + ' '``.

    Only sentences containing at least one token labelled with an entity
    label are kept. Tokens found after the last sentence-end marker are
    discarded, and the current label is not reset between sentences.

    Args:
        lines: iterable of text lines (e.g. an open file).
        index: exclusive upper bound of the fields kept per token; ``None``
            keeps every field from the second one onwards.

    Returns:
        A ``(sentences, total)`` tuple: the kept sentences and the number of
        sentence-end markers seen.
    """
    label = 'O'
    sentences = []
    pieces = []
    has_entity = False
    total = 0
    for line in lines:
        # Strip the line terminator so it never leaks into the last field
        # (it would otherwise end up inside a token when index is None or
        # larger than the number of columns).
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            continue
        word = fields[1]
        if word in IN_LABELS:
            label = IN_LABELS[word]
        elif word in OUT_LABELS:
            label = OUT_LABELS[word]
        elif word == SENTENCE_END:
            total += 1
            if has_entity:
                sentences.append(''.join(pieces))
            pieces = []
            has_entity = False
        elif word not in OLD_LABELS:
            pieces.append(' ' + '|'.join(fields[1:index]) + '|' + label + ' ')
            if label in ENTITY_LABELS:
                has_entity = True
    return sentences, total


def split_indices(count, training_fraction=0.70):
    """Randomly split ``range(count)`` into training and test index lists.

    Args:
        count: number of items to split.
        training_fraction: share of items sampled for training; the sample
            size is ``int(count * training_fraction)``.

    Returns:
        A ``(training, test)`` tuple of index lists. Training indices are in
        sampled order, test indices are in ascending order.
    """
    training = random.sample(range(count), int(count * training_fraction))
    # Set membership keeps building the complement linear in count.
    chosen = set(training)
    test = [i for i in range(count) if i not in chosen]
    return training, test


def main(argv=None):
    """Parse the command line, label the sentences and write both data sets.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``
            (optparse default).
    """
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile", help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE")
    parser.add_option("--index", dest="index", help="Select a limit CoreNLP output column", metavar='N', type=int)
    (options, args) = parser.parse_args(argv)

    # parser.error() prints the usage message and exits, so no explicit
    # exit call is needed after it.
    if args:
        parser.error("Unexpected positional argument(s): " + " ".join(args))
    required = ("inputPath", "inputFile", "outputPath", "trainingFile", "testFile")
    missing = ["--" + name for name in required if getattr(options, name) is None]
    if missing:
        parser.error("Missing required option(s): " + ", ".join(missing))

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print("CoreNLP output choosen colums: 1-" + str(options.index))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        sentences, total = label_sentences(input_file, options.index)
    print("Number of sentences with at least one tag: " + str(len(sentences)))
    print("Number of sentences from CoreNLP: " + str(total))

    # Split 70 30 training and test sentences
    training_index, test_index = split_indices(len(sentences), 0.70)
    print("Number of sentences for training: " + str(len(training_index)))
    print("Number of sentences for test: " + str(len(test_index)))

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as out_file:
        out_file.write('\n'.join(sentences[i] for i in training_index))
    with open(os.path.join(options.outputPath, options.testFile), "w") as out_file:
        out_file.write('\n'.join(sentences[i] for i in test_index))
    print("==================================END===================================")


if __name__ == "__main__":
    main()