Estefani Gaytan Nunez

scripts

1 +#!/bin/python3
2 +from optparse import OptionParser
3 +import re
4 +import os
5 +import random
6 +
7 +
8 +# Objective
9 +# Label tokens separated by '|' and split sentences 70/30 into training and test files from CoreNLP-tagging
10 +# make data sets using only sentences with at least one true-tag
11 +#
12 +# Input parameters
13 +# --inputPath=PATH Path of inputfile
14 +# --outputPath=PATH Path to place output files
15 +# --trainingFile=testFile Output training data set
16 +# --testFile=testFile Output test data set
17 +#
18 +# Output
19 +# training and test data set
20 +#
21 +# Examples
22 +# python label-split_training_test_v2.py
23 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
24 +# --inputFile sentences.tsv_pakal_.conll
25 +# --trainingFile training-data-set-70.txt
26 +# --testFile test-data-set-30.txt
27 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
28 +#
29 +#
30 +# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
31 +
32 +
33 +##########################################
34 +# MAIN PROGRAM #
35 +##########################################
36 +
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Convert CoreNLP CoNLL output into '|'-joined labelled sentences and
    # write a 70/30 training/test split, keeping only sentences that
    # contain at least one true (non-'O') tag.
    # ------------------------------------------------------------------

    # Defining command-line parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2, so
        # no follow-up exit call is needed (the original sys.exit(1) was
        # unreachable and `sys` was never imported).
        parser.error("Unexpected positional arguments given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    # Opening tags that switch the running label to a true class.
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti'
    }
    # Tags that reset the running label to the outside label 'O'.
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '<Agit>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Anti>': 'O',
        '<Vess>': 'O',
        '</Vess>': 'O'}

    # Running label attached to each token; 'O' means outside any true tag.
    flag = 'O'
    # Sentences kept (those containing at least one true tag).
    lista = []
    # Tokens of the sentence currently being assembled.
    sentence = ''
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            columns = line.split('\t')
            if len(columns) > 1:
                w = columns[1]
                if w in in_labels or w in out_labels:
                    # A tag token: update the running label (the two dicts
                    # have disjoint keys, so exactly one branch applies).
                    if w in in_labels:
                        flag = in_labels[w]
                    else:
                        flag = out_labels[w]
                elif w == "PGCGROWTHCONDITIONS":
                    # End-of-sentence sentinel: keep the sentence only when
                    # at least one token carries a true tag.
                    words = sentence.split(' ')
                    tags = [tag for tag in words
                            if tag.split('|')[-1] in in_labels.values()]
                    if len(tags) > 0:
                        lista.append(sentence)
                    # Always start a fresh sentence, even when the previous
                    # one was discarded; resetting only inside the branch
                    # above would leak discarded tokens into the next one.
                    sentence = ''
                else:
                    # Regular token: append "word|lemma|pos|label".
                    sentence = sentence + ' ' + ('|'.join(columns[1:4]) + '|' + flag + ' ')

    print("Number of sentences: " + str(len(lista)))

    # Split 70/30 into training and test indices. Sample over the full
    # index range: range(len(lista)-1) would silently exclude the last
    # sentence from both partitions.
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
1 +#!/bin/python3
2 +from optparse import OptionParser
3 +import re
4 +import os
5 +import random
6 +
7 +
8 +# Objective
9 +# Label tokens separated by '|' and split sentences 70/30 into training and test files from CoreNLP-tagging
10 +# make data sets using only sentences with at least one true-tag
11 +#
12 +# Input parameters
13 +# --inputPath=PATH Path of inputfile
14 +# --outputPath=PATH Path to place output files
15 +# --trainingFile=testFile Output training data set
16 +# --testFile=testFile Output test data set
17 +#
18 +# Output
19 +# training and test data set
20 +#
21 +# Examples
22 +# python label-split_training_test_v2.py
23 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
24 +# --inputFile sentences.tsv_pakal_.conll
25 +# --trainingFile training-data-set-70.txt
26 +# --testFile test-data-set-30.txt
27 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
28 +#
29 +#
30 +# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
31 +
32 +
33 +##########################################
34 +# MAIN PROGRAM #
35 +##########################################
36 +
if __name__ == "__main__":
    # ------------------------------------------------------------------
    # Convert CoreNLP CoNLL output into '|'-joined labelled sentences and
    # write a 70/30 training/test split, keeping only sentences that
    # contain at least one true (non-'O') tag.
    # ------------------------------------------------------------------

    # Defining command-line parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2, so
        # no follow-up exit call is needed (the original sys.exit(1) was
        # unreachable and `sys` was never imported).
        parser.error("Unexpected positional arguments given.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    # Opening tags that switch the running label to a true class.
    in_labels = {
        '<Gtype>': 'Gtype',
        '<Gversion>': 'Gversion',
        '<Med>': 'Med',
        '<Phase>': 'Phase',
        '<Supp>': 'Supp',
        '<Technique>': 'Technique',
        '<Temp>': 'Temp',
        '<OD>': 'OD',
        '<Anti>': 'Anti',
        '<Agit>': 'Agit',
        '<Vess>': 'Vess'
    }
    # Tags that reset the running label to the outside label 'O'.
    out_labels = {
        '<Air>': 'O',
        '</Air>': 'O',
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '<Sample>': 'O',
        '</Sample>': 'O',
        '<Serie>': 'O',
        '</Serie>': 'O',
        '<Strain>': 'O',
        '</Strain>': 'O',
        '<Substrain>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '<Name>': 'O',
        '</Name>': 'O',
        '<Orgn>': 'O',
        '</Orgn>': 'O',
        '</Vess>': 'O'}

    # Running label attached to each token; 'O' means outside any true tag.
    flag = 'O'
    # Total sentences seen (kept or discarded).
    n = 0
    # Sentences kept (those containing at least one true tag).
    lista = []
    # Tokens of the sentence currently being assembled.
    sentence = ''
    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        for line in input_file:
            columns = line.split('\t')
            if len(columns) > 1:
                w = columns[1]
                if w in in_labels or w in out_labels:
                    # A tag token: update the running label (the two dicts
                    # have disjoint keys, so exactly one branch applies).
                    if w in in_labels:
                        flag = in_labels[w]
                    else:
                        flag = out_labels[w]
                elif w == "PGCGROWTHCONDITIONS":
                    # End-of-sentence sentinel: keep the sentence only when
                    # at least one token carries a true tag. Filter on the
                    # loop variable `tag` (the original referenced an
                    # undefined name `word`, raising NameError here).
                    words = sentence.split(' ')
                    tags = [tag for tag in words
                            if tag.split('|')[-1] in in_labels.values()]
                    if len(tags) > 0:
                        lista.append(sentence)
                    # Always start a fresh sentence, even when the previous
                    # one was discarded; resetting only inside the branch
                    # above would leak discarded tokens into the next one.
                    sentence = ''
                    n = n + 1
                else:
                    # Building the tagged sentence: append "word|lemma|pos|label".
                    sentence = sentence + ' ' + ('|'.join(columns[1:4]) + '|' + flag + ' ')

    # Report both counts separately (the original concatenated the two
    # numbers into one unreadable string and added a spurious +1).
    print("Total sentences: " + str(n) + "; sentences kept: " + str(len(lista)))

    # Split 70/30 into training and test indices. Sample over the full
    # index range: range(len(lista)-1) would silently exclude the last
    # sentence from both partitions.
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [i for i in range(len(lista)) if i not in trainingIndex]

    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")
...@@ -299,7 +299,7 @@ if __name__ == "__main__": ...@@ -299,7 +299,7 @@ if __name__ == "__main__":
299 299
300 # Original: labels = list(crf.classes_) 300 # Original: labels = list(crf.classes_)
301 # Original: labels.remove('O') 301 # Original: labels.remove('O')
302 - labels = list(['Air', 'Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess']) 302 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
303 303
304 # use the same metric for evaluation 304 # use the same metric for evaluation
305 f1_scorer = make_scorer(metrics.flat_f1_score, 305 f1_scorer = make_scorer(metrics.flat_f1_score,
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.