Estefani Gaytan Nunez

scripts

1 +#!/bin/python3
2 +from optparse import OptionParser
3 +import re
4 +import os
5 +import random
6 +
7 +
8 +# Objective
9 +# Label tokens (separated by '|') and split sentences 70/30 into training and test files from CoreNLP tagging
10 +# Build data sets using only sentences with at least one true tag
11 +#
12 +# Input parameters
13 +# --inputPath=PATH Path of the input file
14 +# --outputPath=PATH Path to place output files
15 +# --trainingFile=FILE Output training data set
16 +# --testFile=FILE Output test data set
17 +#
18 +# Output
19 +# training and test data set
20 +#
21 +# Examples
22 +# python label-split_training_test_v2.py
23 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
24 +# --inputFile sentences.tsv_pakal_.conll
25 +# --trainingFile training-data-set-70.txt
26 +# --testFile test-data-set-30.txt
27 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
28 +#
29 +#
30 +# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
31 +
32 +
33 +##########################################
34 +# MAIN PROGRAM #
35 +##########################################
36 +
37 +if __name__ == "__main__":
38 + # Defining parameters
39 + parser = OptionParser()
40 + parser.add_option("--inputPath", dest="inputPath",
41 + help="Path of output from CoreNLP", metavar="PATH")
42 + parser.add_option("--outputPath", dest="outputPath",
43 + help="Output path to place output files",
44 + metavar="PATH")
45 + parser.add_option("--inputFile", dest="inputFile",
46 + help="File with CoreNLP-tagging sentences", metavar="FILE")
47 + parser.add_option("--trainingFile", dest="trainingFile",
48 + help="File with training data set", metavar="FILE")
49 + parser.add_option("--testFile", dest="testFile",
50 + help="File with test data set", metavar="FILE")
51 +
52 + (options, args) = parser.parse_args()
53 + if len(args) > 0:
54 + parser.error("Any parameter given.")
55 + sys.exit(1)
56 +
57 + print('-------------------------------- PARAMETERS --------------------------------')
58 + print("Path of CoreNLP output: " + options.inputPath)
59 + print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
60 + print("Path of training data set: " + options.outputPath)
61 + print("File with training data set: " + str(options.trainingFile))
62 + print("Path of test data set: " + options.outputPath)
63 + print("File with test data set: " + str(options.testFile))
64 + print('-------------------------------- PROCESSING --------------------------------')
65 + ## Opening tags mapped to entity labels
66 + in_labels = {
67 + '<Gtype>': 'Gtype',
68 + '<Gversion>': 'Gversion',
69 + '<Med>': 'Med',
70 + '<Phase>': 'Phase',
71 + '<Supp>': 'Supp',
72 + '<Technique>': 'Technique',
73 + '<Temp>': 'Temp',
74 + '<OD>': 'OD',
75 + '<Anti>': 'Anti'
76 + }
77 + ## Closing and non-entity tags mapped to 'O'
78 + out_labels = {
79 + '<Air>': 'O',
80 + '</Air>': 'O',
81 + '</Gtype>': 'O',
82 + '</Gversion>': 'O',
83 + '</Med>': 'O',
84 + '</Phase>': 'O',
85 + '<Sample>': 'O',
86 + '</Sample>': 'O',
87 + '<Serie>': 'O',
88 + '</Serie>': 'O',
89 + '<Strain>': 'O',
90 + '</Strain>': 'O',
91 + '<Substrain>': 'O',
92 + '</Substrain>': 'O',
93 + '</Supp>': 'O',
94 + '</Technique>': 'O',
95 + '</Temp>': 'O',
96 + '</OD>': 'O',
97 + '<Agit>': 'O',
98 + '</Agit>': 'O',
99 + '<Name>': 'O',
100 + '</Name>': 'O',
101 + '<Orgn>': 'O',
102 + '</Orgn>': 'O',
103 + '</Anti>': 'O',
104 + '<Vess>': 'O',
105 + '</Vess>': 'O'}
106 +
107 + # Other label
108 + flag = 'O'
109 + # List of tagged sentences
110 + lista = []
111 + #First sentence
112 + sentence = ''
113 + with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
114 + for line in input_file:
115 + if len(line.split('\t')) > 1:
116 + w = line.split('\t')[1]
117 + if w in in_labels or w in out_labels:
118 + #Tagging
119 + if w in in_labels.keys(): flag = in_labels[w]
120 + if w in out_labels: flag = out_labels[w]
121 + else:
122 + if w == "PGCGROWTHCONDITIONS":
123 + words = sentence.split(' ')
124 + #End of sentence
125 + tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values() ]
126 + #At least one true-tag on sentence
127 + if len(tags)> 0:
128 + lista.append(sentence)
129 + #New sentence
130 + sentence = ''
131 + else:
132 + sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
133 +
134 + print("Number of sentences: " + str( len(lista) ) )
135 +
136 +
137 + # Split 70 30 training and test sentences
138 + trainingIndex = random.sample(range(len(lista)), int(len(lista)*.70))
139 + testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
140 +
141 + with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
142 + Data = [lista[i] for i in trainingIndex]
143 + oFile.write('\n'.join(Data))
144 +
145 + with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
146 + Data = [lista[i] for i in testIndex]
147 + oFile.write('\n'.join(Data))
148 +
149 + print("==================================END===================================")
1 +#!/bin/python3
2 +from optparse import OptionParser
3 +import re
4 +import os
5 +import random
6 +
7 +
8 +# Objective
9 +# Label tokens (separated by '|') and split sentences 70/30 into training and test files from CoreNLP tagging
10 +# Build data sets using only sentences with at least one true tag
11 +#
12 +# Input parameters
13 +# --inputPath=PATH Path of the input file
14 +# --outputPath=PATH Path to place output files
15 +# --trainingFile=FILE Output training data set
16 +# --testFile=FILE Output test data set
17 +#
18 +# Output
19 +# training and test data set
20 +#
21 +# Examples
22 +# python label-split_training_test_v2.py
23 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
24 +# --inputFile sentences.tsv_pakal_.conll
25 +# --trainingFile training-data-set-70.txt
26 +# --testFile test-data-set-30.txt
27 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
28 +#
29 +#
30 +# python label-split_training_test_v2.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile raw-metadata-senteneces.txt.conll --trainingFile training-data-set-70_v2.txt --testFile test-data-set-30_v2.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
31 +
32 +
33 +##########################################
34 +# MAIN PROGRAM #
35 +##########################################
36 +
37 +if __name__ == "__main__":
38 + # Defining parameters
39 + parser = OptionParser()
40 + parser.add_option("--inputPath", dest="inputPath",
41 + help="Path of output from CoreNLP", metavar="PATH")
42 + parser.add_option("--outputPath", dest="outputPath",
43 + help="Output path to place output files",
44 + metavar="PATH")
45 + parser.add_option("--inputFile", dest="inputFile",
46 + help="File with CoreNLP-tagging sentences", metavar="FILE")
47 + parser.add_option("--trainingFile", dest="trainingFile",
48 + help="File with training data set", metavar="FILE")
49 + parser.add_option("--testFile", dest="testFile",
50 + help="File with test data set", metavar="FILE")
51 +
52 + (options, args) = parser.parse_args()
53 + if len(args) > 0:
54 + parser.error("Any parameter given.")
55 + sys.exit(1)
56 +
57 + print('-------------------------------- PARAMETERS --------------------------------')
58 + print("Path of CoreNLP output: " + options.inputPath)
59 + print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
60 + print("Path of training data set: " + options.outputPath)
61 + print("File with training data set: " + str(options.trainingFile))
62 + print("Path of test data set: " + options.outputPath)
63 + print("File with test data set: " + str(options.testFile))
64 + print('-------------------------------- PROCESSING --------------------------------')
65 + ## Opening tags mapped to entity labels
66 + in_labels = {
67 + '<Gtype>': 'Gtype',
68 + '<Gversion>': 'Gversion',
69 + '<Med>': 'Med',
70 + '<Phase>': 'Phase',
71 + '<Supp>': 'Supp',
72 + '<Technique>': 'Technique',
73 + '<Temp>': 'Temp',
74 + '<OD>': 'OD',
75 + '<Anti>': 'Anti',
76 + '<Agit>': 'Agit',
77 + '<Vess>': 'Vess'
78 + }
79 + ## Closing and non-entity tags mapped to 'O'
80 + out_labels = {
81 + '<Air>': 'O',
82 + '</Air>': 'O',
83 + '</Gtype>': 'O',
84 + '</Gversion>': 'O',
85 + '</Med>': 'O',
86 + '</Phase>': 'O',
87 + '<Sample>': 'O',
88 + '</Sample>': 'O',
89 + '<Serie>': 'O',
90 + '</Serie>': 'O',
91 + '<Strain>': 'O',
92 + '</Strain>': 'O',
93 + '<Substrain>': 'O',
94 + '</Substrain>': 'O',
95 + '</Supp>': 'O',
96 + '</Technique>': 'O',
97 + '</Temp>': 'O',
98 + '</OD>': 'O',
99 + '</Anti>': 'O',
100 + '</Agit>': 'O',
101 + '<Name>': 'O',
102 + '</Name>': 'O',
103 + '<Orgn>': 'O',
104 + '</Orgn>': 'O',
105 + '</Vess>': 'O'}
106 +
107 + # Other label
108 + flag = 'O'
109 + # sentences counter
110 + n=0
111 + lista = []
112 + #First sentence
113 + sentence = ''
114 + with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
115 + for line in input_file:
116 + if len(line.split('\t')) > 1:
117 + w = line.split('\t')[1]
118 + if w in in_labels or w in out_labels:
119 + #Tagging
120 + if w in in_labels.keys(): flag = in_labels[w]
121 + if w in out_labels: flag = out_labels[w]
122 + else:
123 + if w == "PGCGROWTHCONDITIONS":
124 + words = sentence.split(' ')
125 + tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values() ]
126 + #At least one true-tag on sentence
127 + if len(tags)> 0:
128 + lista.append(sentence)
129 + #New sentence
130 + sentence = ''
131 + n=n+1
132 + else:
133 + #Build and save the tagged sentence
134 + sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])+'|'+flag+' ')
135 +
136 + print("Number of sentences: " + str(n) + str(len(lista)+1))
137 +
138 +
139 + # Split 70 30 training and test sentences
140 + trainingIndex = random.sample(range(len(lista)), int(len(lista)*.70))
141 + testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
142 +
143 + with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
144 + Data = [lista[i] for i in trainingIndex]
145 + oFile.write('\n'.join(Data))
146 +
147 + with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
148 + Data = [lista[i] for i in testIndex]
149 + oFile.write('\n'.join(Data))
150 +
151 + print("==================================END===================================")
@@ -299,7 +299,7 @@ if __name__ == "__main__":
299
300 # Original: labels = list(crf.classes_)
301 # Original: labels.remove('O')
302 - labels = list(['Air', 'Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess'])
302 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
303
304 # use the same metric for evaluation
305 f1_scorer = make_scorer(metrics.flat_f1_score,
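In the hunk above the evaluation label list is narrowed: 'Air', 'Agit' and 'Vess' are dropped and 'O' stays excluded, so the weighted F1 returned by the scorer only covers the remaining entity classes. A minimal, self-contained illustration (with invented gold and predicted sequences) of how such a restricted label list is passed to flat_f1_score:

from sklearn_crfsuite import metrics

# Invented gold and predicted label sequences
y_true = [['O', 'Med', 'O'], ['Anti', 'O']]
y_pred = [['O', 'Med', 'O'], ['O', 'O']]

labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti']

# Restricting `labels` keeps the dominant 'O' class out of the weighted average
print(metrics.flat_f1_score(y_true, y_pred, average='weighted', labels=labels))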
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from itertools import chain
5 +from optparse import OptionParser
6 +from time import time
7 +from collections import Counter
8 +import re
9 +
10 +import nltk
11 +import sklearn
12 +import scipy.stats
13 +import sys
14 +
15 +from sklearn.externals import joblib
16 +from sklearn.metrics import make_scorer
17 +from sklearn.cross_validation import cross_val_score
18 +from sklearn.grid_search import RandomizedSearchCV
19 +
20 +import sklearn_crfsuite
21 +from sklearn_crfsuite import scorers
22 +from sklearn_crfsuite import metrics
23 +
24 +from nltk.corpus import stopwords
25 +
26 +
27 +# Objective
28 +# Training and evaluation of CRFs with sklearn-crfsuite.
29 +#
30 +# Input parameters
31 +# --inputPath=PATH Path of training and test data set
32 +# --trainingFile File with training data set
33 +# --testFile File with test data set
34 +# --outputPath=PATH Output path to place output files
35 +
36 +# Output
37 +# 1) Best model
38 +
39 +# Examples
40 +# python training_validation_v3.py
41 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
42 +# --trainingFile training-data-set-70.txt
43 +# --testFile test-data-set-30.txt
44 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
45 +# python3.4 training-validation_v3.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
46 +
47 +#################################
48 +# FUNCTIONS #
49 +#################################
50 +
51 +def isGreek(word):
52 + alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
53 + 'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
54 + if word in alphabet:
55 + return True
56 + else:
57 + return False
58 +
59 +def word2features(sent, i):
60 + listElem = sent[i].split('|')
61 + word = listElem[0]
62 + lemma = listElem[1]
63 + postag = listElem[2]
64 +
65 + features = {
66 + # Suffixes
67 + #'word[-3:]': word[-3:],
68 + #'word[-2:]': word[-2:],
69 + #'word[-1:]': word[-1:],
70 + #'word.isupper()': word.isupper(),
71 + 'word': word,
72 + 'lemma': lemma,
73 + #'postag': postag,
74 + #'lemma[-3:]': lemma[-3:],
75 + #'lemma[-2:]': lemma[-2:],
76 + #'lemma[-1:]': lemma[-1:],
77 + #'lemma[+3:]': lemma[:3],
78 + #'lemma[+2:]': lemma[:2],
79 + #'lemma[+1:]': lemma[:1],
80 + #'word[:3]': word[:3],
81 + #'word[:2]': word[:2],
82 + #'word[:1]': word[:1],
83 + #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
84 + 'isNumber()': word.isdigit(),
85 + 'isGreek(){}'.format(isGreek(word)): isGreek(word),
86 + 'isupper()' : word.isupper(),
87 + 'islower()' : word.islower()
88 + }
89 + if i > 0:
90 + listElem = sent[i - 1].split('|')
91 + word1 = listElem[0]
92 + lemma1 = listElem[1]
93 + postag1 = listElem[2]
94 + features.update({
95 + #'-1:word': word1,
96 + '-1:lemma': lemma1,
97 + '-1:postag': postag1,
98 + })
99 +
100 + if i < len(sent) - 1:
101 + listElem = sent[i + 1].split('|')
102 + #word1 = listElem[0]
103 + lemma1 = listElem[1]
104 + postag1 = listElem[2]
105 + features.update({
106 + #'+1:word': word1,
107 + '+1:lemma': lemma1,
108 + '+1:postag': postag1,
109 + })
110 +
111 + '''
112 + if i > 1:
113 + listElem = sent[i - 2].split('|')
114 + word2 = listElem[0]
115 + lemma2 = listElem[1]
116 + postag2 = listElem[2]
117 + features.update({
118 + '-2:word': word2,
119 + '-2:lemma': lemma2,
120 + })
121 +
122 + if i < len(sent) - 2:
123 + listElem = sent[i + 2].split('|')
124 + word2 = listElem[0]
125 + lemma2 = listElem[1]
126 + postag2 = listElem[2]
127 + features.update({
128 + '+2:word': word2,
129 + '+2:lemma': lemma2,
130 + })
131 +
132 + trigrams = False
133 + if trigrams:
134 + if i > 2:
135 + listElem = sent[i - 3].split('|')
136 + word3 = listElem[0]
137 + lemma3 = listElem[1]
138 + postag3 = listElem[2]
139 + features.update({
140 + '-3:word': word3,
141 + '-3:lemma': lemma3,
142 + })
143 +
144 + if i < len(sent) - 3:
145 + listElem = sent[i + 3].split('|')
146 + word3 = listElem[0]
147 + lemma3 = listElem[1]
148 + postag3 = listElem[2]
149 + features.update({
150 + '+3:word': word3,
151 + '+3:lemma': lemma3,
152 + })
153 + '''
154 + return features
155 +
156 +
157 +def sent2features(sent):
158 + return [word2features(sent, i) for i in range(len(sent))]
159 +
160 +
161 +def sent2labels(sent):
162 + return [elem.split('|')[3] for elem in sent]
163 +
164 +
165 +def sent2tokens(sent):
166 + return [elem.split('|')[0] for elem in sent]
167 +
168 +
169 +def print_transitions(trans_features, f):
170 + for (label_from, label_to), weight in trans_features:
171 + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
172 +
173 +
174 +def print_state_features(state_features, f):
175 + for (attr, label), weight in state_features:
176 + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
177 +
178 +
179 +__author__ = 'CMendezC'
180 +
181 +##########################################
182 +# MAIN PROGRAM #
183 +##########################################
184 +
185 +if __name__ == "__main__":
186 + # Defining parameters
187 + parser = OptionParser()
188 + parser.add_option("--inputPath", dest="inputPath",
189 + help="Path of training data set", metavar="PATH")
190 + parser.add_option("--outputPath", dest="outputPath",
191 + help="Output path to place output files",
192 + metavar="PATH")
193 + parser.add_option("--trainingFile", dest="trainingFile",
194 + help="File with training data set", metavar="FILE")
195 + parser.add_option("--testFile", dest="testFile",
196 + help="File with test data set", metavar="FILE")
197 + parser.add_option("--excludeStopWords", default=False,
198 + action="store_true", dest="excludeStopWords",
199 + help="Exclude stop words")
200 + parser.add_option("--excludeSymbols", default=False,
201 + action="store_true", dest="excludeSymbols",
202 + help="Exclude punctuation marks")
203 + parser.add_option("--reportFile", dest="reportFile",
204 + help="Report file", metavar="FILE")
205 +
206 + (options, args) = parser.parse_args()
207 + if len(args) > 0:
208 + parser.error("Unexpected arguments given.")
209 + sys.exit(1)
210 +
211 + print('-------------------------------- PARAMETERS --------------------------------')
212 + print("Path of training data set: " + options.inputPath)
213 + print("File with training data set: " + str(options.trainingFile))
214 + print("Path of test data set: " + options.inputPath)
215 + print("File with test data set: " + str(options.testFile))
216 + print("Exclude stop words: " + str(options.excludeStopWords))
217 + print("Report file: " + str(options.reportFile))
218 +
219 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
220 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
221 + #print("Exclude symbols " + str(symbols) + ': ' + str(options.excludeSymbols))
222 + print("Exclude symbols: " + str(options.excludeSymbols))
223 +
224 + print('-------------------------------- PROCESSING --------------------------------')
225 + print('Reading corpus...')
226 + t0 = time()
227 +
228 + sentencesTrainingData = []
229 + sentencesTestData = []
230 +
231 + stopwords = [word for word in stopwords.words('english')]
232 +
233 + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
234 + for line in iFile.readlines():
235 + listLine = []
236 + line = line.strip('\n')
237 + for token in line.split():
238 + if options.excludeStopWords:
239 + listToken = token.split('|')
240 + lemma = listToken[1]
241 + if lemma in stopwords:
242 + continue
243 + if options.excludeSymbols:
244 + listToken = token.split('|')
245 + lemma = listToken[1]
246 + if lemma in symbols:
247 + continue
248 + listLine.append(token)
249 + sentencesTrainingData.append(listLine)
250 + print(" Sentences training data: " + str(len(sentencesTrainingData)))
251 +
252 + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
253 + for line in iFile.readlines():
254 + listLine = []
255 + line = line.strip('\n')
256 + for token in line.split():
257 + if options.excludeStopWords:
258 + listToken = token.split('|')
259 + lemma = listToken[1]
260 + if lemma in stopwords:
261 + continue
262 + if options.excludeSymbols:
263 + listToken = token.split('|')
264 + lemma = listToken[1]
265 + if lemma in symbols:
266 + continue
267 + listLine.append(token)
268 + sentencesTestData.append(listLine)
269 + print(" Sentences test data: " + str(len(sentencesTestData)))
270 +
271 + print("Reading corpus done in: %fs" % (time() - t0))
272 +
273 + print(sent2features(sentencesTrainingData[0])[0])
274 + print(sent2features(sentencesTestData[0])[0])
275 + t0 = time()
276 +
277 + X_train = [sent2features(s) for s in sentencesTrainingData]
278 + y_train = [sent2labels(s) for s in sentencesTrainingData]
279 +
280 + X_test = [sent2features(s) for s in sentencesTestData]
281 + # print X_test
282 + y_test = [sent2labels(s) for s in sentencesTestData]
283 +
284 + # Fixed parameters
285 + # crf = sklearn_crfsuite.CRF(
286 + # algorithm='lbfgs',
287 + # c1=0.1,
288 + # c2=0.1,
289 + # max_iterations=100,
290 + # all_possible_transitions=True
291 + # )
292 +
293 + # Hyperparameter Optimization
294 + crf = sklearn_crfsuite.CRF(
295 + algorithm='lbfgs',
296 + max_iterations=100,
297 + all_possible_transitions=True
298 + )
299 + params_space = {
300 + 'c1': scipy.stats.expon(scale=0.5),
301 + 'c2': scipy.stats.expon(scale=0.05),
302 + }
303 +
304 + # Original: labels = list(crf.classes_)
305 + # Original: labels.remove('O')
306 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
307 +
308 + # use the same metric for evaluation
309 + f1_scorer = make_scorer(metrics.flat_f1_score,
310 + average='weighted', labels=labels)
311 +
312 + # search
313 + rs = RandomizedSearchCV(crf, params_space,
314 + cv=10,
315 + verbose=3,
316 + n_jobs=-1,
317 + n_iter=20,
318 + # n_iter=50,
319 + scoring=f1_scorer)
320 + rs.fit(X_train, y_train)
321 +
322 + # Fixed parameters
323 + # crf.fit(X_train, y_train)
324 +
325 + # Best hyperparameters
326 + # crf = rs.best_estimator_
327 + #nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(options.excludeSymbols) + '.txt')
328 + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.reportFile))
329 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
330 + oFile.write("********** TRAINING AND TESTING REPORT **********\n")
331 + oFile.write("Training file: " + options.trainingFile + '\n')
332 + oFile.write('\n')
333 + oFile.write('best params:' + str(rs.best_params_) + '\n')
334 + oFile.write('best CV score:' + str(rs.best_score_) + '\n')
335 + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
336 +
337 + print("Training done in: %fs" % (time() - t0))
338 + t0 = time()
339 +
340 + # Update best crf
341 + crf = rs.best_estimator_
342 +
343 + # Saving model
344 + print(" Saving training model...")
345 + t1 = time()
346 + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
347 + options.excludeSymbols) + '.mod')
348 + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
349 + print(" Saving training model done in: %fs" % (time() - t1))
350 +
351 + # Evaluation against test data
352 + y_pred = crf.predict(X_test)
353 + print("*********************************")
354 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
355 + options.excludeSymbols) + '.txt')
356 + with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
357 + for y in y_pred:
358 + oFile.write(str(y) + '\n')
359 +
360 + print("*********************************")
361 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
362 + options.excludeSymbols) + '.txt')
363 + with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
364 + for y in y_test:
365 + oFile.write(str(y) + '\n')
366 +
367 + print("Prediction done in: %fs" % (time() - t0))
368 +
369 + # labels = list(crf.classes_)
370 + # labels.remove('O')
371 +
372 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
373 + oFile.write('\n')
374 + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
375 + oFile.write('\n')
376 + # labels = list(crf.classes_)
377 + sorted_labels = sorted(
378 + labels,
379 + key=lambda name: (name[1:], name[0])
380 + )
381 + oFile.write(metrics.flat_classification_report(
382 + y_test, y_pred, labels=sorted_labels, digits=3
383 + ))
384 + oFile.write('\n')
385 +
386 + oFile.write("\nTop likely transitions:\n")
387 + print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
388 + oFile.write('\n')
389 +
390 + oFile.write("\nTop unlikely transitions:\n")
391 + print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
392 + oFile.write('\n')
393 +
394 + oFile.write("\nTop positive:\n")
395 + print_state_features(Counter(crf.state_features_).most_common(200), oFile)
396 + oFile.write('\n')
397 +
398 + oFile.write("\nTop negative:\n")
399 + print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
400 + oFile.write('\n')
401 +
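Once the script above has saved the best estimator under models/, it can be reloaded and applied to new sentences without retraining. The sketch below is a hypothetical usage example: the models path and file name follow the naming scheme used by the script with both exclude flags left at False (an assumption), and sent2features refers to the feature extractor defined above, which must be the same one used at training time.

import os
from sklearn.externals import joblib  # same import style as the script above

modelsPath = "/home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/models"  # assumed location
modelName = "training-data-set-70.fStopWords_False.fSymbols_False.mod"    # assumed file name

crf = joblib.load(os.path.join(modelsPath, modelName))

# One new sentence in the word|lemma|postag|label format of the data sets
sentence = "cells|cell|NNS|O grown|grow|VBN|O in|in|IN|O LB|LB|NN|O".split()

# Build features exactly as in training, then predict a label sequence
X_new = [sent2features(sentence)]
print(crf.predict(X_new))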
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from itertools import chain
5 +from optparse import OptionParser
6 +from time import time
7 +from collections import Counter
8 +import re
9 +
10 +import nltk
11 +import sklearn
12 +import scipy.stats
13 +import sys
14 +
15 +from sklearn.externals import joblib
16 +from sklearn.metrics import make_scorer
17 +from sklearn.cross_validation import cross_val_score
18 +from sklearn.grid_search import RandomizedSearchCV
19 +
20 +import sklearn_crfsuite
21 +from sklearn_crfsuite import scorers
22 +from sklearn_crfsuite import metrics
23 +
24 +from nltk.corpus import stopwords
25 +
26 +
27 +# Objective
28 +# Training and evaluation of CRFs with sklearn-crfsuite.
29 +#
30 +# Input parameters
31 +# --inputPath=PATH Path of training and test data set
32 +# --trainingFile File with training data set
33 +# --testFile File with test data set
34 +# --outputPath=PATH Output path to place output files
35 +# --reportFile Report file name
36 +
37 +# Output
38 +# 1) Best model
39 +
40 +# Examples
41 +# python training_validation_v5.py
42 +# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
43 +# --trainingFile training-data-set-70.txt
44 +# --testFile test-data-set-30.txt
45 +# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
46 +# --reportFile report_1
47 +# python3.4 training-validation_v5.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/
48 +
49 +#################################
50 +# FUNCTIONS #
51 +#################################
52 +
53 +def isGreek(word):
54 + alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω',
55 + 'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω']
56 + if word in alphabet:
57 + return True
58 + else:
59 + return False
60 +def hUpper(word):
61 + for l in word:
62 + if l.isupper(): return True
63 + return False
64 +
65 +def hLower(word):
66 + for l in word:
67 + if l.islower(): return True
68 + return False
69 +
70 +def hGreek(word):
71 + for l in word:
72 + if isGreek(l): return True
73 + return False
74 +
75 +
76 +def word2features(sent, i, S1, S2):
77 + listElem = sent[i].split('|')
78 + word = listElem[0]
79 + lemma = listElem[1]
80 + postag = listElem[2]
81 + ner = listElem[3]
82 +
83 + features = {
84 + #General
85 + 'lemma': lemma,
86 + 'postag': postag
87 + }
88 +
89 + if S1:
90 + #S1
91 + features['word'] = word
92 + features['hUpper'] = hUpper(word)
93 + features['hLower'] = hLower(word)
94 + features['hGreek'] = hGreek(word)
95 + #features['hAlfNum'] = hAlfNum(word)
96 +
97 + if S2:
98 + #S2
99 + features['isUpper'] = word.isupper()
100 + features['isLower'] = word.islower()
101 + features['isGreek'] = isGreek(word)
102 + features['isNumber'] = word.isdigit()
103 +
104 + if i > 0:
105 + listElem = sent[i - 1].split('|')
106 + word1 = listElem[0]
107 + lemma1 = listElem[1]
108 + postag1 = listElem[2]
109 + features.update({
110 + # Previous word
111 + '-1:word': word1,
112 + # Previous lemma
113 + '-1:lemma': lemma1,
114 + # Previous POS tag
115 + '-1:postag': postag1,
116 + })
117 +
118 + if i < len(sent) - 1:
119 + listElem = sent[i + 1].split('|')
120 + word1 = listElem[0]
121 + lemma1 = listElem[1]
122 + postag1 = listElem[2]
123 + features.update({
124 + # Next word
125 + '+1:word': word1,
126 + # Next lemma
127 + '+1:lemma': lemma1,
128 + # Next POS tag
129 + '+1:postag': postag1,
130 + })
131 + return features
132 +
133 +
134 +def sent2features(sent, S1, S2):
135 + return [word2features(sent, i, S1, S2) for i in range(len(sent))]
136 +
137 +
138 +def sent2labels(sent):
139 + return [elem.split('|')[3] for elem in sent]
140 +
141 +
142 +def sent2tokens(sent):
143 + return [elem.split('|')[0] for elem in sent]
144 +
145 +
146 +def print_transitions(trans_features, f):
147 + for (label_from, label_to), weight in trans_features:
148 + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
149 +
150 +
151 +def print_state_features(state_features, f):
152 + for (attr, label), weight in state_features:
153 + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
154 +
155 +
156 +__author__ = 'CMendezC'
157 +
158 +##########################################
159 +# MAIN PROGRAM #
160 +##########################################
161 +
162 +if __name__ == "__main__":
163 + # Defining parameters
164 + parser = OptionParser()
165 + parser.add_option("--inputPath", dest="inputPath",
166 + help="Path of training data set", metavar="PATH")
167 + parser.add_option("--outputPath", dest="outputPath",
168 + help="Output path to place output files",
169 + metavar="PATH")
170 + parser.add_option("--trainingFile", dest="trainingFile",
171 + help="File with training data set", metavar="FILE")
172 + parser.add_option("--testFile", dest="testFile",
173 + help="File with test data set", metavar="FILE")
174 + parser.add_option("--excludeStopWords", default=False,
175 + action="store_true", dest="excludeStopWords",
176 + help="Exclude stop words")
177 + parser.add_option("--excludeSymbols", default=False,
178 + action="store_true", dest="excludeSymbols",
179 + help="Exclude punctuation marks")
180 + parser.add_option("--reportFile", dest="reportFile",
181 + help="Report file", metavar="FILE")
182 + parser.add_option("--S1", default=False,
183 + action="store_true", dest="S1",
184 + help="Level specificity")
185 + parser.add_option("--S2", default=False,
186 + action="store_true", dest="S2",
187 + help="Level specificity")
188 +
189 + (options, args) = parser.parse_args()
190 + if len(args) > 0:
191 + parser.error("Unexpected arguments given.")
192 + sys.exit(1)
193 +
194 + print('-------------------------------- PARAMETERS --------------------------------')
195 + print("Path of training data set: " + options.inputPath)
196 + print("File with training data set: " + str(options.trainingFile))
197 + print("Path of test data set: " + options.inputPath)
198 + print("File with test data set: " + str(options.testFile))
199 + print("Exclude stop words: " + str(options.excludeStopWords))
200 + print("Levels: " + str(options.S1) + " " + str(options.S2))
201 + print("Report file: " + str(options.reportFile))
202 +
203 +
204 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
205 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
206 + print("Exclude symbols: " + str(options.excludeSymbols))
207 +
208 + print('-------------------------------- PROCESSING --------------------------------')
209 + print('Reading corpus...')
210 + t0 = time()
211 +
212 + sentencesTrainingData = []
213 + sentencesTestData = []
214 +
215 + stopwords = [word for word in stopwords.words('english')]
216 +
217 + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
218 + for line in iFile.readlines():
219 + listLine = []
220 + line = line.strip('\n')
221 + for token in line.split():
222 + if options.excludeStopWords:
223 + listToken = token.split('|')
224 + lemma = listToken[1]
225 + if lemma in stopwords:
226 + continue
227 + if options.excludeSymbols:
228 + listToken = token.split('|')
229 + lemma = listToken[1]
230 + if lemma in symbols:
231 + continue
232 + listLine.append(token)
233 + sentencesTrainingData.append(listLine)
234 + print(" Sentences training data: " + str(len(sentencesTrainingData)))
235 +
236 + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
237 + for line in iFile.readlines():
238 + listLine = []
239 + line = line.strip('\n')
240 + for token in line.split():
241 + if options.excludeStopWords:
242 + listToken = token.split('|')
243 + lemma = listToken[1]
244 + if lemma in stopwords:
245 + continue
246 + if options.excludeSymbols:
247 + listToken = token.split('|')
248 + lemma = listToken[1]
249 + if lemma in symbols:
250 + continue
251 + listLine.append(token)
252 + sentencesTestData.append(listLine)
253 + print(" Sentences test data: " + str(len(sentencesTestData)))
254 +
255 + print("Reading corpus done in: %fs" % (time() - t0))
256 +
257 + if options.S1: S1 = 1
258 + else: S1 = 0
259 + if options.S2: S2 = 1
260 + else: S2 = 0
261 +
262 + print(sent2features(sentencesTrainingData[0], S1, S2)[0])
263 + print(sent2features(sentencesTestData[0], S1, S2)[0])
264 + t0 = time()
265 +
266 + X_train = [sent2features(s, S1, S2) for s in sentencesTrainingData]
267 + y_train = [sent2labels(s) for s in sentencesTrainingData]
268 +
269 + X_test = [sent2features(s, S1, S2) for s in sentencesTestData]
270 + # print X_test
271 + y_test = [sent2labels(s) for s in sentencesTestData]
272 +
273 + # Fixed parameters
274 + # crf = sklearn_crfsuite.CRF(
275 + # algorithm='lbfgs',
276 + # c1=0.1,
277 + # c2=0.1,
278 + # max_iterations=100,
279 + # all_possible_transitions=True
280 + # )
281 +
282 + # Hyperparameter Optimization
283 + crf = sklearn_crfsuite.CRF(
284 + algorithm='lbfgs',
285 + max_iterations=100,
286 + all_possible_transitions=True
287 + )
288 + params_space = {
289 + 'c1': scipy.stats.expon(scale=0.5),
290 + 'c2': scipy.stats.expon(scale=0.05),
291 + }
292 +
293 + # Original: labels = list(crf.classes_)
294 + # Original: labels.remove('O')
295 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
296 +
297 + # use the same metric for evaluation
298 + f1_scorer = make_scorer(metrics.flat_f1_score,
299 + average='weighted', labels=labels)
300 +
301 + # search
302 + rs = RandomizedSearchCV(crf, params_space,
303 + cv=10,
304 + verbose=3,
305 + n_jobs=-1,
306 + n_iter=20,
307 + # n_iter=50,
308 + scoring=f1_scorer)
309 + rs.fit(X_train, y_train)
310 +
311 + # Fixed parameters
312 + # crf.fit(X_train, y_train)
313 +
314 + # Best hyperparameters
315 + # crf = rs.best_estimator_
316 + nameReport = options.trainingFile.replace('.txt', str(options.reportFile) + '.txt')
317 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
318 + oFile.write("********** TRAINING AND TESTING REPORT **********\n")
319 + oFile.write("Training file: " + options.trainingFile + '\n')
320 + oFile.write('\n')
321 + oFile.write('best params:' + str(rs.best_params_) + '\n')
322 + oFile.write('best CV score:' + str(rs.best_score_) + '\n')
323 + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
324 +
325 + print("Training done in: %fs" % (time() - t0))
326 + t0 = time()
327 +
328 + # Update best crf
329 + crf = rs.best_estimator_
330 +
331 + # Saving model
332 + print(" Saving training model...")
333 + t1 = time()
334 + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
335 + options.excludeSymbols) + '.mod')
336 + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
337 + print(" Saving training model done in: %fs" % (time() - t1))
338 +
339 + # Evaluation against test data
340 + y_pred = crf.predict(X_test)
341 + print("*********************************")
342 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
343 + options.excludeSymbols) + '.txt')
344 + with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
345 + for y in y_pred:
346 + oFile.write(str(y) + '\n')
347 +
348 + print("*********************************")
349 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
350 + options.excludeSymbols) + '.txt')
351 + with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
352 + for y in y_test:
353 + oFile.write(str(y) + '\n')
354 +
355 + print("Prediction done in: %fs" % (time() - t0))
356 +
357 + # labels = list(crf.classes_)
358 + # labels.remove('O')
359 +
360 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
361 + oFile.write('\n')
362 + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
363 + oFile.write('\n')
364 + # labels = list(crf.classes_)
365 + sorted_labels = sorted(
366 + labels,
367 + key=lambda name: (name[1:], name[0])
368 + )
369 + oFile.write(metrics.flat_classification_report(
370 + y_test, y_pred, labels=sorted_labels, digits=3
371 + ))
372 + oFile.write('\n')
373 +
374 + oFile.write("\nTop likely transitions:\n")
375 + print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
376 + oFile.write('\n')
377 +
378 + oFile.write("\nTop unlikely transitions:\n")
379 + print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
380 + oFile.write('\n')
381 +
382 + oFile.write("\nTop positive:\n")
383 + print_state_features(Counter(crf.state_features_).most_common(200), oFile)
384 + oFile.write('\n')
385 +
386 + oFile.write("\nTop negative:\n")
387 + print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
388 + oFile.write('\n')
389 +
390 +
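As a quick illustration of what the S1 and S2 switches add to each token's feature dictionary, here is a small sketch using the word2features defined in this script on an invented two-token sentence (the token strings are made up; it assumes the S1/S2 feature assignments shown above):

# Invented sentence in the word|lemma|postag|label format
sent = ["OD600|od600|NN|OD", "0.3|0.3|CD|OD"]

# Base features only: lemma and postag, plus the neighbouring word/lemma/postag
print(word2features(sent, 0, False, False))

# S1 adds the raw word and the hUpper/hLower/hGreek shape features;
# S2 adds the isUpper/isLower/isGreek/isNumber booleans
print(word2features(sent, 0, True, True))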