Carlos-Francisco Méndez-Cruz

Conditional Random Fields

prepare-abstracts.py (deleted):

-# -*- coding: UTF-8 -*-
-
-from optparse import OptionParser
-import os
-import sys
-from time import time
-import re
-
-__author__ = 'CMendezC'
-
-# Objective: take text-annotated-abstracts-original.txt as input,
-# write each abstract to its own file with the annotation tags removed,
-# and collect a dictionary of genes for tagging after the NLP pipeline.
-
-# Parameters:
-#   1) --inputPath   Input path.
-#   2) --inputFile   Input file.
-#   3) --outputPath  Output path.
-#   4) --dicPath     Dictionary path.
-#   5) --dicFile     Dictionary file.
-
-# Execution:
-# python3 prepare-abstracts.py
-#   --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-#   --inputFile text-annotated-abstracts.txt
-#   --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
-#   --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries
-#   --dicFile genes.txt
-# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries --dicFile genes.txt
-
-
-if __name__ == "__main__":
-    # Parameter definition
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Input path", metavar="PATH")
-    parser.add_option("--inputFile", dest="inputFile",
-                      help="Input file", metavar="FILE")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Output path", metavar="PATH")
-    parser.add_option("--dicPath", dest="dicPath",
-                      help="Dictionary path", metavar="PATH")
-    parser.add_option("--dicFile", dest="dicFile",
-                      help="Dictionary file", metavar="FILE")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("No positional arguments expected.")
-        sys.exit(1)
-
-    # Printing parameter values
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Input path: " + str(options.inputPath))
-    print("Input file: " + str(options.inputFile))
-    print("Output path: " + str(options.outputPath))
-    print("Dictionary path: " + str(options.dicPath))
-    print("Dictionary file: " + str(options.dicFile))
-
-    filesWritten = 0
-    t0 = time()
-    hashGenes = {}
-
-    rePmid = re.compile(r'([\d]+)\|t\|')
-    reGene = re.compile(r'<g>([^<]+)</g>')
-    reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')
-    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
-        print("Reading file... " + options.inputFile)
-        for line in iFile:
-            line = line.strip('\r\n')
-            for gene in reGene.findall(line):
-                # print("genes: {}".format(gene))
-                if gene not in hashGenes:
-                    hashGenes[gene] = 1
-                else:
-                    hashGenes[gene] += 1
-            line = reTags.sub('', line)
-            result = rePmid.match(line)
-            if result:
-                line = rePmid.sub('', line)
-                with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
-                    oFile.write(line)
-            else:
-                print("Warning: line without PMID")
-    with open(os.path.join(options.dicPath, options.dicFile), "w", encoding="utf-8", errors="replace") as dFile:
-        for gene in hashGenes.keys():
-            dFile.write("{}\n".format(gene))
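For reference, a small demonstration of what the three regular expressions do to one line of input (the line below is made up; the 'PMID|t|text' prefix format is what rePmid expects):

    import re

    rePmid = re.compile(r'([\d]+)\|t\|')
    reGene = re.compile(r'<g>([^<]+)</g>')
    reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')

    line = '123456|t|Expression of <g>marA</g> increases under <i>salicylate</i> stress.'
    print(reGene.findall(line))  # ['marA'] -> counted in hashGenes
    line = reTags.sub('', line)  # strip the <g>, <d>, <i> tags
    m = rePmid.match(line)
    print(m.group(1))            # '123456' -> becomes the output file 123456.txt
    print(rePmid.sub('', line))  # 'Expression of marA increases under salicylate stress.'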
prepare-training-test.py (deleted):

-# -*- coding: UTF-8 -*-
-
-from optparse import OptionParser
-import os
-import sys
-from time import time
-
-__author__ = 'CMendezC'
-
-# Objective: join the transformed files to obtain the training and test data sets.
-
-# Parameters:
-#   1) --inputPath     Path to read files.
-#   2) --trainingFile  File name for training data.
-#   3) --testFile      File name for test data.
-#   4) --outputPath    Path to write files.
-
-# Output:
-#   1) Files created.
-
-# Execution:
-# python3.4 prepare-training-test.py
-#   --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
-#   --trainingFile training-data-set-70.txt
-#   --testFile test-data-set-30.txt
-#   --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-# python3.4 prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-
-###########################################################
-#                     MAIN PROGRAM                        #
-###########################################################
-
-if __name__ == "__main__":
-    # Parameter definition
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Path to read files", metavar="PATH")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File for training examples", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File for test examples", metavar="FILE")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Path to write output files", metavar="PATH")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("No positional arguments expected.")
-        sys.exit(1)
-
-    # Printing parameter values
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path to read files: " + str(options.inputPath))
-    print("File for training examples: " + str(options.trainingFile))
-    print("File for test examples: " + str(options.testFile))
-    print("Path to write output files: " + str(options.outputPath))
-
-    t0 = time()
-    trainingDataset = []
-    testDataset = []
-
-    counter = 1
-    for path, dirs, files in os.walk(options.inputPath):
-        # For each file in dir
-        for file in files:
-            if counter <= 70:
-                print("   Joining file {} {} to training data set".format(counter, file))
-                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
-                    for line in iFile:
-                        line = line.strip('\r\n')
-                        trainingDataset.append(line)
-            elif counter > 70 and counter <= 100:
-                print("   Joining file {} {} to test data set".format(counter, file))
-                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
-                    for line in iFile:
-                        line = line.strip('\r\n')
-                        testDataset.append(line)
-            counter += 1
-    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
-        for line in trainingDataset:
-            oFile.write("{}\n".format(line))
-    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
-        for line in testDataset:
-            oFile.write("{}\n".format(line))
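Note that the 70/30 split is positional: the first 70 files that os.walk yields go to training and the next 30 to test, so the outcome depends on directory order, which os.walk does not guarantee. A minimal sketch of the same split with a deterministic order (the sorted order and the path literal are illustrative additions, not in the script):

    import os

    files = []
    for path, dirs, names in os.walk('data-sets/transformed'):
        files.extend(os.path.join(path, n) for n in names)
    files.sort()  # make the 70/30 assignment reproducible
    trainingFiles, testFiles = files[:70], files[70:100]  # assumes exactly 100 files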
Training script (diff):

@@ -49,12 +49,6 @@ from nltk.corpus import stopwords
 #################################
 #           FUNCTIONS           #
 #################################
-def endsConLow(word):
-    miregex = re.compile(r'[^aeiouA-Z0-9]$')
-    if miregex.search(word):
-        return True
-    else:
-        return False

 def word2features(sent, i):
     listElem = sent[i].split('|')
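The deleted endsConLow() returned True when a word's final character was neither a lowercase vowel, an uppercase letter, nor a digit, i.e., it flagged words ending in a lowercase consonant or a symbol. A quick check of that behavior:

    import re

    miregex = re.compile(r'[^aeiouA-Z0-9]$')
    print(bool(miregex.search('operon')))  # True: ends in lowercase consonant 'n'
    print(bool(miregex.search('marA')))    # False: ends in uppercase 'A'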
@@ -63,21 +57,9 @@ def word2features(sent, i):
     postag = listElem[2]

     features = {
-        # Suffixes
-        'word[-3:]': word[-3:],
-        'word[-2:]': word[-2:],
-        'word[-1:]': word[-1:],
-        #'word.isupper()': word.isupper(),
         'word': word,
-        'lemma': lemma,
-        'postag': postag,
-        'lemma[-3:]': lemma[-3:],
-        'lemma[-2:]': lemma[-2:],
-        'lemma[-1:]': lemma[-1:],
-        'word[:3]': word[:3],
-        'word[:2]': word[:2],
-        'word[:1]': word[:1],
-        'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
+        #'lemma': lemma,
+        #'postag': postag,
     }
     if i > 0:
         listElem = sent[i - 1].split('|')
@@ -86,8 +68,8 @@ def word2features(sent, i):
         postag1 = listElem[2]
         features.update({
             '-1:word': word1,
-            '-1:lemma': lemma1,
-            '-1:postag': postag1,
+            #'-1:lemma': lemma1,
+            #'-1:postag': postag1,
         })

     if i < len(sent) - 1:
@@ -97,55 +79,10 @@ def word2features(sent, i):
         postag1 = listElem[2]
         features.update({
             '+1:word': word1,
-            '+1:lemma': lemma1,
-            '+1:postag': postag1,
+            #'+1:lemma': lemma1,
+            #'+1:postag': postag1,
         })

-    '''
-    if i > 1:
-        listElem = sent[i - 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '-2:word': word2,
-            '-2:lemma': lemma2,
-        })
-
-    if i < len(sent) - 2:
-        listElem = sent[i + 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '+2:word': word2,
-            '+2:lemma': lemma2,
-        })
-
-    trigrams = False
-    if trigrams:
-        if i > 2:
-            listElem = sent[i - 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '-3:word': word3,
-                '-3:lemma': lemma3,
-            })
-
-        if i < len(sent) - 3:
-            listElem = sent[i + 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '+3:word': word3,
-                '+3:lemma': lemma3,
-            })
-    '''
-    return features
-

 def sent2features(sent):
     return [word2features(sent, i) for i in range(len(sent))]
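Note that this hunk removes 'return features' along with the commented-out window and trigram block, which would leave word2features returning None for every token. Presumably the return was meant to survive the cleanup; a minimal sketch under that assumption, plus an illustrative call (tokens use the word|lemma|postag format the function splits on):

    # ... inside word2features, after the +1 window update:
    return features  # still needed, otherwise every token maps to None

    # Illustrative usage:
    sent = ['Expression|expression|NN', 'of|of|IN', 'marA|mara|NN']
    word2features(sent, 0)  # -> {'word': 'Expression', '+1:word': 'of'}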
@@ -271,24 +208,13 @@ if __name__ == "__main__":
     y_test = [sent2labels(s) for s in sentencesTestData]

     # Fixed parameters
-    # crf = sklearn_crfsuite.CRF(
-    #     algorithm='lbfgs',
-    #     c1=0.1,
-    #     c2=0.1,
-    #     max_iterations=100,
-    #     all_possible_transitions=True
-    # )
-
-    # Hyperparameter Optimization
     crf = sklearn_crfsuite.CRF(
         algorithm='lbfgs',
+        c1=0.1,
+        c2=0.1,
         max_iterations=100,
         all_possible_transitions=True
     )
-    params_space = {
-        'c1': scipy.stats.expon(scale=0.5),
-        'c2': scipy.stats.expon(scale=0.05),
-    }

     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
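This hunk reverts from randomized hyperparameter search to fixed regularization; in sklearn-crfsuite, c1 and c2 are the L1 and L2 penalty coefficients of the lbfgs trainer. For reference, the deleted params_space drew candidate values from exponential distributions, roughly:

    import scipy.stats

    params_space = {
        'c1': scipy.stats.expon(scale=0.5),   # L1 penalty candidates
        'c2': scipy.stats.expon(scale=0.05),  # L2 penalty candidates
    }
    print(params_space['c1'].rvs(3))  # e.g. three sampled c1 values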
@@ -298,18 +224,8 @@ if __name__ == "__main__":
     f1_scorer = make_scorer(metrics.flat_f1_score,
                             average='weighted', labels=labels)

-    # search
-    rs = RandomizedSearchCV(crf, params_space,
-                            cv=10,
-                            verbose=3,
-                            n_jobs=-1,
-                            n_iter=20,
-                            # n_iter=50,
-                            scoring=f1_scorer)
-    rs.fit(X_train, y_train)
-
     # Fixed parameters
-    # crf.fit(X_train, y_train)
+    crf.fit(X_train, y_train)

     # Best hyperparameters
     # crf = rs.best_estimator_
@@ -319,16 +235,13 @@ if __name__ == "__main__":
         oFile.write("********** TRAINING AND TESTING REPORT **********\n")
         oFile.write("Training file: " + options.trainingFile + '\n')
         oFile.write('\n')
-        oFile.write('best params:' + str(rs.best_params_) + '\n')
-        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
-        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
+        oFile.write('best params:' + str(crf.best_params_) + '\n')
+        oFile.write('best CV score:' + str(crf.best_score_) + '\n')
+        oFile.write('model size: {:0.2f}M\n'.format(crf.best_estimator_.size_ / 1000000))

     print("Training done in: %fs" % (time() - t0))
     t0 = time()

-    # Update best crf
-    crf = rs.best_estimator_
-
     # Saving model
     print(" Saving training model...")
     t1 = time()
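One caveat this hunk introduces: a plain fitted sklearn_crfsuite.CRF has no best_params_, best_score_, or best_estimator_ attributes; those belonged to the removed RandomizedSearchCV object, so the new report lines would raise AttributeError. A hedged sketch of what the report could write instead with fixed hyperparameters (size_ appears to be the model-size attribute sklearn-crfsuite sets after fit):

        oFile.write('params: c1={}, c2={}\n'.format(crf.c1, crf.c2))
        oFile.write('model size: {:0.2f}M\n'.format(crf.size_ / 1000000))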
@@ -337,7 +250,7 @@ if __name__ == "__main__":
     joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
     print(" Saving training model done in: %fs" % (time() - t1))

-    # Evaluation against test data
+    # Evaluation against evaluation data
     y_pred = crf.predict(X_test)
     print("*********************************")
     name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
...
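The evaluation code after the truncation point is not shown; a sketch of how the flat metrics behind f1_scorer above apply to the predictions (assuming y_test, y_pred, and labels as defined in the script):

    from sklearn_crfsuite import metrics

    y_pred = crf.predict(X_test)
    print(metrics.flat_f1_score(y_test, y_pred,
                                average='weighted', labels=labels))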