Showing 10 changed files with 13 additions and 270 deletions
Two diffs could not be displayed because they are too large; one file has no preview for its file type.
prepare-abstracts.py deleted 100644 → 0
-# -*- coding: UTF-8 -*-
-
-from optparse import OptionParser
-import os
-import sys
-from time import time
-import re
-
-__author__ = 'CMendezC'
-
-# Objective: Take text-annotated-abstracts-original.txt as input
-# for obtaining abstracts separated in files without tags and collecting dictionary of genes
-# for tagging after NLP pipeline.
-
-# Parameters:
-# 1) --inputPath Input path.
-# 2) --inputFile Input file.
-# 3) --outputPath Output path
-
-# Execution:
-# python3 prepare-abstracts.py
-# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-# --inputFile text-annotated-abstracts.txt
-# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original
-# --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries
-# --dicFile genes.txt
-# python3 prepare-abstracts.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets --inputFile text-annotated-abstracts-original.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/original --dicPath /export/space1/users/compu2/bionlp/nlp-preprocessing-pipeline/dictionaries --dicFile genes.txt
-
-
-if __name__ == "__main__":
-    # Parameter definition
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Input path", metavar="PATH")
-    parser.add_option("--inputFile", dest="inputFile",
-                      help="Input file", metavar="FILE")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Output path", metavar="PATH")
-    parser.add_option("--dicPath", dest="dicPath",
-                      help="Dictionary path", metavar="PATH")
-    parser.add_option("--dicFile", dest="dicFile",
-                      help="Dictionary file", metavar="FILE")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("None parameters indicated.")
-        sys.exit(1)
-
-    # Printing parameter values
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Input path: " + str(options.inputPath))
-    print("Input file", str(options.inputFile))
-    print("Output path: " + str(options.outputPath))
-    print("Dictionary path: " + str(options.dicPath))
-    print("Dictionary file", str(options.dicFile))
-
-    filesWritten = 0
-    t0 = time()
-    hashGenes = {}
-
-    rePmid = re.compile(r'([\d]+)\|t\|')
-    reGene = re.compile(r'<g>([^<]+)</g>')
-    reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')
-    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
-        print("Reading file..." + options.inputFile)
-        for line in iFile:
-            line = line.strip('\r\n')
-            for gene in reGene.findall(line):
-                # print("genes: {}".format(gene))
-                if gene not in hashGenes:
-                    hashGenes[gene] = 1
-                else:
-                    hashGenes[gene] += 1
-            line = reTags.sub('', line)
-            result = rePmid.match(line)
-            if result:
-                line = rePmid.sub('', line)
-                with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
-                    oFile.write(line)
-            else:
-                print("Warning: line without PMID")
-    with open(os.path.join(options.dicPath, options.dicFile), "w", encoding="utf-8", errors="replace") as dFile:
-        for gene in hashGenes.keys():
-            dFile.write("{}\n".format(gene))
-
-
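For context on what was removed: the script's core loop relies on the three regular expressions above to harvest a gene dictionary from `<g>` tags, strip all inline markup, and route each abstract to a per-PMID file. A minimal sketch of that logic on a single line follows; the sample line is hypothetical, inferred from the `rePmid` and `reGene` patterns, not taken from the corpus.

```python
import re

# Same patterns as the deleted script.
rePmid = re.compile(r'([\d]+)\|t\|')
reGene = re.compile(r'<g>([^<]+)</g>')
reTags = re.compile(r'(<g>|</g>|<d>|</d>|<i>|</i>)')

# Hypothetical input line in the "PMID|t|text" format the regexes imply.
line = "12345|t|Expression of <g>marA</g> requires <i>salicylate</i>."

genes = reGene.findall(line)      # ['marA'] -> counted into hashGenes
clean = reTags.sub('', line)      # all <g>/<d>/<i> markup removed
match = rePmid.match(clean)
if match:
    pmid = match.group(1)         # '12345' -> output file 12345.txt
    body = rePmid.sub('', clean)  # abstract text written to that file
    print(pmid, genes, body)
```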
prepare-training-test.py deleted 100644 → 0
-# -*- coding: UTF-8 -*-
-
-from optparse import OptionParser
-import os
-import sys
-from time import time
-
-__author__ = 'CMendezC'
-
-# Objective: Join transformed files for obtaining training and test data sets
-
-# Parameters:
-# 1) --inputPath Path to read files.
-# 2) --trainingFile File name for training data.
-# 3) --testFile File name for test data.
-# 4) --outputPath Path to write files.
-
-# Ouput:
-# 1) Files created.
-
-# Execution:
-# python3.4 prepare-training-test.py
-# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
-# --trainingFile training-data-set-70.txt
-# --testFile test-data-set-30.txt
-# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-# python3.4 prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
-
-###########################################################
-#                      MAIN PROGRAM                       #
-###########################################################
-
-if __name__ == "__main__":
-    # Parameter definition
-    parser = OptionParser()
-    parser.add_option("--inputPath", dest="inputPath",
-                      help="Path to read files", metavar="PATH")
-    parser.add_option("--trainingFile", dest="trainingFile",
-                      help="File for training examples", metavar="FILE")
-    parser.add_option("--testFile", dest="testFile",
-                      help="File for test examples", metavar="FILE")
-    parser.add_option("--outputPath", dest="outputPath",
-                      help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
-
-    (options, args) = parser.parse_args()
-    if len(args) > 0:
-        parser.error("None parameters indicated.")
-        sys.exit(1)
-
-    # Printing parameter values
-    print('-------------------------------- PARAMETERS --------------------------------')
-    print("Path to read files: " + str(options.inputPath))
-    print("File for training examples", str(options.trainingFile))
-    print("File for test examples", str(options.testFile))
-    print("Path to write output files: " + str(options.outputPath))
-
-    t0 = time()
-    trainingDataset = []
-    testDataset = []
-
-    counter = 1
-    for path, dirs, files in os.walk(options.inputPath):
-        # For each file in dir
-        for file in files:
-            if counter <= 70:
-                print(" Joining file {} {} to training data set".format(counter, file))
-                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
-                    for line in iFile:
-                        line = line.strip('\r\n')
-                        trainingDataset.append(line)
-            elif counter > 70 and counter <= 100:
-                print(" Joining file {} {} to test data set".format(counter, file))
-                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
-                    for line in iFile:
-                        line = line.strip('\r\n')
-                        testDataset.append(line)
-            counter += 1
-    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
-        for line in trainingDataset:
-            oFile.write("{}\n".format(line))
-    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
-        for line in testDataset:
-            oFile.write("{}\n".format(line))
-
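The deleted script splits purely by position: the first 70 files that `os.walk` yields go to training and the next 30 to test, so it assumes exactly 100 transformed files and a stable directory order. Below is a sketch of the same idea with a deterministic sort and a proportional cut; both hardening tweaks are assumptions, not the committed behavior.

```python
import os

def split_files(input_path, train_ratio=0.7):
    """Collect files under input_path and split them 70/30 by sorted order."""
    collected = []
    for path, dirs, names in os.walk(input_path):
        # Sorting makes the split deterministic across file systems.
        collected.extend(os.path.join(path, n) for n in sorted(names))
    cut = int(len(collected) * train_ratio)
    return collected[:cut], collected[cut:]  # (training files, test files)
```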
training-validation-original.py deleted 100644 → 0 (diff collapsed)

(modified file; name not shown in this view)
@@ -49,12 +49,6 @@ from nltk.corpus import stopwords
 #################################
 #           FUNCTIONS           #
 #################################
-def endsConLow(word):
-    miregex = re.compile(r'[^aeiouA-Z0-9]$')
-    if miregex.search(word):
-        return True
-    else:
-        return False
 
 def word2features(sent, i):
     listElem = sent[i].split('|')
@@ -63,21 +57,9 @@ def word2features(sent, i):
     postag = listElem[2]
 
     features = {
-        # Suffixes
-        'word[-3:]': word[-3:],
-        'word[-2:]': word[-2:],
-        'word[-1:]': word[-1:],
-        #'word.isupper()': word.isupper(),
         'word': word,
-        'lemma': lemma,
-        'postag': postag,
-        'lemma[-3:]': lemma[-3:],
-        'lemma[-2:]': lemma[-2:],
-        'lemma[-1:]': lemma[-1:],
-        'word[:3]': word[:3],
-        'word[:2]': word[:2],
-        'word[:1]': word[:1],
-        'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
+        #'lemma': lemma,
+        #'postag': postag,
     }
     if i > 0:
         listElem = sent[i - 1].split('|')
@@ -86,8 +68,8 @@ def word2features(sent, i):
         postag1 = listElem[2]
         features.update({
             '-1:word': word1,
-            '-1:lemma': lemma1,
-            '-1:postag': postag1,
+            #'-1:lemma': lemma1,
+            #'-1:postag': postag1,
         })
 
     if i < len(sent) - 1:
@@ -97,55 +79,10 @@ def word2features(sent, i):
         postag1 = listElem[2]
         features.update({
             '+1:word': word1,
-            '+1:lemma': lemma1,
-            '+1:postag': postag1,
+            #'+1:lemma': lemma1,
+            #'+1:postag': postag1,
         })
 
-    '''
-    if i > 1:
-        listElem = sent[i - 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '-2:word': word2,
-            '-2:lemma': lemma2,
-        })
-
-    if i < len(sent) - 2:
-        listElem = sent[i + 2].split('|')
-        word2 = listElem[0]
-        lemma2 = listElem[1]
-        postag2 = listElem[2]
-        features.update({
-            '+2:word': word2,
-            '+2:lemma': lemma2,
-        })
-
-    trigrams = False
-    if trigrams:
-        if i > 2:
-            listElem = sent[i - 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '-3:word': word3,
-                '-3:lemma': lemma3,
-            })
-
-        if i < len(sent) - 3:
-            listElem = sent[i + 3].split('|')
-            word3 = listElem[0]
-            lemma3 = listElem[1]
-            postag3 = listElem[2]
-            features.update({
-                '+3:word': word3,
-                '+3:lemma': lemma3,
-            })
-    '''
-    return features
-
 
 def sent2features(sent):
     return [word2features(sent, i) for i in range(len(sent))]
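One side effect worth flagging: as rendered here, the hunk that strips the suffix, lemma, and POS features also deletes the closing `return features`, which would leave `word2features` returning None for every token. Below is a minimal sketch of the reduced, word-only feature function with the return kept; the `word|lemma|postag` token encoding comes from the original code.

```python
def word2features(sent, i):
    # Tokens are encoded as "word|lemma|postag"; the reduced feature set
    # keeps only the surface word and its immediate neighbors.
    word = sent[i].split('|')[0]
    features = {'word': word}
    if i > 0:
        features['-1:word'] = sent[i - 1].split('|')[0]
    if i < len(sent) - 1:
        features['+1:word'] = sent[i + 1].split('|')[0]
    return features  # the diff as shown drops this line
```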
@@ -271,24 +208,13 @@ if __name__ == "__main__":
     y_test = [sent2labels(s) for s in sentencesTestData]
 
     # Fixed parameters
-    # crf = sklearn_crfsuite.CRF(
-    #     algorithm='lbfgs',
-    #     c1=0.1,
-    #     c2=0.1,
-    #     max_iterations=100,
-    #     all_possible_transitions=True
-    # )
-
-    # Hyperparameter Optimization
     crf = sklearn_crfsuite.CRF(
         algorithm='lbfgs',
+        c1=0.1,
+        c2=0.1,
         max_iterations=100,
         all_possible_transitions=True
     )
-    params_space = {
-        'c1': scipy.stats.expon(scale=0.5),
-        'c2': scipy.stats.expon(scale=0.05),
-    }
 
     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
@@ -298,18 +224,8 @@ if __name__ == "__main__":
     f1_scorer = make_scorer(metrics.flat_f1_score,
                             average='weighted', labels=labels)
 
-    # search
-    rs = RandomizedSearchCV(crf, params_space,
-                            cv=10,
-                            verbose=3,
-                            n_jobs=-1,
-                            n_iter=20,
-                            # n_iter=50,
-                            scoring=f1_scorer)
-    rs.fit(X_train, y_train)
-
     # Fixed parameters
-    # crf.fit(X_train, y_train)
+    crf.fit(X_train, y_train)
 
     # Best hiperparameters
     # crf = rs.best_estimator_
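With the randomized search gone, `f1_scorer` loses its only consumer in this file: the removed `scoring=f1_scorer` argument. If a weighted F1 on the held-out set is still wanted, it can be computed directly after the later `y_pred = crf.predict(X_test)`; a sketch using `sklearn_crfsuite.metrics`, which the script already imports as `metrics`:

```python
# Mirrors what f1_scorer computed inside the removed cross-validated search,
# but on the test predictions instead of CV folds.
f1 = metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)
print("weighted F1: {:.4f}".format(f1))
```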
@@ -319,16 +235,13 @@ if __name__ == "__main__":
         oFile.write("********** TRAINING AND TESTING REPORT **********\n")
         oFile.write("Training file: " + options.trainingFile + '\n')
         oFile.write('\n')
-        oFile.write('best params:' + str(rs.best_params_) + '\n')
-        oFile.write('best CV score:' + str(rs.best_score_) + '\n')
-        oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
+        oFile.write('best params:' + str(crf.best_params_) + '\n')
+        oFile.write('best CV score:' + str(crf.best_score_) + '\n')
+        oFile.write('model size: {:0.2f}M\n'.format(crf.best_estimator_.size_ / 1000000))
 
     print("Training done in: %fs" % (time() - t0))
     t0 = time()
 
-    # Update best crf
-    crf = rs.best_estimator_
-
     # Saving model
     print(" Saving training model...")
     t1 = time()
@@ -337,7 +250,7 @@ if __name__ == "__main__":
     joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
     print(" Saving training model done in: %fs" % (time() - t1))
 
-    # Evaluation against test data
+    # Evaluation against evaluation data
     y_pred = crf.predict(X_test)
     print("*********************************")
     name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.excludeStopWords) + '.fSymbols_' + str(
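Note that the rewritten report lines read `crf.best_params_`, `crf.best_score_`, and `crf.best_estimator_`, but those attributes belong to the removed `RandomizedSearchCV` object `rs`; a plain `sklearn_crfsuite.CRF` fitted with fixed `c1`/`c2` exposes none of them, so these three writes would raise `AttributeError` at run time. A sketch of report lines consistent with fixed hyperparameters follows; this is an assumption about the intended fix, not the committed code.

```python
def write_report(oFile, crf):
    # c1 and c2 are plain constructor parameters, available on any CRF instance.
    oFile.write('params: c1={}, c2={}\n'.format(crf.c1, crf.c2))
    # size_ is set by fit(); the original read it via rs.best_estimator_.size_.
    oFile.write('model size: {:0.2f}M\n'.format(crf.size_ / 1000000))
```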