Showing 4 changed files with 85 additions and 2 deletions
prepare-training-test.py
0 → 100644
+# -*- coding: UTF-8 -*-
+
+from optparse import OptionParser
+import os
+import sys
+from time import time
+
+__author__ = 'CMendezC'
+
+# Objective: join transformed files to obtain the training and test data sets.
+
+# Parameters:
+#   1) --inputPath     Path to read files.
+#   2) --trainingFile  File name for training data.
+#   3) --testFile      File name for test data.
+#   4) --outputPath    Path to write files.
+
+# Output:
+#   1) Files created.
+
+# Execution:
+# python prepare-training-test.py
+#   --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
+#   --trainingFile training-data-set-70.txt
+#   --testFile test-data-set-30.txt
+#   --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
+# python prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
+
+###########################################################
+#                      MAIN PROGRAM                       #
+###########################################################
+
+if __name__ == "__main__":
+    # Parameter definition
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath",
+                      help="Path to read files", metavar="PATH")
+    parser.add_option("--trainingFile", dest="trainingFile",
+                      help="File for training examples", metavar="FILE")
+    parser.add_option("--testFile", dest="testFile",
+                      help="File for test examples", metavar="FILE")
+    parser.add_option("--outputPath", dest="outputPath",
+                      help="Path to write output files", metavar="PATH")
+
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error("This script takes no positional arguments; use only the named options.")
+        sys.exit(1)
+
+    # Print parameter values
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read files: " + str(options.inputPath))
+    print("File for training examples: " + str(options.trainingFile))
+    print("File for test examples: " + str(options.testFile))
+    print("Path to write output files: " + str(options.outputPath))
+
+    t0 = time()
+    trainingDataset = []
+    testDataset = []
+
+    # The input directory is expected to hold 100 transformed files:
+    # the first 70 are joined into the training set, the next 30 into the test set.
+    counter = 1
+    for path, dirs, files in os.walk(options.inputPath):
+        # For each file in dir
+        for file in files:
+            if counter <= 70:
+                print("   Joining file {} to training data set".format(file))
+                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
+                    for line in iFile:
+                        trainingDataset.append(line.strip('\r\n'))
+            elif counter <= 100:
+                print("   Joining file {} to test data set".format(file))
+                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
+                    for line in iFile:
+                        testDataset.append(line.strip('\r\n'))
+            # Advance the counter so files are actually distributed 70/30
+            counter += 1
+
+    # Output files are opened for writing ("w"), not reading
+    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
+        for line in trainingDataset:
+            oFile.write("{}\n".format(line))
+    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
+        for line in testDataset:
+            oFile.write("{}\n".format(line))
+
+    print("Data sets written in {:.3f}s".format(time() - t0))
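Note on the split logic: the counter-based partition above assumes the input directory contains exactly 100 transformed files, taken in whatever order os.walk yields them. Where the corpus size varies, the same 70/30 idea can be expressed proportionally. The sketch below is illustrative only and not part of this commit; the 0.7 ratio, the sorted ordering, and the function name are assumptions.

    # Illustrative sketch (not part of this commit): 70/30 split over an
    # arbitrary number of input files; sorting makes the partition reproducible.
    import os

    def split_file_names(input_path, train_ratio=0.7):
        # Collect every file under input_path in a deterministic order
        names = []
        for path, dirs, files in os.walk(input_path):
            for name in sorted(files):
                names.append(os.path.join(path, name))
        cut = int(len(names) * train_ratio)
        return names[:cut], names[cut:]

    # Hypothetical usage:
    # train_files, test_files = split_file_names("data-sets/transformed")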
training-validation-v1.py
0 → 100644
This diff is collapsed.
@@ -428,7 +428,7 @@ if __name__ == "__main__":
 
     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-    labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
+    labels = list(['GENE'])
 
     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
@@ -436,7 +436,7 @@ if __name__ == "__main__":
 
     # search
     rs = RandomizedSearchCV(crf, params_space,
-                            cv=3,
+                            cv=10,
                             verbose=3,
                             n_jobs=-1,
                             n_iter=20,
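For context, the two changed lines sit inside the usual sklearn-crfsuite hyperparameter search: scoring is now restricted to the single GENE label (so the dominant O tag does not inflate the F1), and cross-validation goes from 3 to 10 folds, trading search time for a more stable estimate of each sampled configuration. The sketch below is only an assumed reconstruction of that surrounding code following the standard sklearn-crfsuite recipe; the CRF settings and params_space values are assumptions, not the contents of the collapsed training script.

    # Assumed context for the changed lines (standard sklearn-crfsuite pattern).
    import scipy.stats
    import sklearn_crfsuite
    from sklearn_crfsuite import metrics
    from sklearn.metrics import make_scorer
    from sklearn.model_selection import RandomizedSearchCV

    crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                               max_iterations=100,
                               all_possible_transitions=True)

    # Regularization distributions to sample from (assumed values)
    params_space = {'c1': scipy.stats.expon(scale=0.5),
                    'c2': scipy.stats.expon(scale=0.05)}

    # Score only the GENE label, as in the change above
    labels = list(['GENE'])
    f1_scorer = make_scorer(metrics.flat_f1_score,
                            average='weighted', labels=labels)

    # 10-fold cross-validation over 20 sampled parameter settings
    rs = RandomizedSearchCV(crf, params_space,
                            cv=10,
                            verbose=3,
                            n_jobs=-1,
                            n_iter=20,
                            scoring=f1_scorer)
    # rs.fit(X_train, y_train)  # features/labels come from the earlier preparation step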