Showing 4 changed files with 85 additions and 2 deletions
prepare-training-test.py  0 → 100644
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time

__author__ = 'CMendezC'

# Objective: Join transformed files to obtain training and test data sets

# Parameters:
# 1) --inputPath Path to read files.
# 2) --trainingFile File name for training data.
# 3) --testFile File name for test data.
# 4) --outputPath Path to write files.

# Output:
# 1) Files created.

# Execution:
# python prepare-training-test.py
# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
# python prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets

###########################################################
#                      MAIN PROGRAM                       #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File for training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File for test examples", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output files", metavar="PATH")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File for training examples: " + str(options.trainingFile))
    print("File for test examples: " + str(options.testFile))
    print("Path to write output files: " + str(options.outputPath))

    t0 = time()
    trainingDataset = []
    testDataset = []

    # The first 70 files go to the training set, files 71-100 to the test set
    counter = 1
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            if counter <= 70:
                print("   Joining file {} to training data set".format(file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        trainingDataset.append(line)
            if counter > 70 and counter <= 100:
                print("   Joining file {} to test data set".format(file))
                with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
                    for line in iFile:
                        line = line.strip('\r\n')
                        testDataset.append(line)
            # Advance to the next file; without this increment every file
            # would end up in the training set
            counter += 1

    # Output files must be opened for writing ("w"), not reading ("r")
    with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in trainingDataset:
            oFile.write("{}\n".format(line))
    with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
        for line in testDataset:
            oFile.write("{}\n".format(line))

    # Report elapsed time
    print("Data sets written in: %fs" % (time() - t0))
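The script above hard-codes a 70/100 cutoff, so it implicitly assumes the transformed directory holds about 100 files and silently ignores anything beyond the 100th. A minimal sketch of the same split with the cutoff derived from the actual file count; the function name split_files and the 0.7 ratio are illustrative assumptions, not part of the committed script:

import os

def split_files(input_path, train_ratio=0.7):
    """Split the files under input_path into training and test lists.

    Sorting makes the split deterministic; the ratio replaces the
    hard-coded 70/100 counter used in prepare-training-test.py.
    """
    all_files = []
    for path, dirs, files in os.walk(input_path):
        for name in sorted(files):
            all_files.append(os.path.join(path, name))
    cutoff = int(len(all_files) * train_ratio)
    return all_files[:cutoff], all_files[cutoff:]

# Example: train_files, test_files = split_files("data-sets/transformed")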
training-validation-v1.py  0 → 100644
(diff collapsed; contents not shown)
@@ -428,7 +428,7 @@ if __name__ == "__main__":
 
     # Original: labels = list(crf.classes_)
     # Original: labels.remove('O')
-    labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
+    labels = list(['GENE'])
 
     # use the same metric for evaluation
     f1_scorer = make_scorer(metrics.flat_f1_score,
@@ -436,7 +436,7 @@ if __name__ == "__main__":
 
     # search
     rs = RandomizedSearchCV(crf, params_space,
-                            cv=3,
+                            cv=10,
                             verbose=3,
                             n_jobs=-1,
                             n_iter=20,
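This hunk narrows the evaluation labels to 'GENE' and raises the cross-validation fold count of the randomized hyperparameter search from 3 to 10. For context, a minimal sketch of how such a block typically fits together with sklearn-crfsuite and scikit-learn, assuming X_train/y_train are the usual per-sentence lists of feature dicts and tag sequences; the CRF settings and the c1/c2 search distributions are illustrative assumptions, not values taken from the repository:

import scipy.stats
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

# CRF estimator whose c1/c2 regularization weights will be searched
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           max_iterations=100,
                           all_possible_transitions=True)

# Illustrative search space: exponential priors over the L1/L2 weights
params_space = {'c1': scipy.stats.expon(scale=0.5),
                'c2': scipy.stats.expon(scale=0.05)}

# Score only the entity label kept in the diff ('GENE'), ignoring 'O'
labels = ['GENE']
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# Randomized search with the 10-fold setting introduced by the hunk
rs = RandomizedSearchCV(crf, params_space,
                        cv=10,
                        verbose=3,
                        n_jobs=-1,
                        n_iter=20,
                        scoring=f1_scorer)
# rs.fit(X_train, y_train)   # X_train: feature dicts, y_train: tag sequences
# print(rs.best_params_, rs.best_score_)

With only one entity label, the weighted flat F1 score reduces to the F1 of 'GENE' mentions alone, which is presumably why the 'O' tag is excluded here.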