Carlos-Francisco Méndez-Cruz

Obtaining training and test data sets

1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +
8 +__author__ = 'CMendezC'
9 +
10 +# Objective: Join transformed files for obtaining training and test data sets
11 +
12 +# Parameters:
13 +# 1) --inputPath Path to read files.
14 +# 2) --trainingFile File name for training data.
15 +# 3) --testFile File name for test data.
16 +# 4) --outputPath Path to write files.
17 +
18 +# Output:
19 +# 1) Files created.
20 +
21 +# Execution:
22 +# python prepare-training-test.py
23 +# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
24 +# --trainingFile training-data-set-70.txt
25 +# --testFile test-data-set-30.txt
26 +# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
27 +# python prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
28 +
29 +###########################################################
30 +# MAIN PROGRAM #
31 +###########################################################
32 +
def collect_datasets(input_path, training_count=70, total_count=100):
    """Read files under *input_path* and split their lines into two data sets.

    Files are visited in sorted order (directories and file names) so the
    split is reproducible across runs and file systems. The first
    *training_count* files feed the training set, the following files up to
    *total_count* feed the test set, and any remaining files are ignored.

    Args:
        input_path: Directory that is walked recursively for input files.
        training_count: Number of files assigned to the training set.
        total_count: Total number of files consumed (training + test).

    Returns:
        A ``(training_lines, test_lines)`` tuple of lists of strings, each
        line stripped of its trailing CR/LF characters.
    """
    training_lines = []
    test_lines = []
    counter = 1
    for path, dirs, files in os.walk(input_path):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file_name in sorted(files):
            if counter <= training_count:
                label, target = "training", training_lines
            elif counter <= total_count:
                label, target = "test", test_lines
            else:
                # Every file past total_count is ignored, so stop early.
                return training_lines, test_lines
            print(" Joining file {} to {} data set".format(file_name, label))
            with open(os.path.join(path, file_name), "r",
                      encoding="utf-8", errors="replace") as handle:
                target.extend(line.strip('\r\n') for line in handle)
            # The original never advanced the counter, so every file ended
            # up in the training set and the test set stayed empty.
            counter += 1
    return training_lines, test_lines


def write_lines(file_path, lines):
    """Write each string in *lines* to *file_path*, one per line, as UTF-8."""
    # Mode "w" is required: the original opened the file with "r", which
    # makes every write() call fail.
    with open(file_path, "w", encoding="utf-8", errors="replace") as handle:
        handle.writelines("{}\n".format(line) for line in lines)


if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read files", metavar="PATH")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File for training examples", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File for test examples", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # Positional arguments are not accepted; parser.error() prints the
        # message and exits, the explicit exit is kept as a safety net.
        parser.error("None parameters indicated.")
        sys.exit(1)

    # All four options are needed below; fail early with a usage message
    # instead of a TypeError from os.walk()/os.path.join() on None.
    missing = [name for name in ("inputPath", "trainingFile", "testFile", "outputPath")
               if getattr(options, name) is None]
    if missing:
        parser.error("Missing required option(s): "
                     + ", ".join("--" + name for name in missing))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read files: " + str(options.inputPath))
    print("File for training examples", str(options.trainingFile))
    print("File for test examples", str(options.testFile))
    print("Path to write output files: " + str(options.outputPath))

    # First 70 files -> training data set, next 30 files -> test data set.
    trainingDataset, testDataset = collect_datasets(options.inputPath)

    write_lines(os.path.join(options.outputPath, options.trainingFile), trainingDataset)
    write_lines(os.path.join(options.outputPath, options.testFile), testDataset)
83 +
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
...@@ -428,7 +428,7 @@ if __name__ == "__main__": ...@@ -428,7 +428,7 @@ if __name__ == "__main__":
428 428
429 # Original: labels = list(crf.classes_) 429 # Original: labels = list(crf.classes_)
430 # Original: labels.remove('O') 430 # Original: labels.remove('O')
431 - labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']) 431 + labels = list(['GENE'])
432 432
433 # use the same metric for evaluation 433 # use the same metric for evaluation
434 f1_scorer = make_scorer(metrics.flat_f1_score, 434 f1_scorer = make_scorer(metrics.flat_f1_score,
...@@ -436,7 +436,7 @@ if __name__ == "__main__": ...@@ -436,7 +436,7 @@ if __name__ == "__main__":
436 436
437 # search 437 # search
438 rs = RandomizedSearchCV(crf, params_space, 438 rs = RandomizedSearchCV(crf, params_space,
439 - cv=3, 439 + cv=10,
440 verbose=3, 440 verbose=3,
441 n_jobs=-1, 441 n_jobs=-1,
442 n_iter=20, 442 n_iter=20,
......