Carlos-Francisco Méndez-Cruz

Obtaining training and test data sets

1 +# -*- coding: UTF-8 -*-
2 +
3 +from optparse import OptionParser
4 +import os
5 +import sys
6 +from time import time
7 +
8 +__author__ = 'CMendezC'
9 +
10 +# Objective: Join transformed files for obtaining training and test data sets
11 +
12 +# Parameters:
13 +# 1) --inputPath Path to read files.
14 +# 2) --trainingFile File name for training data.
15 +# 3) --testFile File name for test data.
16 +# 4) --outputPath Path to write files.
17 +
18 +# Output:
19 +# 1) Files created.
20 +
21 +# Execution:
22 +# python prepare-training-test.py
23 +# --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed
24 +# --trainingFile training-data-set-70.txt
25 +# --testFile test-data-set-30.txt
26 +# --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
27 +# python prepare-training-test.py --inputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /export/space1/users/compu2/bionlp/conditional-random-fields/data-sets
28 +
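A note on the expected input (inferred from the companion scripts in this commit, not stated here): each transformed file is assumed to contain one sentence per line, with tokens encoded as word|lemma|tag, for example:

Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP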
29 +###########################################################
30 +# MAIN PROGRAM #
31 +###########################################################
32 +
33 +if __name__ == "__main__":
34 + # Parameter definition
35 + parser = OptionParser()
36 + parser.add_option("--inputPath", dest="inputPath",
37 + help="Path to read files", metavar="PATH")
38 + parser.add_option("--trainingFile", dest="trainingFile",
39 + help="File for training examples", metavar="FILE")
40 + parser.add_option("--testFile", dest="testFile",
41 + help="File for test examples", metavar="FILE")
42 + parser.add_option("--outputPath", dest="outputPath",
43 + help="Path to write output files.", metavar="PATH")
44 +
45 + (options, args) = parser.parse_args()
46 + if len(args) > 0:
47 + parser.error("This script takes no positional arguments.")
48 + sys.exit(1)
49 +
50 + # Printing parameter values
51 + print('-------------------------------- PARAMETERS --------------------------------')
52 + print("Path to read files: " + str(options.inputPath))
53 + print("File for training examples", str(options.trainingFile))
54 + print("File for test examples", str(options.testFile))
55 + print("Path to write output files: " + str(options.outputPath))
56 +
57 + t0 = time()
58 + trainingDataset = []
59 + testDataset = []
60 +
61 + counter = 1
62 + for path, dirs, files in os.walk(options.inputPath):
63 + # For each file in dir
64 + for file in files:
65 + if counter <= 70:
66 + print(" Joining file {} to training data set".format(file))
67 + with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
68 + for line in iFile:
69 + line = line.strip('\r\n')
70 + trainingDataset.append(line)
71 + elif counter <= 100:
72 + print(" Joining file {} to test data set".format(file))
73 + with open(os.path.join(path, file), "r", encoding="utf-8", errors="replace") as iFile:
74 + for line in iFile:
75 + line = line.strip('\r\n')
76 + testDataset.append(line)
77 + # Advance the file counter so files after the first 70 go to the test set
78 + counter += 1
79 + # Output files are opened for writing ("w"), not reading
80 + with open(os.path.join(options.outputPath, options.trainingFile), "w", encoding="utf-8", errors="replace") as oFile:
81 + for line in trainingDataset:
82 + oFile.write("{}\n".format(line))
83 + with open(os.path.join(options.outputPath, options.testFile), "w", encoding="utf-8", errors="replace") as oFile:
84 + for line in testDataset:
85 + oFile.write("{}\n".format(line))
86 +
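The loop above assigns the first 70 files that os.walk happens to yield to the training set and the next 30 to the test set, so the split depends on directory traversal order. As a point of comparison only (not part of the committed script), a minimal sketch of the same 70/30 idea over an explicitly sorted file list; the path in the comment is the one from the execution example above:

import os

def split_file_names(input_path, n_training=70):
    # Sort the file names so the 70/30 split is reproducible across runs
    names = sorted(f for f in os.listdir(input_path)
                   if os.path.isfile(os.path.join(input_path, f)))
    return names[:n_training], names[n_training:]

# training_names, test_names = split_file_names(
#     "/export/space1/users/compu2/bionlp/conditional-random-fields/data-sets/transformed")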
1 -# -*- coding: UTF-8 -*-
2 -
3 -from optparse import OptionParser
4 -import os
5 -import sys
6 -from time import time
7 -import json
8 -from nltk.corpus import stopwords
9 -
10 -__author__ = 'CMendezC'
11 -
12 -# Objective: Take transformed file with format word|lemma|tag,
13 -# for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
14 -# and create file with an additional tagging for CRF training. For example:
15 -# the|the|dt N-terminal|N-terminal|NN| domain|domain|NN -->
16 -# the|the|dt|O N-terminal|N-terminal|DPOS domain|NN|O
17 -# Additionally, we are going to aTag frequent words belonging to each aspect with corresponding aspect aTag (DOM or RP).
18 -# We expect that these words are going to have one aTag in some context and different one in others.
19 -# The frequent words were obtained by another program (corpusAnalysis) and save into files that are loaded here.
20 -# In output file we only maintain the lemma and the tag or the word and the tag.
21 -# This additional tagging is going to give us clues for aspect classification.
22 -
23 -# Parameters:
24 -# 1) --inputPath Path to read files.
25 -# 2) --trainingFile File name with training data.
26 -# 3) --testFile File name with test data.
27 -# 4) --outputPath Path to write files. File names are concatenated with feature name.
28 -# 5) ELIMINATED --feature Type of feature to extract and create file: lemma
29 -# 6) --termPath Path to read term files
30 -# 7) --termFiles JSON file with terms files and tags
31 -# 8) --termPath Path to read JSON file with information about frequent words files
32 -# 9) --inputFileFreq JSON file with information about frequent words
33 -# 10 --skip=N Skip N words to form skip mentions
34 -# 11) --stopWords Filtering stop words
35 -# 12) --filterPunctMarks Filtering punctuation marks
36 -
37 -# Ouput:
38 -# 1) Files created. Name of feature is concatenated
39 -
40 -# Execution:
41 -# ASPECTS
42 -# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
43 -
44 -# SENTENCES
45 -# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
46 -
47 -# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
48 -# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
49 -# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
50 -# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
51 -# ¿? --SKIP
52 -
53 -def getSkipMentions(aList, aSkip):
54 - hashTemp = {}
55 - for j in range(0, aSkip):
56 - listTemp = []
57 - for i in range(0, len(aList), aSkip+1):
58 - listTemp.append(aList[i + j])
59 - hashTemp[j] = listTemp
60 - return hashTemp
61 -
62 -###########################################################
63 -# MAIN PROGRAM #
64 -###########################################################
65 -
66 -if __name__ == "__main__":
67 - # Parameter definition
68 - parser = OptionParser()
69 - parser.add_option("--inputPath", dest="inputPath",
70 - help="Path to read files", metavar="PATH")
71 - parser.add_option("--trainingFile", dest="trainingFile",
72 - help="File with training examples", metavar="FILE")
73 - parser.add_option("--testFile", dest="testFile",
74 - help="File with test examples", metavar="FILE")
75 - parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
76 - help="File with training classes", metavar="FILE")
77 - parser.add_option("--testClassesFile", dest="testClassesFile",
78 - help="File with test classes", metavar="FILE")
79 - parser.add_option("--outputPath", dest="outputPath",
80 - help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
81 - parser.add_option("--termPath", dest="termPath",
82 - help="Path to read term files", metavar="PATH")
83 - parser.add_option("--termFiles", dest="termFiles",
84 - help="JSON file with terms files and tags", metavar="PATH")
85 - parser.add_option("--inputFileFreq", dest="inputFileFreq",
86 - help="JSON file with information about frequent words", metavar="PATH")
87 - parser.add_option("--skip", type="int",
88 - dest="skip", default=0,
89 - help="Skip mentions", metavar="N")
90 - parser.add_option("--filterStopWords", default=False,
91 - action="store_true", dest="filterStopWords",
92 - help="Filtering stop words")
93 - parser.add_option("--filterPunctMarks", default=False,
94 - action="store_true", dest="filterPunctMarks",
95 - help="Filtering punctuation marks")
96 -
97 - (options, args) = parser.parse_args()
98 - if len(args) > 0:
99 - parser.error("None parameters indicated.")
100 - sys.exit(1)
101 -
102 - # Printing parameter values
103 - print('-------------------------------- PARAMETERS --------------------------------')
104 - print("Path to read files: " + str(options.inputPath))
105 - print("File with training examples", str(options.trainingFile))
106 - print("File with test examples", str(options.testFile))
107 - print("File with training classes", str(options.trainingClassesFile))
108 - print("File with test classes", str(options.testClassesFile))
109 - print("File with training classes", str(options.trainingClassesFile))
110 - print("File with test classes", str(options.testClassesFile))
111 - print("Path to write output files: " + str(options.outputPath))
112 - print("JSON file with information about frequent words: " + str(options.inputFileFreq))
113 - print("Skip mentions: " + str(options.skip))
114 - print("Filtering stop words: " + str(options.stopWords))
115 - punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
116 - print("Filtering puntuation marks " + str(punctMarks) + ': '+ str(options.filterPunctMarks))
117 -
118 - filesRead = 0
119 - t0 = time()
120 -
121 - print('Loading biological term files...')
122 - with open(os.path.join(options.termPath, options.termFiles)) as data_file:
123 - hashes = json.load(data_file)
124 - print(' Loading biological term files... done')
125 -
126 - hashTagAspect = hashes["hashTagAspect"]
127 -
128 - print('Loading frequent words...')
129 - with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
130 - hashAspectFreqWords = json.load(data_file)
131 - print(' Loading frequent words... done')
132 -
133 - listFiles = [options.trainingFile, options.testFile]
134 - listClassesFiles = [options.trainingClassesFile, options.testClassesFile]
135 -
136 - for iFile, cFile in zip(listFiles, listClassesFiles):
137 - with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
138 - print("Reading file..." + iFile)
139 - lines = [l.strip('\n') for l in tFile.readlines()]
140 - filesRead += 1
141 - with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
142 - print("Reading file..." + cFile)
143 - classes = [c.strip('\n') for c in clFile.readlines()]
144 - listLines = []
145 - print("Processing files... ")
146 - for line, c in zip(lines, classes):
147 - # print("class: ", c)
148 - listTokenLine = []
149 - # listLemmaLine = []
150 - for tok in line.split():
151 - tokList = tok.split("|")
152 - word = tokList[0]
153 - lemma = tokList[1]
154 - tag = tokList[2]
155 - # Filtering stopwords
156 - if options.stopWords:
157 - if lemma in stopwords.words('english'):
158 - continue
159 - if options.filterPunctMarks:
160 - if lemma in punctMarks:
161 - continue
162 - # if tag in hashTagAspect:
163 - # We change tag for aspect tag only in the case of aspect tag coincide with class.
164 - # We want that CRF learn when to change term tag to aspect tag in correct context
165 - if tag in hashTagAspect:
166 - if hashTagAspect[tag] == c:
167 - aTag = hashTagAspect[tag]
168 - else:
169 - aTag = 'O'
170 - else:
171 - if c in hashAspectFreqWords:
172 - # print("class: ", c)
173 - hashFreqWords = hashAspectFreqWords[c]
174 - # We verify if word or lemma is in frequent words.
175 - # These frequent words are word-forms (tokens)
176 - if word.lower() in hashFreqWords or lemma in hashFreqWords:
177 - aTag = c
178 - else:
179 - aTag = 'O'
180 - else:
181 - aTag = 'O'
182 - listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
183 - # if feature == "word":
184 - listLines.append(listTokenLine)
185 - # if feature == "lemma":
186 - # listLines = listLemmaLine.strip() + '\n'
187 - if options.skip > 0:
188 - t0 = time()
189 - skipTemp = options.skip
190 - for i in range(1, options.skip):
191 - hashTemp = getSkipMentions(listLines, skipTemp)
192 - # skipTemp -= 1
193 - for key in hashTemp:
194 - listLines = hashTemp[key]
195 - with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
196 - options.stopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(skipTemp) + '.txt')), "w",
197 - encoding="utf-8") as oFile:
198 - for line in listLines:
199 - oFile.write(line)
200 - print("Skip mention done in: %fs" % (time() - t0))
201 - else:
202 - with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
203 - options.stopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(options.skip) + '.txt')), "w",
204 - encoding="utf-8") as oFile:
205 - for line in listLines:
206 - for token in line:
207 - oFile.write(token + ' ')
208 - oFile.write('\n')
209 -
210 - print("Files processed: " + str(filesRead))
\ No newline at end of file
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +from itertools import chain
5 +from optparse import OptionParser
6 +from time import time
7 +from collections import Counter
8 +
9 +import nltk
10 +import sklearn
11 +import scipy.stats
12 +import sys
13 +
14 +from sklearn.externals import joblib
15 +from sklearn.metrics import make_scorer
16 +from sklearn.cross_validation import cross_val_score
17 +from sklearn.grid_search import RandomizedSearchCV
18 +
19 +import sklearn_crfsuite
20 +from sklearn_crfsuite import scorers
21 +from sklearn_crfsuite import metrics
22 +
23 +from nltk.corpus import stopwords
24 +
25 +
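A portability note on the imports above: sklearn.cross_validation, sklearn.grid_search and sklearn.externals.joblib match the scikit-learn releases this script targets; in current scikit-learn these modules have been removed. A sketch of the equivalent imports, assuming scikit-learn >= 0.20 and the standalone joblib package:

import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score, RandomizedSearchCV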
26 +# Objective
27 +# Training and evaluation of CRFs with sklearn-crfsuite.
28 +#
29 +# Input parameters
30 +# --inputPath=PATH Path of training and test data set
31 +# --trainingFile File with training data set
32 +# --testFile File with test data set
33 +# --outputPath=PATH Output path to place output files
34 +# --filterStopWords Filtering stop words
35 +# --filterSymbols Filtering punctuation marks
36 +
37 +# Output
38 +# 1) Best model
39 +
40 +# Examples
41 +# Sentences
42 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
43 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
44 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
45 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt
46 +
47 +# Aspects
48 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
49 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
50 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
51 +# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt
52 +
53 +#################################
54 +# FUNCTIONS #
55 +#################################
56 +
57 +def wordSize(text):
58 + lWord = len(text)
59 + if lWord == 1:
60 + return '1'
61 + elif lWord == 2:
62 + return '2'
63 + elif lWord == 3:
64 + return '3'
65 + elif lWord == 4:
66 + return '4'
67 + elif lWord == 5:
68 + return '5'
69 + elif 6 <= lWord <= 10:
70 + return '6-10'
71 + elif 11 <= lWord <= 15:
72 + return '11-15'
73 + elif 16 <= lWord <= 20:
74 + return '16-20'
75 + elif 21 <= lWord <= 30:
76 + return '21-30'
77 + else:
78 + return '>30'
79 +
80 +def hasUpperLower(text):
81 + has = False
82 + if len(text) < 3:
83 + return False
84 + regexUp = nltk.re.compile('[A-Z]')
85 + regexLo = nltk.re.compile('[a-z]')
86 + if (regexUp.search(text) != None) and (regexLo.search(text) != None):
87 + has = True
88 + return has
89 +
90 +def hasDigit(text):
91 + has = False
92 + if len(text) < 3:
93 + return False
94 + myRegex = nltk.re.compile('[0-9]')
95 + if myRegex.search(text) != None:
96 + has = True
97 + return has
98 +
99 +
100 +def hasNonAlphaNum(text):
101 + has = False
102 + if len(text) < 3:
103 + return False
104 + myRegex = nltk.re.compile('\W')
105 + if myRegex.search(text) != None:
106 + has = True
107 + return has
108 +
109 +def word2features(sent, i):
110 + # print "i: " + str(i)
111 + # print "sent[i]" + sent[i]
112 + listElem = sent[i].split('|')
113 + word = listElem[0]
114 + lemma = listElem[1]
115 + postag = listElem[2]
116 +
117 + features = {
118 + # Omitted because TF and gene names vary in upper/lower case: 'word.lower()': word.lower(),
119 + # Suffixes
120 + 'word[-3:]': word[-3:],
121 + 'word[-2:]': word[-2:],
122 + 'word[-1:]': word[-1:],
123 + 'word.isupper()': word.isupper(),
124 + 'word.istitle()': word.istitle(),
125 + 'word.hasDigit()': hasDigit(word),
126 + 'word.hasNonAlphaNum': hasNonAlphaNum(word),
127 + # 'word.hasUpperLower': hasUpperLower(word),
128 + #'wordSize': wordSize(word),
129 + # 'word.isdigit()': word.isdigit(),
130 + 'word': word,
131 + 'lemma': lemma,
132 + 'lemma[-3:]': lemma[-3:],
133 + 'lemma[-2:]': lemma[-2:],
134 + 'lemma[-1:]': lemma[-1:],
135 + 'postag': postag,
136 + # Prefixes
137 + 'postag[:2]': postag[:2],
138 + 'postag[:1]': postag[:1],
139 + }
140 + if i > 0:
141 + listElem = sent[i - 1].split('|')
142 + word1 = listElem[0]
143 + lemma1 = listElem[1]
144 + postag1 = listElem[2]
145 + features.update({
146 + '-1:word.lower()': word1.lower(),
147 + '-1:word.istitle()': word1.istitle(),
148 + '-1:word.isupper()': word1.isupper(),
149 + '-1:word.hasDigit()': hasDigit(word1),
150 + '-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
151 + # '-1:word.hasUpperLower': hasUpperLower(word1),
152 + '-1:word': word1,
153 + '-1:lemma': lemma1,
154 + '-1:postag': postag1,
155 + '-1:postag[:2]': postag1[:2],
156 + '-1:postag[:1]': postag1[:1],
157 + })
158 + # else:
159 + # features['BOS'] = True
160 +
161 + if i < len(sent) - 1:
162 + listElem = sent[i + 1].split('|')
163 + word1 = listElem[0]
164 + lemma1 = listElem[1]
165 + postag1 = listElem[2]
166 + features.update({
167 + '+1:word.lower()': word1.lower(),
168 + '+1:word.istitle()': word1.istitle(),
169 + '+1:word.isupper()': word1.isupper(),
170 + '+1:word.hasDigit()': hasDigit(word1),
171 + '+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
172 + # '+1:word.hasUpperLower': hasUpperLower(word1),
173 + '+1:word': word1,
174 + '+1:lemma': lemma1,
175 + '+1:postag': postag1,
176 + '+1:postag[:2]': postag1[:2],
177 + '+1:postag[:1]': postag1[:1],
178 + })
179 + # else:
180 + # features['EOS'] = True
181 + if i > 1:
182 + listElem = sent[i - 2].split('|')
183 + word2 = listElem[0]
184 + lemma2 = listElem[1]
185 + postag2 = listElem[2]
186 + features.update({
187 + '-2:word.lower()': word2.lower(),
188 + '-2:word.istitle()': word2.istitle(),
189 + '-2:word.isupper()': word2.isupper(),
190 + '-2:word.hasDigit()': hasDigit(word2),
191 + '-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
192 + # '-2:word.hasUpperLower': hasUpperLower(word2),
193 + '-2:word': word2,
194 + '-2:lemma': lemma2,
195 + '-2:postag': postag2,
196 + '-2:postag[:2]': postag2[:2],
197 + '-2:postag[:1]': postag2[:1],
198 + })
199 +
200 + if i < len(sent) - 2:
201 + listElem = sent[i + 2].split('|')
202 + word2 = listElem[0]
203 + lemma2 = listElem[1]
204 + postag2 = listElem[2]
205 + features.update({
206 + '+2:word.lower()': word2.lower(),
207 + '+2:word.istitle()': word2.istitle(),
208 + '+2:word.isupper()': word2.isupper(),
209 + '+2:word.hasDigit()': hasDigit(word2),
210 + '+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
211 + # '+2:word.hasUpperLower': hasUpperLower(word2),
212 + '+2:word': word2,
213 + '+2:lemma': lemma2,
214 + '+2:postag': postag2,
215 + '+2:postag[:2]': postag2[:2],
216 + '+2:postag[:1]': postag2[:1],
217 + })
218 +
219 + trigrams = False
220 + if trigrams:
221 + if i > 2:
222 + listElem = sent[i - 3].split('|')
223 + word3 = listElem[0]
224 + lemma3 = listElem[1]
225 + postag3 = listElem[2]
226 + features.update({
227 + '-3:word.lower()': word3.lower(),
228 + '-3:word.istitle()': word3.istitle(),
229 + '-3:word.isupper()': word3.isupper(),
230 + '-3:word.hasDigit()': hasDigit(word3),
231 + '-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
232 + # '-3:word.hasUpperLower': hasUpperLower(word3),
233 + '-3:word': word3,
234 + '-3:lemma': lemma3,
235 + '-3:postag': postag3,
236 + '-3:postag[:2]': postag3[:2],
237 + '-3:postag[:1]': postag3[:1],
238 + })
239 +
240 + if i < len(sent) - 3:
241 + listElem = sent[i + 3].split('|')
242 + word3 = listElem[0]
243 + lemma3 = listElem[1]
244 + postag3 = listElem[2]
245 + features.update({
246 + '+3:word.lower()': word3.lower(),
247 + '+3:word.istitle()': word3.istitle(),
248 + '+3:word.isupper()': word3.isupper(),
249 + '+3:word.hasDigit()': hasDigit(word3),
250 + '+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
251 + # '+3:word.hasUpperLower': hasUpperLower(word3),
252 + '+3:word': word3,
253 + '+3:lemma': lemma3,
254 + '+3:postag': postag3,
255 + '+3:postag[:2]': postag3[:2],
256 + '+3:postag[:1]': postag3[:1],
257 + })
258 +
259 + return features
260 +
261 +
262 +def sent2features(sent):
263 + return [word2features(sent, i) for i in range(len(sent))]
264 +
265 +
266 +def sent2labels(sent):
267 + return [elem.split('|')[3] for elem in sent]
268 + # return [label for token, postag, label in sent]
269 +
270 +
271 +def sent2tokens(sent):
272 + return [elem.split('|')[0] for elem in sent] # tokens are word|lemma|tag|aTag strings
273 +
274 +
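For reference, a toy example of how the helpers above consume the word|lemma|tag|aTag token format (the sentence and its GENE label are invented for illustration):

sample = "Fis|fis|NN|GENE activates|activate|VBZ|O transcription|transcription|NN|O".split()
X_sample = sent2features(sample)  # one feature dict per token
y_sample = sent2labels(sample)    # ['GENE', 'O', 'O']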
275 +def print_transitions(trans_features, f):
276 + for (label_from, label_to), weight in trans_features:
277 + # f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
278 + # f.write("label_from :" + label_from)
279 + # f.write("label_to :" + label_to)
280 + # f.write("label_weight :" + weight)
281 + # f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
282 + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
283 +
284 +
285 +def print_state_features(state_features, f):
286 + for (attr, label), weight in state_features:
287 + # f.write("%0.6f %-8s %s\n" % (weight, label, attr))
288 + # f.write(attr.encode("utf-8"))
289 + # '{:06.2f}'.format(3.141592653589793)
290 + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
291 +
292 +
293 +__author__ = 'CMendezC'
294 +
295 +##########################################
296 +# MAIN PROGRAM #
297 +##########################################
298 +
299 +if __name__ == "__main__":
300 + # Defining parameters
301 + parser = OptionParser()
302 + parser.add_option("--inputPath", dest="inputPath",
303 + help="Path of training data set", metavar="PATH")
304 + parser.add_option("--outputPath", dest="outputPath",
305 + help="Output path to place output files",
306 + metavar="PATH")
307 + parser.add_option("--trainingFile", dest="trainingFile",
308 + help="File with training data set", metavar="FILE")
309 + parser.add_option("--testFile", dest="testFile",
310 + help="File with test data set", metavar="FILE")
311 + parser.add_option("--filterStopWords", default=False,
312 + action="store_true", dest="filterStopWords",
313 + help="Filtering stop words")
314 + parser.add_option("--filterSymbols", default=False,
315 + action="store_true", dest="filterSymbols",
316 + help="Filtering punctuation marks")
317 +
318 + (options, args) = parser.parse_args()
319 + if len(args) > 0:
320 + parser.error("This script takes no positional arguments.")
321 + sys.exit(1)
322 +
323 + print('-------------------------------- PARAMETERS --------------------------------')
324 + print("Path of training data set: " + options.inputPath)
325 + print("File with training data set: " + str(options.trainingFile))
326 + print("Path of test data set: " + options.inputPath)
327 + print("File with test data set: " + str(options.testFile))
328 + print("Filtering stop words: " + str(options.filterStopWords))
329 + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
330 + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
331 + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
332 +
333 + print('-------------------------------- PROCESSING --------------------------------')
334 + print('Reading corpus...')
335 + t0 = time()
336 +
337 + sentencesTrainingData = []
338 + sentencesTestData = []
339 +
340 + stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
341 +
342 + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
343 + # with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
344 + for line in iFile.readlines():
345 + listLine = []
346 + line = line.decode("utf-8")
347 + for token in line.strip('\n').split():
348 + if options.filterStopWords:
349 + listToken = token.split('|')
350 + lemma = listToken[1]
351 + # Original: if lemma in stopwords.words('english'):
352 + # trainingTesting_Sklearn_crfsuite.py:269:
353 + # UnicodeWarning: Unicode equal comparison failed to
354 + # convert both arguments to Unicode -
355 + # interpreting them as being unequal
356 + if lemma in stopwords:
357 + continue
358 + if options.filterSymbols:
359 + listToken = token.split('|')
360 + lemma = listToken[1]
361 + if lemma in symbols:
362 + # if lemma == ',':
363 + # print "Comma , identified"
364 + continue
365 + listLine.append(token)
366 + sentencesTrainingData.append(listLine)
367 + print(" Sentences training data: " + str(len(sentencesTrainingData)))
368 + # print sentencesTrainingData[0]
369 +
370 + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
371 + # with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
372 + for line in iFile.readlines():
373 + listLine = []
374 + line = line.decode("utf-8")
375 + for token in line.strip('\n').split():
376 + if options.filterStopWords:
377 + listToken = token.split('|')
378 + lemma = listToken[1]
379 + # Original if lemma in stopwords.words('english'):
380 + if lemma in stopwords:
381 + continue
382 + if options.filterSymbols:
383 + listToken = token.split('|')
384 + lemma = listToken[1]
385 + if lemma in symbols:
386 + # if lemma == ',':
387 + # print "Comma , identified"
388 + continue
389 + listLine.append(token)
390 + sentencesTestData.append(listLine)
391 + print(" Sentences test data: " + str(len(sentencesTestData)))
392 + # print sentencesTestData[0]
393 +
394 + print("Reading corpus done in: %fs" % (time() - t0))
395 +
396 + print(sent2features(sentencesTrainingData[0])[0])
397 + print(sent2features(sentencesTestData[0])[0])
398 + # print(sent2labels(sentencesTrainingData[0]))
399 + # print(sent2labels(sentencesTestData[0]))
400 + t0 = time()
401 +
402 + X_train = [sent2features(s) for s in sentencesTrainingData]
403 + y_train = [sent2labels(s) for s in sentencesTrainingData]
404 +
405 + X_test = [sent2features(s) for s in sentencesTestData]
406 + # print X_test
407 + y_test = [sent2labels(s) for s in sentencesTestData]
408 +
409 + # Fixed parameters
410 + # crf = sklearn_crfsuite.CRF(
411 + # algorithm='lbfgs',
412 + # c1=0.1,
413 + # c2=0.1,
414 + # max_iterations=100,
415 + # all_possible_transitions=True
416 + # )
417 +
418 + # Hyperparameter Optimization
419 + crf = sklearn_crfsuite.CRF(
420 + algorithm='lbfgs',
421 + max_iterations=100,
422 + all_possible_transitions=True
423 + )
424 + params_space = {
425 + 'c1': scipy.stats.expon(scale=0.5),
426 + 'c2': scipy.stats.expon(scale=0.05),
427 + }
428 +
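The two expon priors above are what RandomizedSearchCV samples the c1 (L1) and c2 (L2) regularization strengths from. A quick illustration of the candidate values they generate (random draws, shown only to convey their scale):

import scipy.stats
print(scipy.stats.expon(scale=0.5).rvs(size=5))   # c1 candidates, mean around 0.5
print(scipy.stats.expon(scale=0.05).rvs(size=5))  # c2 candidates, mean around 0.05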
429 + # Original: labels = list(crf.classes_)
430 + # Original: labels.remove('O')
431 + labels = list(['GENE'])
432 +
433 + # use the same metric for evaluation
434 + f1_scorer = make_scorer(metrics.flat_f1_score,
435 + average='weighted', labels=labels)
436 +
437 + # search
438 + rs = RandomizedSearchCV(crf, params_space,
439 + cv=10,
440 + verbose=3,
441 + n_jobs=-1,
442 + n_iter=20,
443 + # n_iter=50,
444 + scoring=f1_scorer)
445 + rs.fit(X_train, y_train)
446 +
447 + # Fixed parameters
448 + # crf.fit(X_train, y_train)
449 +
450 + # Best hyperparameters
451 + # crf = rs.best_estimator_
452 + nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
453 + options.filterSymbols) + '.txt')
454 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
455 + oFile.write("********** TRAINING AND TESTING REPORT **********\n")
456 + oFile.write("Training file: " + options.trainingFile + '\n')
457 + oFile.write('\n')
458 + oFile.write('best params:' + str(rs.best_params_) + '\n')
459 + oFile.write('best CV score:' + str(rs.best_score_) + '\n')
460 + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
461 +
462 + print("Training done in: %fs" % (time() - t0))
463 + t0 = time()
464 +
465 + # Update best crf
466 + crf = rs.best_estimator_
467 +
468 + # Saving model
469 + print(" Saving training model...")
470 + t1 = time()
471 + nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
472 + options.filterSymbols) + '.mod')
473 + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
474 + print(" Saving training model done in: %fs" % (time() - t1))
475 +
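Once dumped, the model can be reloaded in a later run to tag new sentences; a minimal sketch, assuming the same scikit-learn and sklearn-crfsuite versions used for training:

crf_loaded = joblib.load(os.path.join(options.outputPath, "models", nameModel))
y_new = crf_loaded.predict(X_test)  # per-sentence label sequences, as used below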
476 + # Evaluation against test data
477 + y_pred = crf.predict(X_test)
478 + print("*********************************")
479 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
480 + options.filterSymbols) + '.txt')
481 + with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
482 + for y in y_pred:
483 + oFile.write(str(y) + '\n')
484 +
485 + print("*********************************")
486 + name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
487 + options.filterSymbols) + '.txt')
488 + with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
489 + for y in y_test:
490 + oFile.write(str(y) + '\n')
491 +
492 + print("Prediction done in: %fs" % (time() - t0))
493 +
494 + # labels = list(crf.classes_)
495 + # labels.remove('O')
496 +
497 + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
498 + oFile.write('\n')
499 + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
500 + oFile.write('\n')
501 + # labels = list(crf.classes_)
502 + sorted_labels = sorted(
503 + labels,
504 + key=lambda name: (name[1:], name[0])
505 + )
506 + oFile.write(metrics.flat_classification_report(
507 + y_test, y_pred, labels=sorted_labels, digits=3
508 + ))
509 + oFile.write('\n')
510 +
511 + oFile.write("\nTop likely transitions:\n")
512 + print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
513 + oFile.write('\n')
514 +
515 + oFile.write("\nTop unlikely transitions:\n")
516 + print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
517 + oFile.write('\n')
518 +
519 + oFile.write("\nTop positive:\n")
520 + print_state_features(Counter(crf.state_features_).most_common(200), oFile)
521 + oFile.write('\n')
522 +
523 + oFile.write("\nTop negative:\n")
524 + print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
525 + oFile.write('\n')
@@ -428,7 +428,7 @@ if __name__ == "__main__":
428
429 # Original: labels = list(crf.classes_)
430 # Original: labels.remove('O')
431 - labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
431 + labels = list(['GENE'])
432
433 # use the same metric for evaluation
434 f1_scorer = make_scorer(metrics.flat_f1_score,
@@ -436,7 +436,7 @@ if __name__ == "__main__":
436
437 # search
438 rs = RandomizedSearchCV(crf, params_space,
439 - cv=3,
439 + cv=10,
440 verbose=3,
441 n_jobs=-1,
442 n_iter=20,