Showing 1 changed file with 280 additions and 0 deletions
tagging_Sklearn_crfsuite.py
0 → 100644
# -*- coding: UTF-8 -*-

import os
import sys
from optparse import OptionParser
from time import time

from sklearn.externals import joblib

import sklearn_crfsuite

from nltk.corpus import stopwords
from trainingTesting_Sklearn_crfsuite import word2features
from trainingTesting_Sklearn_crfsuite import sent2features
# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
# from trainingTesting_Sklearn_crfsuite import hasDigit
# Objective
# Tag transformed files with a CRF model trained with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH      Path of transformed files (one sentence per line, tokens as word|lemma|tag)
# --modelPath           Path to read CRF model
# --modelName           Model name
# --outputPath=PATH     Output path to place output files
# --filterStopWords     Filter stop words
# --filterSymbols       Filter punctuation marks

# Output
# 1) Tagged files in transformed format

# Examples
# Sentences
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
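
# Expected input line format (hypothetical example; one sentence per line,
# space-separated tokens, each token as word|lemma|tag):
# The|the|DT FhlA|fhla|TF protein|protein|NN activates|activate|VBZ ...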

#################################
#           FUNCTIONS           #
#################################
def word2features(sent, i):
    # Note: this local definition shadows the word2features imported above.
    listElem = sent[i].split('|')
    word = listElem[0]
    lemma = listElem[1]
    postag = listElem[2]

    features = {
        # Suffixes
        #'word[-3:]': word[-3:],
        #'word[-2:]': word[-2:],
        #'word[-1:]': word[-1:],
        #'word.isupper()': word.isupper(),
        #'word': word,
        #'lemma': lemma,
        #'postag': postag,
        'lemma[-3:]': lemma[-3:],
        'lemma[-2:]': lemma[-2:],
        'lemma[-1:]': lemma[-1:],
        # Prefixes
        'lemma[+3:]': lemma[:3],
        'lemma[+2:]': lemma[:2],
        'lemma[+1:]': lemma[:1],
        #'word[:3]': word[:3],
        #'word[:2]': word[:2],
        #'word[:1]': word[:1],
        #'endsConLow()={}'.format(endsConLow(word)): endsConLow(word),
    }
    if i > 0:
        listElem = sent[i - 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'-1:word': word1,
            '-1:lemma': lemma1,
            '-1:postag': postag1,
        })

    if i < len(sent) - 1:
        listElem = sent[i + 1].split('|')
        word1 = listElem[0]
        lemma1 = listElem[1]
        postag1 = listElem[2]
        features.update({
            #'+1:word': word1,
            '+1:lemma': lemma1,
            '+1:postag': postag1,
        })

    '''
    if i > 1:
        listElem = sent[i - 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '-2:word': word2,
            '-2:lemma': lemma2,
        })

    if i < len(sent) - 2:
        listElem = sent[i + 2].split('|')
        word2 = listElem[0]
        lemma2 = listElem[1]
        postag2 = listElem[2]
        features.update({
            '+2:word': word2,
            '+2:lemma': lemma2,
        })

    trigrams = False
    if trigrams:
        if i > 2:
            listElem = sent[i - 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '-3:word': word3,
                '-3:lemma': lemma3,
            })

        if i < len(sent) - 3:
            listElem = sent[i + 3].split('|')
            word3 = listElem[0]
            lemma3 = listElem[1]
            postag3 = listElem[2]
            features.update({
                '+3:word': word3,
                '+3:lemma': lemma3,
            })
    '''
    return features

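# The companion module presumably defines sent2features as the standard
# sklearn-crfsuite helper. A local sketch is given here so that the local
# word2features above is the one actually applied; the imported sent2features
# would otherwise call the companion module's own word2features.
def sent2features(sent):
    # Build one feature dict per token of the word|lemma|tag sequence.
    return [word2features(sent, i) for i in range(len(sent))]

# Illustrative output (hypothetical input):
# sent = ['Transcription|transcription|NN', 'factor|factor|NN']
# word2features(sent, 0) ->
# {'lemma[-3:]': 'ion', 'lemma[-2:]': 'on', 'lemma[-1:]': 'n',
#  'lemma[+3:]': 'tra', 'lemma[+2:]': 'tr', 'lemma[+1:]': 't',
#  '+1:lemma': 'factor', '+1:postag': 'NN'}
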
__author__ = 'CMendezC'

##########################################
#              MAIN PROGRAM              #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of transformed input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--modelPath", dest="modelPath",
                      help="Path to read CRF model",
                      metavar="PATH")
    parser.add_option("--modelName", dest="modelName",
                      help="Model name", metavar="TEXT")
    parser.add_option("--filterStopWords", default=False,
                      action="store_true", dest="filterStopWords",
                      help="Filter stop words")
    parser.add_option("--filterSymbols", default=False,
                      action="store_true", dest="filterSymbols",
                      help="Filter punctuation marks")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        parser.error("No positional arguments expected.")
        sys.exit(1)

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + options.inputPath)
    print("Model name: " + str(options.modelName))
    print("Model path: " + options.modelPath)
    print("Path to place output files: " + options.outputPath)
    print("Filtering stop words: " + str(options.filterStopWords))
    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
    # symbols = [sym.decode('utf-8') for sym in ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
    #            '}', '[', ']', '*', '%', '$', '#', '&', '°']]
    # symbols = [u'.', u',', u':', u';', u'?', u'!', u'\'', u'"', u'<', u'>', u'(', u')', u'-', u'_', u'/', u'\\', u'¿', u'¡', u'+', u'{',
    #            u'}', u'[', u']', u'*', u'%', u'$', u'#', u'&', u'°', u'`']
    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))

    print('-------------------------------- PROCESSING --------------------------------')

    # English stop words, decoded once up front (named stopWords to avoid
    # shadowing the imported nltk stopwords module)
    stopWords = [word.decode('utf-8') for word in stopwords.words('english')]

    # Read CRF model
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))
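
    # The model file is assumed to have been persisted by the training script
    # with joblib, e.g. (hypothetical, mirroring the load call above):
    #   joblib.dump(crf, os.path.join(modelPath, 'models', modelName + '.mod'))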

    print('Processing corpus...')
    t0 = time()
    # Labels assigned by the CRF model (used below to decide when to overwrite a tag)
    labels = ['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO']
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("   Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputData = []
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                for line in lines:
                    listLine = []
                    # line = line.decode("utf-8")
                    for token in line.strip('\n').split():
                        if options.filterStopWords:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            # Original: if lemma in stopwords.words('english'):
                            if lemma in stopWords:
                                continue
                        if options.filterSymbols:
                            listToken = token.split('|')
                            lemma = listToken[1]
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Comma , identified")
                                continue
                        listLine.append(token)
                    sentencesInputData.append(listLine)
                print("   Sentences input data: " + str(len(sentencesInputData)))
                # print sentencesInputData[0]
                # print(sent2features(sentencesInputData[0])[0])
                # print(sent2labels(sentencesInputData[0]))
                X_input = [sent2features(s) for s in sentencesInputData]
                print(sent2features(sentencesInputData[0])[0])
                # y_test = [sent2labels(s) for s in sentencesInputData]
                # Predicting tags
                t1 = time()
                print("   Predicting tags with model")
                y_pred = crf.predict(X_input)
                #print y_pred[0]
                print("   Prediction done in: %fs" % (time() - t1))

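                # y_pred holds one predicted tag sequence per input sentence,
                # aligned with X_input, e.g. (illustrative labels only):
                #   [['O', 'TF', 'O'], ['O', 'O', 'PRO', 'O'], ...]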
                # Tagging with CRF model
                print("   Tagging file")
                for line, tagLine in zip(lines, y_pred):
                    outputLine = ''
                    idx_tagLine = 0
                    line = line.strip('\n')
                    print("\nLine: " + str(line))
                    print("CRF tagged line: " + str(tagLine))
                    for token in line.split():
                        listToken = token.split('|')
                        word = listToken[0]
                        lemma = listToken[1]
                        tag = listToken[2]
                        if options.filterStopWords:
                            if lemma in stopWords:
                                outputLine += token + ' '
                                continue
                        if options.filterSymbols:
                            if lemma in symbols:
                                if lemma == ',':
                                    print("Comma , identified")
                                outputLine += token + ' '
                                continue
                        CRFtag = tagLine[idx_tagLine]
                        # Replace the original tag with the CRF prediction when the
                        # token is not already labeled and the CRF proposes a label
                        if (tag not in labels) and (CRFtag != 'O'):
                            print("*** CRF change token {} to {}".format(token, CRFtag))
                            outputLine += word + '|' + lemma + '|' + CRFtag + ' '
                        else:
                            outputLine += word + '|' + lemma + '|' + tag + ' '
                        idx_tagLine += 1
                    sentencesOutputData.append(outputLine.rstrip())
            with open(os.path.join(options.outputPath, file), "w") as oFile:
                for line in sentencesOutputData:
                    oFile.write(line + '\n')

    print("Processing corpus done in: %fs" % (time() - t0))