Showing 10 changed files with 301 additions and 7 deletions
data.json
deleted
100644 → 0
1 | -{"key1": "keyinfo", "key2": "keyinfo2"} | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
No preview for this file type
predict-annot/bin/tagging/tagging_2Nov.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +import re | ||
5 | +from pandas import DataFrame as DF | ||
6 | +from optparse import OptionParser | ||
7 | +from time import time | ||
8 | +from collections import Counter | ||
9 | + | ||
10 | +import nltk | ||
11 | +import sklearn | ||
12 | +import scipy.stats | ||
13 | +import sys | ||
14 | + | ||
15 | +import joblib | ||
16 | +from sklearn.metrics import make_scorer | ||
17 | +from sklearn.model_selection import cross_val_score | ||
18 | +from sklearn.model_selection import RandomizedSearchCV | ||
19 | + | ||
20 | +import sklearn_crfsuite | ||
21 | +from sklearn_crfsuite import scorers | ||
22 | +from sklearn_crfsuite import metrics | ||
23 | + | ||
24 | +from nltk.corpus import stopwords | ||
25 | + | ||
26 | +import training_validation_v14 as training | ||
27 | + | ||
28 | +import json | ||
29 | + | ||
30 | +#------------------------------------------------------------------------------- | ||
31 | +# Objective | ||
32 | +# Tagging transformed file with CRF model with sklearn-crfsuite. | ||
33 | +# | ||
34 | +# Input parameters | ||
35 | +# --inputPath=PATH Path of transformed files x|y|z | ||
36 | +# --outputPath Output path to place output files | ||
37 | +# --outputFileI Output tagged file I | ||
38 | +# --outputFileII Output tagged file II | ||
39 | +# --modelPath Path to CRF model | ||
40 | +# --modelName Model name | ||
41 | +# --infoPath Path of GSE-GSM index file | ||
42 | +# --infoFile GSE-GSM index file", | ||
43 | +# --variant Part of S2 variant | ||
44 | +# --S1 Inner word features set | ||
45 | +# --S2 Complete word features | ||
46 | +# --S3 Extended context features | ||
47 | +# --S4 Semantic features | ||
48 | +# --filteringStopWords Filtering stop words | ||
49 | +# --filterSymbols Filtering punctuation marks | ||
50 | + | ||
51 | +# Output | ||
52 | +# 1) Tagged files in transformed format | ||
53 | + | ||
54 | +# Examples | ||
55 | +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | ||
56 | +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | ||
57 | +# --outputFileI annot-input_bg_outputI.txt | ||
58 | +# --outputFileII annot-input_bg_outputII.txt | ||
59 | +# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models | ||
60 | +# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | ||
61 | +# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping | ||
62 | +# --infoFile bg_sentences_midx.txt | ||
63 | +# --variant 13 | ||
64 | + | ||
65 | +#Examples | ||
66 | +#predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/annot-input_bg_report_v4.txt > predict-annot/reports/output_tagging_report_v5.txt | ||
67 | +__author__ = 'egaytan' | ||
68 | + | ||
69 | +########################################## | ||
70 | +# MAIN PROGRAM # | ||
71 | +########################################## | ||
72 | + | ||
if __name__ == "__main__":
    # Tag a transformed corpus with a pre-trained CRF model (sklearn-crfsuite)
    # and write three tagged output formats plus per-token marginal probabilities.

    ########################################### Defining parameters ##########################################
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
    parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
    parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
    parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
    parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
    parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
    parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message to stderr and exits with status 2,
        # so the original's unreachable sys.exit(1) after it was removed.
        parser.error("Unexpected positional arguments given.")

    ########################################### DISP PARAMETERS ##########################################
    print('-------------------------------- PARAMETERS --------------------------------')

    print("--inputPath       Path of training data set        : " + str(options.inputPath))
    print("--outputPath      Output path to place output files: " + str(options.outputPath))
    print("--outputFileI     Output tagged file I             : " + str(options.outFileI))
    print("--outputFileII    Output tagged file II            : " + str(options.outFileII))
    # BUGFIX: this label originally read "--outputFileII" for the third file.
    print("--outputFileIII   Output tagged file III           : " + str(options.outFileIII))
    print("--modelPath       Path to read CRF model           : " + str(options.modelPath))
    print("--modelName       Model name                       : " + str(options.modelName))
    print("--infoPath        Path of GSE-GSM index file       : " + str(options.infoPath))
    print("--infoFile        GSE-GSM index file               : " + str(options.idx))
    print("--variant         Run variant                      : " + str(options.variant))
    print("--S1              Inner word features set          : " + str(options.S1))
    print("--S2              Complete word features           : " + str(options.S2))
    print("--S3              Extended context features        : " + str(options.S3))
    print("--S4              Semantic features                : " + str(options.S4))

    ########################################### PROCESSING ##########################################
    print('-------------------------------- PROCESSING --------------------------------')

    # Load the index mapping each sentence to its GSE-GSM identifiers.
    # A context manager guarantees the handle is closed (original leaked it).
    with open(os.path.join(options.infoPath, options.idx), "r") as infoFile:
        idx = infoFile.readlines()

    ########################################### Read CRF model ##########################################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    ########################################### Reading sentences ##########################################
    print('Processing corpus...')
    t0 = time()
    # Entity labels the model may emit; 'O' (outside) is deliberately excluded.
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp',
              'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']

    ########################################### Preprocessing ###########################################
    # Walk the input directory. NOTE(review): after the walk, `lines` and
    # `X_input` hold the contents of the LAST file visited, and only that file
    # is tagged below — this matches the original behavior (single input file).
    for path, dirs, files in os.walk(options.inputPath):
        for file in files:
            print("Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputDataI = []

            # Each input line is one sentence of word|lemma|pos tokens.
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                sentencesInputData = [line.strip('\n').split() for line in lines]

            # Feature extraction is delegated to the shared training module.
            X_input = [training.sent2features(s, options.S1, options.S2,
                                              options.S3, options.S4, options.variant)
                       for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

    ########################################### Predicting tags ###########################################
    t1 = time()
    print("Predicting tags with model...")
    y_pred = crf.predict(X_input)
    print("Prediction done in: %fs" % (time() - t1))

    ########################################### Tagging with CRF model ###########################################
    print("Tagging file...")
    lidx = 0
    for line, tagLine in zip(lines, y_pred):

        # Unique entity tags present in this sentence.
        Ltags = set(labels).intersection(set(tagLine))

        # Skip sentences with no entity tag. BUGFIX: the original compared
        # Ltags == {'O'}, which can never be true because 'O' is not in
        # `labels`, so the skip never fired. The index counter must still
        # advance so idx stays aligned with `lines`.
        if not Ltags:
            lidx += 1
            continue
        line = line.strip('\n')

        # Tagged sentence under construction.
        outputLine = ''

        # Per-sentence word counter.
        i = 0

        # Special case: one-word sentences.
        if len(tagLine) == 1:
            if tagLine[0] in labels:
                # Wrap the single word with opening/closing tag signatures.
                start = '<' + tagLine[0] + '> '
                end = '</' + tagLine[0] + '>'
                word = line.split('|')[0] + ' '
                outputLine = start + word + end
            else:
                outputLine = line.split(' ')[0]
            # Save Output I row and move to the next sentence.
            sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
            lidx += 1
            continue

        # Tag emitted for the previous word ("tag behind").
        tb = 'O'

        # Tagging sentences
        for word, tag in zip(line.split(' '), tagLine):

            # Opening a new entity span.
            if tag in labels and tb != tag:

                # Close a still-open span of a different entity first.
                if tb in labels and outputLine[-2:] != '> ':
                    outputLine += '</' + tb + '> '

                # Start the new span and emit the surface word.
                outputLine += '<' + tag + '> '
                outputLine += word.split('|')[0] + ' '

                # Close immediately for a single-word span: the next tag
                # differs, or this is the last word of the sentence.
                # (Replaces the original bare try/except on tagLine[i+1].)
                if i + 1 == len(tagLine) or tag != tagLine[i + 1]:
                    outputLine += '</' + tag + '> '

                i += 1
                tb = tag
                continue

            # Continuing an open entity span (tb == tag).
            elif tag in labels:

                # Sentence ends inside the span: emit word, then close.
                if i + 1 == len(tagLine):
                    outputLine += word.split('|')[0] + ' '
                    outputLine += '</' + tag + '> '
                    i += 1
                    tb = tag
                    continue

                # Span ends at this word: emit word, then close.
                elif tag != tagLine[i + 1]:
                    outputLine += word.split('|')[0] + ' '
                    outputLine += '</' + tag + '> '
                    i += 1
                    tb = tag
                    continue

            # Plain word (or mid-span continuation falling through):
            # close a dangling span left open by the previous word if needed.
            if tb != tag and tb in labels and outputLine[-2:] != '> ':
                outputLine += '</' + tb + '> '

            # Emit the surface word and remember this tag.
            outputLine += word.split('|')[0] + ' '
            i += 1
            tb = tag

        # Save Output I row: GSE-GSM index \t tagged sentence \t tag list.
        sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
        lidx += 1

    ########################################### Save Output I ##########################################
    print("Saving Ouput I...")
    with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
        for line in sentencesOutputDataI:
            # Keep only sentences that actually contain a closed tag.
            if re.findall('</', line):
                # Restore bracket tokens from the tokenizer encoding.
                oline = line.replace('-LRB-', '(')
                oline = oline.replace('-RRB-', ')')
                oFileI.write(oline + '\n')

    ########################################### Save Output II ##########################################
    print("Saving Ouput II...")
    with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
        for line in sentencesOutputDataI:
            oline = line.replace('-LRB-', '(')
            oline = oline.replace('-RRB-', ')')
            # One row per tagged span: index columns, span text, span tag.
            for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                lline = oline.split('\t')[0:-2] + [ttex, tag]
                nline = '\t'.join(lline)
                oFileII.write(nline + '\n')

    ########################################### Save Output III ##########################################
    print("Saving Ouput III...")
    with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
        for line, tagLine in zip(lines, y_pred):
            # BUGFIX: the second replace originally repeated '-LRB-', so right
            # brackets were never restored; it now maps '-RRB-' to ')'.
            oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag
                     for w, tag in zip(line.split(' '), tagLine)]
            oFileIII.write(' '.join(oline) + '\n')

    ########################################### Save Probs ##########################################
    # Marginal probability distribution over tags for every token.
    y_probs = crf.predict_marginals(X_input)
    # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
    with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
        json.dump(y_probs, fp)
    print("Passing corpus done in: %fs" % (time() - t0))
This file is too large to display.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 | -------------------------------- PARAMETERS -------------------------------- | 1 | -------------------------------- PARAMETERS -------------------------------- |
2 | --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | 2 | --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ |
3 | --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | 3 | --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ |
4 | ---outputFileI Output tagged file I : annot-input_bg_outputI_v6 | 4 | +--outputFileI Output tagged file I : annot-input_bg_outputI_v7 |
5 | ---outputFileII Output tagged file II : annot-input_bg_outputII_v6 | 5 | +--outputFileII Output tagged file II : annot-input_bg_outputII_v7 |
6 | ---outputFileII Output tagged file III : annot-input_bg_outputIII_v6 | 6 | +--outputFileII Output tagged file III : annot-input_bg_outputIII_v7 |
7 | --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models | 7 | --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models |
8 | --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | 8 | --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 |
9 | --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping | 9 | --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping |
... | @@ -15,14 +15,14 @@ | ... | @@ -15,14 +15,14 @@ |
15 | --S4 Semantic features : False | 15 | --S4 Semantic features : False |
16 | -------------------------------- PROCESSING -------------------------------- | 16 | -------------------------------- PROCESSING -------------------------------- |
17 | Reading CRF model... | 17 | Reading CRF model... |
18 | -Reading CRF model done in: 0.009524s | 18 | +Reading CRF model done in: 0.009408s |
19 | Processing corpus... | 19 | Processing corpus... |
20 | Preprocessing file...annot-input_bg_v4.txt | 20 | Preprocessing file...annot-input_bg_v4.txt |
21 | Sentences input data: 90688 | 21 | Sentences input data: 90688 |
22 | Predicting tags with model... | 22 | Predicting tags with model... |
23 | -Prediction done in: 27.458162s | 23 | +Prediction done in: 27.324524s |
24 | Tagging file... | 24 | Tagging file... |
25 | Saving Ouput I... | 25 | Saving Ouput I... |
26 | Saving Ouput II... | 26 | Saving Ouput II... |
27 | Saving Ouput III... | 27 | Saving Ouput III... |
28 | -Passing corpus done in: 257.970281s | 28 | +Passing corpus done in: 261.721646s | ... | ... |
-
Please register or login to post a comment