Estefani Gaytan Nunez

upload

1 -{"key1": "keyinfo", "key2": "keyinfo2"}
\ No newline at end of file
1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +import re
5 +from pandas import DataFrame as DF
6 +from optparse import OptionParser
7 +from time import time
8 +from collections import Counter
9 +
10 +import nltk
11 +import sklearn
12 +import scipy.stats
13 +import sys
14 +
15 +import joblib
16 +from sklearn.metrics import make_scorer
17 +from sklearn.model_selection import cross_val_score
18 +from sklearn.model_selection import RandomizedSearchCV
19 +
20 +import sklearn_crfsuite
21 +from sklearn_crfsuite import scorers
22 +from sklearn_crfsuite import metrics
23 +
24 +from nltk.corpus import stopwords
25 +
26 +import training_validation_v14 as training
27 +
28 +import json
29 +
30 +#-------------------------------------------------------------------------------
31 +# Objective
32 +# Tagging transformed files with a CRF model using sklearn-crfsuite.
33 +#
34 +# Input parameters
35 +# --inputPath=PATH Path of transformed files x|y|z
36 +# --outputPath Output path to place output files
37 +# --outputFileI Output tagged file I
38 +# --outputFileII Output tagged file II
39 +# --modelPath Path to CRF model
40 +# --modelName Model name
41 +# --infoPath Path of GSE-GSM index file
42 +# --infoFile GSE-GSM index file
43 +# --variant Part of S2 variant
44 +# --S1 Inner word features set
45 +# --S2 Complete word features
46 +# --S3 Extended context features
47 +# --S4 Semantic features
48 +# --filteringStopWords Filtering stop words
49 +# --filterSymbols Filtering punctuation marks
50 +
51 +# Output
52 +# 1) Tagged files in transformed format
53 +
54 +# Examples
55 +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
56 +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
57 +# --outputFileI annot-input_bg_outputI.txt
58 +# --outputFileII annot-input_bg_outputII.txt
59 +# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
60 +# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
61 +# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
62 +# --infoFile bg_sentences_midx.txt
63 +# --variant 13
64 +
65 +# Example command line
66 +#predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/annot-input_bg_report_v4.txt > predict-annot/reports/output_tagging_report_v5.txt
67 +__author__ = 'egaytan'
68 +
69 +##########################################
70 +# MAIN PROGRAM #
71 +##########################################
72 +
73 +if __name__ == "__main__":
74 + ########################################### Defining parameters ##########################################
75 + parser = OptionParser()
76 + parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
77 + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
78 + parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
79 + parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
80 + parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
81 + parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
82 + parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
83 + parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
84 + parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
85 + parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
86 + parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
87 + parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
88 + parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
89 + parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
90 +
91 + (options, args) = parser.parse_args()
92 + if len(args) > 0:
93 + parser.error("Unexpected arguments given.")
94 + sys.exit(1)
95 +
96 + ########################################### DISP PARAMETERS ##########################################
97 + print('-------------------------------- PARAMETERS --------------------------------')
98 +
99 + print("--inputPath Path of training data set : " + str(options.inputPath ))
100 + print("--outputPath Output path to place output files: " + str(options.outputPath ))
101 + print("--outputFileI Output tagged file I : " + str(options.outFileI ))
102 + print("--outputFileII Output tagged file II : " + str(options.outFileII ))
103 + print("--outputFileII Output tagged file III : " + str(options.outFileIII ))
104 + print("--modelPath Path to read CRF model : " + str(options.modelPath ))
105 + print("--modelName Model name : " + str(options.modelName ))
106 + print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
107 + print("--infoFile GSE-GSM index file : " + str(options.idx ))
108 + print("--variant Run variant : " + str(options.variant ))
109 + print("--S1 Inner word features set : " + str(options.S1 ))
110 + print("--S2 Complete word features : " + str(options.S2 ))
111 + print("--S3 Extended context features : " + str(options.S3 ))
112 + print("--S4 Semantic features : " + str(options.S4 ))
113 +
114 + ########################################### PROCESSING ##########################################
115 + print('-------------------------------- PROCESSING --------------------------------')
116 +
117 + # Load index mapping GSE file information
118 + idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
119 +
120 + ########################################### Read CRF model ##########################################
121 + t0 = time()
122 + print('Reading CRF model...')
123 + crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
124 + print("Reading CRF model done in: %fs" % (time() - t0))
125 +
126 + ########################################### Reading sentences ##########################################
127 + print('Processing corpus...')
128 + t0 = time()
129 + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])
130 +
131 + ########################################### Preprocessing ###########################################
132 + # Walk directory to read files
133 + for path, dirs, files in os.walk(options.inputPath):
134 + # For each file in dir
135 + for file in files:
136 + print("Preprocessing file..." + str(file))
137 + sentencesInputData = []
138 + sentencesOutputDataI = []
139 +
140 + # Preprocessing input sentences
141 + with open(os.path.join(options.inputPath, file), "r") as iFile:
142 + lines = iFile.readlines()
143 + sentencesInputData = [ line.strip('\n').split() for line in lines]
144 + # Save input sentences
145 + X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
146 + print("Sentences input data: " + str(len(sentencesInputData)))
147 +
148 + ########################################### Predicting tags ###########################################
149 +
150 + t1 = time()
151 + print("Predicting tags with model...")
152 + y_pred = crf.predict(X_input)
153 + print("Prediction done in: %fs" % (time() - t1))
154 +
155 + ########################################### Tagging with CRF model ###########################################
156 +
157 + print("Tagging file...")
158 + lidx = 0
159 + for line, tagLine in zip(lines, y_pred):
160 +
161 + # get unique tags
162 + Ltags = set(labels).intersection(set(tagLine))
163 +
164 + # Skip untagged sentence
165 + if not Ltags: lidx += 1; continue
166 + line = line.strip('\n')
167 +
168 + # start an empty sentence
169 + outputLine = ''
170 +
171 + # per sentence word count
172 + i = 0
173 +
174 + # Exception for one word sentences
175 + if len(tagLine) == 1:
176 + if tagLine[0] in labels:
177 +
178 + # add start tagging signature
179 + start = '<' + tagLine[0] + '> '
180 +
181 + # add end tagging signature
182 + end = '</' + tagLine[0] + '>'
183 + word = line.split('|')[0] + ' '
184 +
185 + # save output tagged sentence
186 + outputLine = start + word + end
187 + else:
188 + outputLine = line.split(' ')[0]
189 + # Saving Sentence Output I
190 + #print(outputLine)
191 + sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
192 + # Increase sentence counter
193 + lidx += 1
194 + # Continue with the next sentence
195 + continue
196 +
197 + # tag behind
198 + tb = 'O'
199 +
200 + # Tagging sentences
201 + for word,tag in zip(line.split(' '), tagLine):
202 +
203 + # general start tagging
204 + if tag in labels and tb != tag:
205 +
206 + # close any still-open tag before starting a new one
207 + if tb in labels and outputLine[-2:] != '> ':
208 + # close the previous tag
209 + outputLine += '</' + tb + '> '
210 +
211 + # start new tagging
212 + outputLine += '<' + tag + '> '
213 + outputLine += word.split('|')[0] + ' '
214 +
215 + # check single word tagging case
216 + try:
217 + # close the tag when the next word carries a different tag
218 + if tag != tagLine[i+1]: outputLine += '</' + tag + '> '
219 + except IndexError:
220 + # close the tag when this is the last word of the sentence
221 + if i+1==len(tagLine): outputLine += '</' + tag + '> '
222 +
223 + i += 1
224 + tb = tag
225 + continue
226 +
227 +
228 + # general close tagging
229 + elif tag in labels:
230 +
231 + # check end sentence case
232 + if i+1==len(tagLine):
233 + outputLine += word.split('|')[0] + ' '
234 + outputLine += '</' + tag + '> '
235 + i += 1
236 + tb = tag
237 + continue
238 +
239 + # close tagging
240 + elif tag!=tagLine[i+1]:
241 + outputLine += word.split('|')[0] + ' '
242 + outputLine += '</' + tag + '> '
243 + i += 1
244 + tb = tag
245 + continue
246 +
247 + # check the last closed
248 + if tb != tag and tb in labels and outputLine[-2:] != '> ':
249 + outputLine += '</' + tb + '> '
250 +
251 + # add word
252 + outputLine += word.split('|')[0] + ' '
253 + i += 1
254 + # save the previous tag
255 + tb = tag
256 +
257 + # Saving Sentence Output I
258 + sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine+ '\t' +', '.join(Ltags))
259 + lidx += 1
260 +
261 + ########################################### Save Output I ##########################################
262 + print("Saving Ouput I...")
263 + with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
264 + for line in sentencesOutputDataI:
265 + if re.findall('</', line):
266 + #print(line)
267 + oline = line.replace('-LRB-','(')
268 + oline = oline.replace('-RRB-',')')
269 + oFileI.write(oline + '\n')
270 +
271 + ########################################### Save Output II ##########################################
272 + print("Saving Ouput II...")
273 + with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
274 + for line in sentencesOutputDataI:
275 + oline = line.replace('-LRB-','(')
276 + oline = oline.replace('-RRB-',')')
277 + for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
278 + lline = oline.split('\t')[0:-2] + [ttex, tag]
279 + nline = '\t'.join(lline)
280 + oFileII.write(nline + '\n')
281 +
282 + ########################################### Save Output III ##########################################
283 + print("Saving Ouput III...")
284 + with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
285 + for line, tagLine in zip(lines, y_pred):
286 + oline = [ w.split('|')[0].replace('-LRB-','(').replace('-RRB-',')')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]
287 +
288 + oFileIII.write(' '.join(oline) + '\n')
289 +
290 + ########################################### Save Probs ##########################################
291 + y_probs = crf.predict_marginals(X_input)
292 + # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
293 + with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
294 + json.dump(y_probs, fp)
295 + print("Passing corpus done in: %fs" % (time() - t0))
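The inline-tagged sentences written to Output I can be turned back into (text, label) pairs with the same regular expression the script applies when building Output II. A small sketch with a made-up Output-I-style line (the GSE/GSM fields and the sentence are illustrative, not real data):

import re

# Same pattern used in the Output II step above: captures tagged text and its label.
tag_re = re.compile(r'<[^>]+>([^<]+)</([^>]+)>')

line = 'GSE0000\tGSM0000\t<Gtype> wild type </Gtype> grown in <Med> LB </Med> \tGtype, Med'

for text, tag in tag_re.findall(line):
    print(tag, '->', text.strip())
# Gtype -> wild type
# Med -> LB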
1 -------------------------------- PARAMETERS --------------------------------
2 --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
4 ---outputFileI Output tagged file I : annot-input_bg_outputI_v6
4 +--outputFileI Output tagged file I : annot-input_bg_outputI_v7
5 ---outputFileII Output tagged file II : annot-input_bg_outputII_v6
5 +--outputFileII Output tagged file II : annot-input_bg_outputII_v7
6 ---outputFileIII Output tagged file III : annot-input_bg_outputIII_v6
6 +--outputFileIII Output tagged file III : annot-input_bg_outputIII_v7
7 --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
8 --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
9 --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
@@ -15,14 +15,14 @@
15 --S4 Semantic features : False
16 -------------------------------- PROCESSING --------------------------------
17 Reading CRF model...
18 -Reading CRF model done in: 0.009524s
18 +Reading CRF model done in: 0.009408s
19 Processing corpus...
20 Preprocessing file...annot-input_bg_v4.txt
21 Sentences input data: 90688
22 Predicting tags with model...
23 -Prediction done in: 27.458162s
23 +Prediction done in: 27.324524s
24 Tagging file...
25 Saving Output I...
26 Saving Output II...
27 Saving Output III...
28 -Processing corpus done in: 257.970281s
28 +Processing corpus done in: 261.721646s
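The run above also writes crf_probs.json via crf.predict_marginals, i.e. one dictionary of label probabilities per token. A minimal sketch of inspecting that file, assuming that structure and the output path shown in the parameters:

import json
import os

# crf_probs.json is written into --outputPath by the script above.
probs_path = os.path.join('/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output', 'crf_probs.json')

with open(probs_path) as fp:
    y_probs = json.load(fp)  # sentences -> tokens -> {label: probability}

# Most likely label for each token of the first sentence.
for token_probs in y_probs[0]:
    best = max(token_probs, key=token_probs.get)
    print(best, round(token_probs[best], 3))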