Showing
9 changed files
with
271 additions
and
40 deletions
# -*- coding: UTF-8 -*-

import os
import re
from pandas import DataFrame as DF
from optparse import OptionParser
from time import time
from collections import Counter

import nltk
import sklearn
import scipy.stats
import sys

import joblib
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords

import training_validation_v14 as training

import json

#-------------------------------------------------------------------------------
# Objective
#   Tag a transformed (word|lemma|POS) corpus with a pre-trained CRF model
#   (sklearn-crfsuite), producing three tagged output files plus a JSON dump
#   of the per-token marginal probabilities.
#
# Input parameters
#   --inputPath=PATH     Path of transformed files x|y|z
#   --outputPath         Output path to place output files
#   --outputFileI        Output tagged file I
#   --outputFileII       Output tagged file II
#   --outputFileIII      Output tagged file III
#   --modelPath          Path to CRF model
#   --modelName          Model name
#   --infoPath           Path of GSE-GSM index file
#   --infoFile           GSE-GSM index file
#   --variant            Part of S2 variant
#   --S1                 Inner word features set
#   --S2                 Complete word features
#   --S3                 Extended context features
#   --S4                 Semantic features
#
# Output
#   1) Tagged files in transformed format
#
# Example
#   python tagging.py --inputPath .../predict-annot/input/
#       --outputPath .../predict-annot/output/
#       --outputFileI annot-input_bg_outputI_v5.txt
#       --outputFileII annot-input_bg_outputII_v5
#       --outputFileIII annot-input_bg_outputIII_v5
#       --modelPath .../CRF/models
#       --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
#       --infoPath .../predict-annot/mapping
#       --infoFile bg_sentences_midx_v4.txt
#       --variant 10 --S2
__author__ = 'egaytan'

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    ########################################### Defining parameters ##########################################
    parser = OptionParser()
    parser.add_option("--inputPath",     dest="inputPath",  help="Path of training data set", metavar="PATH")
    parser.add_option("--outputPath",    dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI",   dest="outFileI",   help="Output tagged file I", metavar="FILE")
    parser.add_option("--outputFileII",  dest="outFileII",  help="Output tagged file II", metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III", metavar="FILE")
    parser.add_option("--modelPath",     dest="modelPath",  help="Path to read CRF model", metavar="PATH")
    parser.add_option("--modelName",     dest="modelName",  help="Model name", metavar="TEXT")
    parser.add_option("--infoPath",      dest="infoPath",   help="Path of GSE-GSM index file", metavar="PATH")
    parser.add_option("--infoFile",      dest="idx",        help="GSE-GSM index file", metavar="FILE")
    parser.add_option("--variant",       dest="variant",    help="Run variant", metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features",       action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features",    action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features",         action="store_true", default=False)

    (options, args) = parser.parse_args()
    if args:
        # parser.error() prints the message and exits with a non-zero status,
        # so no extra sys.exit() is needed.
        parser.error("Any parameter given.")

    ########################################### DISP PARAMETERS ##########################################
    print('-------------------------------- PARAMETERS --------------------------------')

    print("--inputPath      Path of training data set        : " + str(options.inputPath ))
    print("--outputPath     Output path to place output files: " + str(options.outputPath))
    print("--outputFileI    Output tagged file I             : " + str(options.outFileI  ))
    print("--outputFileII   Output tagged file II            : " + str(options.outFileII ))
    # BUGFIX: this line previously echoed the label "--outputFileII" for the
    # outputFileIII value.
    print("--outputFileIII  Output tagged file III           : " + str(options.outFileIII))
    print("--modelPath      Path to read CRF model           : " + str(options.modelPath ))
    print("--modelName      Model name                       : " + str(options.modelName ))
    print("--infoPath       Path of GSE-GSM index file       : " + str(options.infoPath  ))
    print("--infoFile       GSE-GSM index file               : " + str(options.idx       ))
    print("--variant        Run variant                      : " + str(options.variant   ))
    print("--S1             Inner word features set          : " + str(options.S1        ))
    print("--S2             Complete word features           : " + str(options.S2        ))
    print("--S3             Extended context features        : " + str(options.S3        ))
    print("--S4             Semantic features                : " + str(options.S4        ))

    ########################################### PROCESSING ##########################################
    print('-------------------------------- PROCESSING --------------------------------')

    # Read the index mapping each sentence line to its GSE-GSM source record.
    # (Use a context manager so the handle is closed deterministically.)
    with open(os.path.join(options.infoPath, options.idx), "r") as idxFile:
        idx = idxFile.readlines()

    ########################################### Read CRF model ##########################################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    ########################################### Reading sentences ##########################################
    print('Processing corpus...')
    t0 = time()
    # Entity labels the model may assign; any other predicted tag (e.g. 'O')
    # means the token is untagged.
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp',
              'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']

    ########################################### Preprocessing ###########################################
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir ('fname' avoids shadowing the builtin 'file')
        for fname in files:
            print("Preprocessing file..." + str(fname))
            sentencesInputData = []
            sentencesOutputDataI = []
            # Preprocessing input sentences
            with open(os.path.join(options.inputPath, fname), "r") as iFile:
                lines = iFile.readlines()
                sentencesInputData = [line.strip('\n').split() for line in lines]
            # Build the feature representation for every sentence.
            X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant)
                       for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

            ########################################### Predicting tags ###########################################
            t1 = time()
            print("Predicting tags with model...")
            y_pred = crf.predict(X_input)
            print("Prediction done in: %fs" % (time() - t1))

            ########################################### Tagging with CRF model ###########################################
            print("Tagging file...")
            lidx = 0
            for line, tagLine in zip(lines, y_pred):
                # Unique entity tags present in this sentence.
                Ltags = set(labels).intersection(set(tagLine))
                # Skip sentences with no entity tag at all.
                # BUGFIX: the intersection with `labels` can never contain 'O',
                # so the original test `Ltags == {'O'}` never skipped anything.
                # The index counter must still advance to stay aligned with idx.
                if not Ltags:
                    lidx += 1
                    continue
                line = line.strip('\n')
                outputLine = ''   # tagged sentence under construction
                tb = 'O'          # tag of the previous token ("tag behind")
                i = 0             # token index within the sentence
                # Exception for one-word sentences
                if len(tagLine) == 1:
                    if tagLine[0] in labels:
                        # add start tagging signature
                        start = '<' + tagLine[0] + '> '
                        # add end tagging signature
                        end = '</' + tagLine[0] + '>'
                        word = line.split('|')[0] + ' '
                        # save output tagged sentence
                        outputLine = start + word + end
                    else:
                        outputLine = line.split(' ')[0]
                    # Saving Sentence Output I
                    sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                    # Increase sentence counter and continue with next sentence
                    lidx += 1
                    continue
                # Tagging sentences: walk tokens and open/close tag spans.
                for word, tag in zip(line.split(' '), tagLine):
                    # start tagging: a new span begins
                    if tag in labels and tb != tag:
                        # Close any span still open from the previous token.
                        # BUGFIX: the original emitted an *opening* '<tb>' here
                        # instead of the closing '</tb>'.
                        if tb in labels and not outputLine.endswith('</' + tb + '> '):
                            outputLine += '</' + tb + '> '
                        outputLine += '<' + tag + '> '
                        outputLine += word.split('|')[0] + ' '
                        tb = tag
                        i += 1
                        continue
                    # continuation of the current span (tb == tag here)
                    elif tag in labels:
                        if i + 1 == len(tagLine):
                            # last token of the sentence: close the span
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                        elif tag != tagLine[i + 1]:
                            # span ends before the next token
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                    # Untagged (or mid-span) token: close a span left open by
                    # the previous token if needed.
                    # BUGFIX: the original compared outputLine[-2:] against a
                    # longer string (always unequal) and never reset tb, so a
                    # closing tag was re-emitted after every later 'O' token.
                    if tb != tag and tb in labels and not outputLine.endswith('</' + tb + '> '):
                        outputLine += '</' + tb + '> '
                    # word appended untagged; track the previous tag
                    outputLine += word.split('|')[0] + ' '
                    tb = tag if tag in labels else 'O'
                    i += 1
                # Saving Sentence Output I
                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                lidx += 1

            ########################################### Save Output I ##########################################
            print("Saving Ouput I...")
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    # keep only sentences that actually contain a closed tag
                    if '</' in line:
                        oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                        oFileI.write(oline + '\n')

            ########################################### Save Output II ##########################################
            print("Saving Ouput II...")
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
                for line in sentencesOutputDataI:
                    oline = line.replace('-LRB-', '(').replace('-RRB-', ')')
                    # One row per tagged span: index columns + span text + tag.
                    for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                        lline = oline.split('\t')[0:-2] + [ttex, tag]
                        oFileII.write('\t'.join(lline) + '\n')

            ########################################### Save Output III ##########################################
            print("Saving Ouput III...")
            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
                for line, tagLine in zip(lines, y_pred):
                    # BUGFIX: the second replace originally targeted '-LRB-'
                    # again, so '-RRB-' was never converted back to ')'.
                    oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag
                             for w, tag in zip(line.split(' '), tagLine)]
                    oFileIII.write(' '.join(oline) + '\n')

            ########################################### Save Probs ##########################################
            y_probs = crf.predict_marginals(X_input)
            # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
            with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
                json.dump(y_probs, fp)
    print("Passing corpus done in: %fs" % (time() - t0))
This file is too large to display.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File mode changed
1 | -------------------------------- PARAMETERS -------------------------------- | 1 | -------------------------------- PARAMETERS -------------------------------- |
2 | --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | 2 | --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ |
3 | --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | 3 | --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ |
4 | ---outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt | 4 | +--outputFileI Output tagged file I : annot-input_bg_outputI_v6 |
5 | ---outputFileII Output tagged file II : annot-input_bg_outputII_v5 | 5 | +--outputFileII Output tagged file II : annot-input_bg_outputII_v6 |
6 | ---outputFileII Output tagged file III : annot-input_bg_outputIII_v5 | 6 | +--outputFileII Output tagged file III : annot-input_bg_outputIII_v6 |
7 | --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models | 7 | --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models |
8 | --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | 8 | --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 |
9 | --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping | 9 | --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping |
... | @@ -15,16 +15,14 @@ | ... | @@ -15,16 +15,14 @@ |
15 | --S4 Semantic features : False | 15 | --S4 Semantic features : False |
16 | -------------------------------- PROCESSING -------------------------------- | 16 | -------------------------------- PROCESSING -------------------------------- |
17 | Reading CRF model... | 17 | Reading CRF model... |
18 | -Reading CRF model done in: 0.009225s | 18 | +Reading CRF model done in: 0.009524s |
19 | Processing corpus... | 19 | Processing corpus... |
20 | Preprocessing file...annot-input_bg_v4.txt | 20 | Preprocessing file...annot-input_bg_v4.txt |
21 | Sentences input data: 90688 | 21 | Sentences input data: 90688 |
22 | Predicting tags with model... | 22 | Predicting tags with model... |
23 | -Prediction done in: 27.733279s | 23 | +Prediction done in: 27.458162s |
24 | Tagging file... | 24 | Tagging file... |
25 | -GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 library_strategy.1 <Technique> ChIP-Seq </Technique> Technique | ||
26 | -GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 growth_protocol_ch1.1 Cultures of Caulobacter -LRB- TLS1631-TLS1633 -RRB- were grown at 30oC in PYE and supplemented with antibiotics , as necessary , at appropriate concentrations . To deplete wild-type non-tagged ParB , exponential-phase cells were washed off xylose and re-introduced to PYE +0.2 % glucose for an additional <Supp> 5 hours </Supp> . After 4 hours , vanillate was added to induce the expression of flag-parB -LRB- WT -RRB- or flag-parB -LRB- G101S/R104A -RRB- for an hour . Cultures of Escherichia coli -LRB- TLS1637-TLS1650 -RRB- were grown at 30oC in LB and supplemented with antibiotics , as necessary , at appropriate concentrations . IPTG -LRB- 0.5 mM -RRB- was added to induce the production of T18-ParB -LRB- WT -RRB- or T18-ParB -LRB- G101S -RRB- . After an hour , formadehyde -LRB- 1 % final concentration -RRB- were added to fix cells for ChIP-seq . Supp | ||
27 | Saving Ouput I... | 25 | Saving Ouput I... |
28 | Saving Ouput II... | 26 | Saving Ouput II... |
29 | Saving Ouput III... | 27 | Saving Ouput III... |
30 | -Pssing corpus done in: 258.328259s | 28 | +Passing corpus done in: 257.970281s | ... | ... |
1 | --------------------------------- PARAMETERS -------------------------------- | ||
2 | ---inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | ||
3 | ---outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | ||
4 | ---outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt | ||
5 | ---outputFileII Output tagged file II : annot-input_bg_outputII_v4 | ||
6 | ---modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models | ||
7 | ---modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | ||
8 | ---infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping | ||
9 | ---infoFile GSE-GSM index file : bg_sentences_midx_v4.txt | ||
10 | ---variant Run variant : 13 | ||
11 | ---S1 General features : True | ||
12 | ---S2 Inner/Complete word features : False | ||
13 | ---S3 Extended context features : False | ||
14 | ---S4 Semantic features : True | ||
15 | ---filteringStopWords Filtering stop words : False | ||
16 | ---filterSymbols Filtering punctuation marks : False | ||
17 | -Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False | ||
18 | --------------------------------- PROCESSING -------------------------------- | ||
19 | -Reading CRF model... | ||
20 | -Reading CRF model done in: 0.009363s | ||
21 | -Processing corpus... | ||
22 | -Preprocessing file...annot-input_bg_v3.txt | ||
23 | -Sentences input data: 14716 | ||
24 | -Predicting tags with model | ||
25 | -Prediction done in: 1.737334s | ||
26 | -Tagging file | ||
27 | -Preprocessing file...annot-input_bg_v4.txt | ||
28 | -Sentences input data: 90688 | ||
29 | -Predicting tags with model | ||
30 | -Prediction done in: 26.434549s | ||
31 | -Tagging file | ||
32 | -Processing corpus done in: 58.304885s |
-
Please register or login to post a comment