Estefani Gaytan Nunez

upload

1 +# -*- coding: UTF-8 -*-
2 +
3 +import os
4 +import re
5 +from pandas import DataFrame as DF
6 +from optparse import OptionParser
7 +from time import time
8 +from collections import Counter
9 +
10 +import nltk
11 +import sklearn
12 +import scipy.stats
13 +import sys
14 +
15 +import joblib
16 +from sklearn.metrics import make_scorer
17 +from sklearn.model_selection import cross_val_score
18 +from sklearn.model_selection import RandomizedSearchCV
19 +
20 +import sklearn_crfsuite
21 +from sklearn_crfsuite import scorers
22 +from sklearn_crfsuite import metrics
23 +
24 +from nltk.corpus import stopwords
25 +
26 +import training_validation_v14 as training
27 +
28 +import json
29 +
30 +#-------------------------------------------------------------------------------
31 +# Objective
32 +# Tagging transformed file with CRF model with sklearn-crfsuite.
33 +#
34 +# Input parameters
35 +# --inputPath=PATH Path of transformed files x|y|z
36 +# --outputPath Output path to place output files
37 +# --outputFileI Output tagged file I
38 +# --outputFileII Output tagged file II
39 +# --modelPath Path to CRF model
40 +# --modelName Model name
41 +# --infoPath Path of GSE-GSM index file
42 +#   --infoFile                       GSE-GSM index file
43 +# --variant Part of S2 variant
44 +# --S1 Inner word features set
45 +# --S2 Complete word features
46 +# --S3 Extended context features
47 +# --S4 Semantic features
48 +# --filteringStopWords Filtering stop words
49 +# --filterSymbols Filtering punctuation marks
50 +
51 +# Output
52 +# 1) Tagged files in transformed format
53 +
54 +# Examples
55 +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
56 +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
57 +# --outputFileI annot-input_bg_outputI.txt
58 +# --outputFileII annot-input_bg_outputII.txt
59 +# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
60 +# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
61 +# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
62 +# --infoFile bg_sentences_midx.txt
63 +# --variant 13
64 +
65 +#Examples
66 +#predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/annot-input_bg_report_v4.txt > predict-annot/reports/output_tagging_report_v5.txt
67 +__author__ = 'egaytan'
68 +
69 +##########################################
70 +# MAIN PROGRAM #
71 +##########################################
72 +
if __name__ == "__main__":
    ########################################### Defining parameters ##########################################
    # Command-line interface: paths for input/output, the serialized CRF model,
    # the GSE-GSM index file, and the feature-set switches (S1..S4) forwarded
    # to training_validation_v14.sent2features.
    parser = OptionParser()
    parser.add_option("--inputPath",     dest="inputPath",  help="Path of training data set",         metavar="PATH")
    parser.add_option("--outputPath",    dest="outputPath", help="Output path to place output files", metavar="PATH")
    parser.add_option("--outputFileI",   dest="outFileI",   help="Output tagged file I",              metavar="FILE")
    parser.add_option("--outputFileII",  dest="outFileII",  help="Output tagged file II",             metavar="FILE")
    parser.add_option("--outputFileIII", dest="outFileIII", help="Output tagged file III",            metavar="FILE")
    parser.add_option("--modelPath",     dest="modelPath",  help="Path to read CRF model",            metavar="PATH")
    parser.add_option("--modelName",     dest="modelName",  help="Model name",                        metavar="TEXT")
    parser.add_option("--infoPath",      dest="infoPath",   help="Path of GSE-GSM index file",        metavar="PATH")
    parser.add_option("--infoFile",      dest="idx",        help="GSE-GSM index file",                metavar="FILE")
    parser.add_option("--variant",       dest="variant",    help="Run variant",                       metavar="FILE")
    parser.add_option("--S1", dest="S1", help="Inner word features",       action="store_true", default=False)
    parser.add_option("--S2", dest="S2", help="Complete word features",    action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features",         action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message to stderr and exits with status 2,
        # so no explicit sys.exit() is needed afterwards.
        parser.error("Unexpected positional arguments given.")

    ########################################### DISP PARAMETERS ##########################################
    # Echo every received option so each run report is self-describing.
    print('-------------------------------- PARAMETERS --------------------------------')

    print("--inputPath      Path of training data set        : " + str(options.inputPath))
    print("--outputPath     Output path to place output files: " + str(options.outputPath))
    print("--outputFileI    Output tagged file I             : " + str(options.outFileI))
    print("--outputFileII   Output tagged file II            : " + str(options.outFileII))
    print("--outputFileIII  Output tagged file III           : " + str(options.outFileIII))
    print("--modelPath      Path to read CRF model           : " + str(options.modelPath))
    print("--modelName      Model name                       : " + str(options.modelName))
    print("--infoPath       Path of GSE-GSM index file       : " + str(options.infoPath))
    print("--infoFile       GSE-GSM index file               : " + str(options.idx))
    print("--variant        Run variant                      : " + str(options.variant))
    print("--S1             Inner word features set          : " + str(options.S1))
    print("--S2             Complete word features           : " + str(options.S2))
    print("--S3             Extended context features        : " + str(options.S3))
    print("--S4             Semantic features                : " + str(options.S4))

    ########################################### PROCESSING ##########################################
    print('-------------------------------- PROCESSING --------------------------------')

    # Read index mapping GSE file information; idx[k] is the metadata line for
    # the k-th input sentence (tab-joined into the output rows below).
    with open(os.path.join(options.infoPath, options.idx), "r") as infoFile:
        idx = infoFile.readlines()

    ########################################### Read CRF model ##########################################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

    ########################################### Reading sentences ##########################################
    print('Processing corpus...')
    t0 = time()
    # Entity labels of interest; any other predicted tag (e.g. 'O') is ignored.
    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']

    ########################################### Preprocessing ###########################################
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
        for file in files:
            print("Preprocessing file..." + str(file))
            sentencesInputData = []
            sentencesOutputDataI = []
            # Preprocessing input sentences: one sentence per line, tokens
            # separated by spaces, each token in word|lemma|pos transformed form.
            with open(os.path.join(options.inputPath, file), "r") as iFile:
                lines = iFile.readlines()
                sentencesInputData = [line.strip('\n').split() for line in lines]
            # Build the feature dicts exactly as at training time.
            X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
            print("Sentences input data: " + str(len(sentencesInputData)))

            ########################################### Predicting tags ###########################################
            t1 = time()
            print("Predicting tags with model...")
            y_pred = crf.predict(X_input)

            print("Prediction done in: %fs" % (time() - t1))

            ########################################### Tagging with CRF model ###########################################
            print("Tagging file...")
            lidx = 0
            for line, tagLine in zip(lines, y_pred):
                # Unique entity tags present in this sentence's prediction.
                Ltags = set(labels).intersection(set(tagLine))
                # Skip untagged sentence
                if Ltags == {'O'}: continue
                line = line.strip('\n')
                # starting empty sentence
                outputLine = ''
                # tag behind (tag of the previous token)
                tb = 'O'
                # per sentence word count
                i = 0
                # Exception for one word sentences
                if len(tagLine) == 1:
                    if tagLine[0] in labels:
                        # add start tagging signature
                        start = '<' + tagLine[0] + '> '
                        # add end tagging signature
                        end = '</' + tagLine[0] + '>'
                        word = line.split('|')[0] + ' '
                        # save output tagged sentence
                        outputLine = start + word + end
                    else:
                        outputLine = line.split(' ')[0]
                    # Saving Sentence Ouput I
                    sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
                    # Increase sentence counter
                    lidx += 1
                    # Continue with the next sentence
                    continue
                # Tagging sentences: walk tokens and predicted tags in lockstep,
                # inserting <Tag> ... </Tag> signatures around labeled spans.
                for word, tag in zip(line.split(' '), tagLine):
                    # start tagging
                    if tag in labels and tb != tag:
                        # check the last tagged word(s)
                        # NOTE(review): this appends an *opening* '<tb>' signature; a closing
                        # '</tb>' seems intended here to terminate the previous span —
                        # confirm against expected output before changing.
                        if tb in labels and outputLine[-2:] != '> ':
                            outputLine += '<' + tb + '> '
                        outputLine += '<' + tag + '> '
                        outputLine += word.split('|')[0] + ' '
                        tb = tag
                        i += 1
                        continue
                    # end tagging
                    elif tag in labels:
                        if i + 1 == len(tagLine):
                            # close the span at end of sentence
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                        elif tag != tagLine[i + 1]:
                            # close the span because the next token has a different tag
                            outputLine += word.split('|')[0] + ' '
                            outputLine += '</' + tag + '> '
                            tb = 'O'
                            i += 1
                            continue
                    # check the last tagged word(s)
                    # NOTE(review): outputLine[-2:] is only two characters, so it can never
                    # equal '<' + tb + '> ' (length >= 4); the second comparison is
                    # effectively always True — confirm the intended check.
                    if tb != tag and tb in labels and outputLine[-2:] != '<' + tb + '> ':
                        outputLine += '</' + tb + '> '
                    # word tagged
                    outputLine += word.split('|')[0] + ' '
                    i += 1
                # Saving Sentence Ouput I
                sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
                lidx += 1

            ########################################### Save Output I ##########################################
            # Output I: one row per sentence that contains at least one closed tag,
            # with Penn-Treebank bracket tokens restored to literal parentheses.
            print("Saving Ouput I...")
            with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
                for line in sentencesOutputDataI:
                    if re.findall('</', line):
                        oline = line.replace('-LRB-', '(')
                        oline = oline.replace('-RRB-', ')')
                        oFileI.write(oline + '\n')

            ########################################### Save Output II ##########################################
            # Output II: one row per tagged span, columns = index metadata,
            # span text, span tag.
            print("Saving Ouput II...")
            with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
                for line in sentencesOutputDataI:
                    oline = line.replace('-LRB-', '(')
                    oline = oline.replace('-RRB-', ')')
                    for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
                        lline = oline.split('\t')[0:-2] + [ttex, tag]
                        nline = '\t'.join(lline)
                        oFileII.write(nline + '\n')

            ########################################### Save Output III ##########################################
            # Output III: every sentence re-emitted as word|tag pairs.
            print("Saving Ouput III...")
            with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
                for line, tagLine in zip(lines, y_pred):
                    # Fix: the second replace previously targeted '-LRB-' again,
                    # so '-RRB-' was never converted back to ')'.
                    oline = [w.split('|')[0].replace('-LRB-', '(').replace('-RRB-', ')') + '|' + tag for w, tag in zip(line.split(' '), tagLine)]

                    oFileIII.write(' '.join(oline) + '\n')

            ########################################### Save Probs ##########################################
            # Per-token marginal probabilities, persisted for downstream analysis.
            # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
            y_probs = crf.predict_marginals(X_input)
            with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
                json.dump(y_probs, fp)
            print("Passing corpus done in: %fs" % (time() - t0))
1 -------------------------------- PARAMETERS -------------------------------- 1 -------------------------------- PARAMETERS --------------------------------
2 --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ 2 --inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ 3 --outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
4 ---outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt 4 +--outputFileI Output tagged file I : annot-input_bg_outputI_v6
5 ---outputFileII Output tagged file II : annot-input_bg_outputII_v5 5 +--outputFileII Output tagged file II : annot-input_bg_outputII_v6
6 ---outputFileII Output tagged file III : annot-input_bg_outputIII_v5 6 +--outputFileII Output tagged file III : annot-input_bg_outputIII_v6
7 --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models 7 --modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
8 --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 8 --modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
9 --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping 9 --infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
...@@ -15,16 +15,14 @@ ...@@ -15,16 +15,14 @@
15 --S4 Semantic features : False 15 --S4 Semantic features : False
16 -------------------------------- PROCESSING -------------------------------- 16 -------------------------------- PROCESSING --------------------------------
17 Reading CRF model... 17 Reading CRF model...
18 -Reading CRF model done in: 0.009225s 18 +Reading CRF model done in: 0.009524s
19 Processing corpus... 19 Processing corpus...
20 Preprocessing file...annot-input_bg_v4.txt 20 Preprocessing file...annot-input_bg_v4.txt
21 Sentences input data: 90688 21 Sentences input data: 90688
22 Predicting tags with model... 22 Predicting tags with model...
23 -Prediction done in: 27.733279s 23 +Prediction done in: 27.458162s
24 Tagging file... 24 Tagging file...
25 -GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 library_strategy.1 <Technique> ChIP-Seq </Technique> Technique
26 -GSE100233 GSM2675514 GPL18006-GPL18133-PMID:29186514 growth_protocol_ch1.1 Cultures of Caulobacter -LRB- TLS1631-TLS1633 -RRB- were grown at 30oC in PYE and supplemented with antibiotics , as necessary , at appropriate concentrations . To deplete wild-type non-tagged ParB , exponential-phase cells were washed off xylose and re-introduced to PYE +0.2 % glucose for an additional <Supp> 5 hours </Supp> . After 4 hours , vanillate was added to induce the expression of flag-parB -LRB- WT -RRB- or flag-parB -LRB- G101S/R104A -RRB- for an hour . Cultures of Escherichia coli -LRB- TLS1637-TLS1650 -RRB- were grown at 30oC in LB and supplemented with antibiotics , as necessary , at appropriate concentrations . IPTG -LRB- 0.5 mM -RRB- was added to induce the production of T18-ParB -LRB- WT -RRB- or T18-ParB -LRB- G101S -RRB- . After an hour , formadehyde -LRB- 1 % final concentration -RRB- were added to fix cells for ChIP-seq . Supp
27 Saving Ouput I... 25 Saving Ouput I...
28 Saving Ouput II... 26 Saving Ouput II...
29 Saving Ouput III... 27 Saving Ouput III...
30 -Pssing corpus done in: 258.328259s 28 +Passing corpus done in: 257.970281s
......
1 --------------------------------- PARAMETERS --------------------------------
2 ---inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 ---outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
4 ---outputFileI Output tagged file I : annot-input_bg_outputI_v4.txt
5 ---outputFileII Output tagged file II : annot-input_bg_outputII_v4
6 ---modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
7 ---modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
8 ---infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
9 ---infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
10 ---variant Run variant : 13
11 ---S1 General features : True
12 ---S2 Inner/Complete word features : False
13 ---S3 Extended context features : False
14 ---S4 Semantic features : True
15 ---filteringStopWords Filtering stop words : False
16 ---filterSymbols Filtering punctuation marks : False
17 -Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
18 --------------------------------- PROCESSING --------------------------------
19 -Reading CRF model...
20 -Reading CRF model done in: 0.009363s
21 -Processing corpus...
22 -Preprocessing file...annot-input_bg_v3.txt
23 -Sentences input data: 14716
24 -Predicting tags with model
25 -Prediction done in: 1.737334s
26 -Tagging file
27 -Preprocessing file...annot-input_bg_v4.txt
28 -Sentences input data: 90688
29 -Predicting tags with model
30 -Prediction done in: 26.434549s
31 -Tagging file
32 -Processing corpus done in: 58.304885s