Estefani Gaytan Nunez

upload

+{"key1": "keyinfo", "key2": "keyinfo2"}
\ No newline at end of file
@@ -25,6 +25,8 @@ from nltk.corpus import stopwords

import training_validation_v14 as training

+import json
+
#-------------------------------------------------------------------------------
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
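Note: at its core the script is a standard sklearn-crfsuite load-and-predict loop. A minimal sketch, assuming a joblib-saved model and the token|lemma|POS input format used below (the file name and the single 'word' feature are placeholders, not the repo's real feature set):

    import joblib
    corpus = [['LB|lb|NN', 'broth|broth|NN']]   # token|lemma|POS triples, one list per sentence
    crf = joblib.load('model.mod')              # placeholder path; real models live under CRF/models
    X = [[{'word': tok.split('|')[0]} for tok in sent] for sent in corpus]
    y_pred = crf.predict(X)                     # one tag sequence per sentence, e.g. [['Med', 'Med']]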
@@ -61,10 +63,7 @@ import training_validation_v14 as training
# --variant 13

#Examples
-#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
-#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/output_tagging_report_v4.txt
-#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v4.txt --outputFileII annot-input_bg_outputII_v4 --outputFileII annot-input_bg_outputIII_v4 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 13 --S4 --S1 > predict-annot/reports/annot-input_bg_report_v4.txt
-
+#python3 predict-annot/bin/tagging/tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI_v5.txt --outputFileII annot-input_bg_outputII_v5 --outputFileIII annot-input_bg_outputIII_v5 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx_v4.txt --variant 10 --S2 > predict-annot/reports/output_tagging_report_v5.txt
__author__ = 'egaytan'

##########################################
@@ -84,12 +83,10 @@ if __name__ == "__main__":
    parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
    parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
    parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
-    parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
-    parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
+    parser.add_option("--S1", dest="S1", help="Inner word features", action="store_true", default=False)
+    parser.add_option("--S2", dest="S2", help="Complete word features", action="store_true", default=False)
    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
-    parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False)
-    parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False)

    (options, args) = parser.parse_args()
    if len(args) > 0:
@@ -109,39 +106,29 @@ if __name__ == "__main__":
    print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath))
    print("--infoFile GSE-GSM index file : " + str(options.idx))
    print("--variant Run variant : " + str(options.variant))
-    print("--S1 General features : " + str(options.S1))
-    print("--S2 Inner/Complete word features : " + str(options.S2))
+    print("--S1 Inner word features set : " + str(options.S1))
+    print("--S2 Complete word features : " + str(options.S2))
    print("--S3 Extended context features : " + str(options.S3))
    print("--S4 Semantic features : " + str(options.S4))
-    print("--filteringStopWords Filtering stop words : " + str(options.filterStopWords))
-    print("--filterSymbols Filtering punctuation marks : " + str(options.filterSymbols))
-

-    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
-               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
-    #print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
    ########################################### PROCESSING ##########################################
    print('-------------------------------- PROCESSING --------------------------------')

-    stopwords = [word for word in stopwords.words('english')]
    # Read index mapping GSE file information
    idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()

-
    ########################################### Read CRF model ##########################################
    t0 = time()
    print('Reading CRF model...')
    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
    print("Reading CRF model done in: %fs" % (time() - t0))

-
    ########################################### Reading sentences ##########################################
    print('Processing corpus...')
    t0 = time()
    labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH'])

-
-    #####################################################################################
+    ########################################### Preprocessing ###########################################
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        # For each file in dir
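Note: training.sent2features comes from training_validation_v14, which is not part of this diff. sklearn-crfsuite only requires each sentence as a list of per-token feature dicts, so a hypothetical stand-in with the rough shape (the real --S1/--S2/--S3/--S4 feature names live in the training module) looks like:

    # hypothetical stand-in for training.sent2features
    def word2features(sent, i):
        token, lemma, pos = sent[i].split('|')[:3]
        return {
            'word': token,           # complete-word features (--S2 flavour)
            'lemma': lemma,
            'postag': pos,
            'prefix3': token[:3],    # inner-word features (--S1 flavour)
        }

    def sent2features_sketch(sent):
        return [word2features(sent, i) for i in range(len(sent))]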
@@ -149,74 +136,60 @@ if __name__ == "__main__":
        print("Preprocessing file..." + str(file))
        sentencesInputData = []
        sentencesOutputDataI = []
-        sentencesOutputDataII = []
+        # Preprocessing input sentences
        with open(os.path.join(options.inputPath, file), "r") as iFile:
-            lines = iFile.readlines()
-            for line in lines:
-                listLine = []
-                for token in line.strip('\n').split():
-                    if options.filterStopWords:
-                        listToken = token.split('|')
-                        lemma = listToken[1]
-                        if lemma in stopwords:
-                            continue
-                    if options.filterSymbols:
-                        listToken = token.split('|')
-                        lemma = listToken[1]
-                        if lemma in symbols:
-                            if lemma == ',':
-                                print("Comma , identified")
-                            continue
-                    listLine.append(token)
-                sentencesInputData.append(listLine)
+            # Keep the raw lines as well: iFile is exhausted here and closed after
+            # this block, so the tagging and Output III loops below reuse this list
+            lines = [line.strip('\n') for line in iFile]
+            sentencesInputData = [line.split() for line in lines]
+        # Save input sentences
        X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
        print("Sentences input data: " + str(len(sentencesInputData)))

-
        ########################################### Predicting tags ###########################################
        t1 = time()
        print("Predicting tags with model...")
        y_pred = crf.predict(X_input)
-        print("Prediction done in: %fs" % (time() - t1))

+        print("Prediction done in: %fs" % (time() - t1))

        ########################################### Tagging with CRF model ###########################################
        print("Tagging file...")
        lidx = 0
        for line, tagLine in zip(lines, y_pred):
+            # unique tags
            Ltags = set(labels).intersection(set(tagLine))
-            outputLine = ''
+            # Skip untagged sentences (the intersection is empty when no label was predicted)
+            if not Ltags:
+                lidx += 1
+                continue
            line = line.strip('\n')
-
+            # starting empty sentence
+            outputLine = ''
+            # tag behind
            tb = 'O'
+            # per sentence word count
            i = 0
-            ########################## one word sentences ##########################
-            if len(tagLine)==1:
+            # Exception for one word sentences
+            if len(tagLine) == 1:
                if tagLine[0] in labels:
+                    # add start tagging signature
                    start = '<' + tagLine[0] + '> '
-                    end = '</' + tagLine[0] + '/>'
+                    # add end tagging signature
+                    end = '</' + tagLine[0] + '>'
                    word = line.split('|')[0] + ' '
+                    # save output tagged sentence
                    outputLine = start + word + end
                else:
                    outputLine = line.split(' ')[0]
-                ########################## Saving Sentence Ouput I ##########################
+                # Saving Sentence Output I
                sentencesOutputDataI.append(idx[lidx].replace('\n','\t') + outputLine + '\t' + ', '.join(Ltags))
-                ########################## Saving Sentence Ouput II ##########################
-                sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
+                # Increase sentence counter
                lidx += 1
+                # Continue with the next sentence
                continue
-
-            sentence = ''
-            sb = False
+            # Tagging sentences
            for word,tag in zip(line.split(' '), tagLine):
                # start tagging
                if tag in labels and tb != tag:
-                    # start tagging
                    outputLine += '<' + tag + '> '
-                    sb = True
-                    sentence = word.split('|')[0] + ' '
-                    tb = tag
                    outputLine += word.split('|')[0] + ' '
+                    tb = tag
                    i += 1
                    continue
                # end tagging
@@ -224,32 +197,24 @@ if __name__ == "__main__":
                if i+1==len(tagLine):
                    # end sentence
                    outputLine += word.split('|')[0] + ' '
-                    outputLine += '</' + tag + '/> '
+                    outputLine += '</' + tag + '> '
-                    ########################## Saving Sentence Ouput II ##########################
-                    sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' + tag)
-                    sb = False
                    tb = 'O'
                    i += 1
                    continue
                elif tag!=tagLine[i+1]:
                    # start new tag
                    outputLine += word.split('|')[0] + ' '
-                    outputLine += '</' + tag + '/> '
+                    outputLine += '</' + tag + '> '
-                    ########################## Saving Sentence Ouput II ##########################
-                    sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' + tag)
-                    sb = False
                    tb = 'O'
                    i += 1
                    continue
                # word tagged
                outputLine += word.split('|')[0] + ' '
                i += 1
-                if sb:
-                    sentence += word.split('|')[0] + ' '
-            ########################## Saving Sentence Ouput I ##########################
+            # Saving Sentence Output I
            sentencesOutputDataI.append(idx[lidx].replace('\n', '\t') + outputLine + '\t' + ', '.join(Ltags))
            lidx += 1
-
+        print("\n".join(sentencesOutputDataI[1:3]))
        ########################################### Save Output I ##########################################
        print("Saving Output I...")
        with open(os.path.join(options.outputPath, options.outFileI + '_' + options.modelName + '.tsv'), "w") as oFileI:
@@ -259,19 +224,29 @@ if __name__ == "__main__":
                oline = line.replace('LDR','(')
                oline = oline.replace('RDR',')')
                oFileI.write(oline + '\n')
+
        ########################################### Save Output II ##########################################
        print("Saving Output II...")
        with open(os.path.join(options.outputPath, options.outFileII + '_' + options.modelName + '.tsv'), "w") as oFileII:
-            for line in sentencesOutputDataII:
-                #print(line)
+            for line in sentencesOutputDataI:
                oline = line.replace('LDR','(')
                oline = oline.replace('RDR',')')
-                oFileII.write(oline + '\n')
+                # one TSV row per tagged span: (text, tag) extracted from the tagged sentence
+                for ttex, tag in re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline):
+                    lline = oline.split('\t')[0:-2] + [ttex, tag]
+                    nline = '\t'.join(lline)
+                    oFileII.write(nline + '\n')
+
        ########################################### Save Output III ##########################################
        print("Saving Output III...")
        with open(os.path.join(options.outputPath, options.outFileIII + '_' + options.modelName + '.tsv'), "w") as oFileIII:
            for line, tagLine in zip(lines, y_pred):
                oline = [ w.split('|')[0].replace('LDR','(').replace('RDR',')')+'|'+tag for w,tag in zip(line.split(' '), tagLine)]

                oFileIII.write(' '.join(oline) + '\n')
+
+        ########################################### Save Probs ##########################################
+        y_probs = crf.predict_marginals(X_input)
+        # from https://stackoverflow.com/questions/7100125/storing-python-dictionaries
+        with open(os.path.join(options.outputPath, 'crf_probs.json'), 'w') as fp:
+            json.dump(y_probs, fp)
        print("Processing corpus done in: %fs" % (time() - t0))
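Note on the new Output II writer: the regular expression pulls every (text, tag) span out of a tagged sentence, and oline.split('\t')[0:-2] keeps the index columns in front, so each tagged span becomes one TSV row. A quick illustration with made-up index values:

    import re
    oline = 'GSE000\tGSM000\t<Med> LB broth </Med> grown to <OD> 0.6 </OD>\tMed, OD'
    re.findall(r'<[^>]+>([^<]+)</([^>]+)>', oline)
    # -> [(' LB broth ', 'Med'), (' 0.6 ', 'OD')]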
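Note on the new Save Probs block: crf.predict_marginals returns, for every token, a dict mapping each label to its marginal probability, which json.dump serializes as-is. Reading the file back is symmetric; a sketch, with the output directory hard-coded for illustration:

    import json, os
    outputPath = '/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/'  # value of --outputPath
    with open(os.path.join(outputPath, 'crf_probs.json')) as fp:
        y_probs = json.load(fp)
    # y_probs[s][t] is {label: marginal probability} for token t of sentence s
    best = [[max(tok, key=tok.get) for tok in sent] for sent in y_probs]   # argmax label per token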
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
---outputFileI Output tagged file I : annot-input_bg_outputI_v4
+--outputFileI Output tagged file I : annot-input_bg_outputI_v5.txt
---outputFileII Output tagged file II : annot-input_bg_outputII_v4
+--outputFileII Output tagged file II : annot-input_bg_outputII_v5
---outputFileIII Output tagged file III : annot-input_bg_outputIII_v4
+--outputFileIII Output tagged file III : annot-input_bg_outputIII_v5
--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
--infoFile GSE-GSM index file : bg_sentences_midx_v4.txt
---variant Run variant : 13
+--variant Run variant : 10
---S1 General features : True
+--S1 Inner word features set : False
---S2 Inner/Complete word features : False
+--S2 Complete word features : True
--S3 Extended context features : False
---S4 Semantic features : True
+--S4 Semantic features : False
---filteringStopWords Filtering stop words : False
---filterSymbols Filtering punctuation marks : False
-------------------------------- PROCESSING --------------------------------
Reading CRF model...
-Reading CRF model done in: 0.009463s
+Reading CRF model done in: 0.009485s
Processing corpus...
Preprocessing file...annot-input_bg_v4.txt
Sentences input data: 90688
Predicting tags with model...
-Prediction done in: 26.367272s
+Prediction done in: 27.326342s
Tagging file...
+
Saving Output I...
Saving Output II...
Saving Output III...
-Processing corpus done in: 56.584394s
+Processing corpus done in: 247.353067s