Estefani Gaytan Nunez

upload

1 # -*- coding: UTF-8 -*- 1 # -*- coding: UTF-8 -*-
2 2
3 import os 3 import os
4 +import re
4 from pandas import DataFrame as DF 5 from pandas import DataFrame as DF
5 from optparse import OptionParser 6 from optparse import OptionParser
6 from time import time 7 from time import time
...@@ -29,25 +30,37 @@ import training_validation_v14 as training ...@@ -29,25 +30,37 @@ import training_validation_v14 as training
29 # Tagging transformed file with CRF model with sklearn-crfsuite. 30 # Tagging transformed file with CRF model with sklearn-crfsuite.
30 # 31 #
31 # Input parameters 32 # Input parameters
32 -# --inputPath=PATH Path of transformed files x|y|z 33 +# --inputPath=PATH Path of transformed files x|y|z
33 -# --modelPath Path to CRF model 34 +# --outputPath Output path to place output files
34 -# --modelName Model name 35 +# --outputFileI Output tagged file I
35 -# --outputPath=PATH Output path to place output files 36 +# --outputFileII Output tagged file II
36 -# --filteringStopWords Filtering stop words 37 +# --modelPath Path to CRF model
37 -# --filterSymbols Filtering punctuation marks 38 +# --modelName Model name
39 +# --infoPath Path of GSE-GSM index file
40 +# --infoFile GSE-GSM index file
41 +# --variant Part of S2 variant
42 +# --S1 General features
43 +# --S2 Inner/Complete word features
44 +# --S3 Extended context features
45 +# --S4 Semantic features
46 +# --filteringStopWords Filtering stop words
47 +# --filterSymbols Filtering punctuation marks
38 48
39 # Output 49 # Output
40 # 1) Tagged files in transformed format 50 # 1) Tagged files in transformed format
41 51
42 # Examples 52 # Examples
43 -# python3 tagging.py 53 +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
44 -# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ 54 +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
45 -# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod 55 +# --outputFileI annot-input_bg_outputI.txt
46 -# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/ 56 +# --outputFileII annot-input_bg_outputII.txt
47 -# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ 57 +# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models
48 -# --filterSymbols 58 +# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
49 - 59 +# --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
50 -# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt 60 +# --infoFile bg_sentences_midx.txt
61 +# --variant 13
62 +
63 +#python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --outputFileI annot-input_bg_outputI.txt --outputFileII annot-input_bg_outputII.txt --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --infoPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping --infoFile bg_sentences_midx.txt --variant 13 --S4 --S1 > ../../reports/output_tagging_report.txt
51 64
52 __author__ = 'egaytan' 65 __author__ = 'egaytan'
53 66
...@@ -60,9 +73,13 @@ if __name__ == "__main__": ...@@ -60,9 +73,13 @@ if __name__ == "__main__":
60 parser = OptionParser() 73 parser = OptionParser()
61 parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH") 74 parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH")
62 parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") 75 parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
76 + parser.add_option("--outputFileI", dest="outFileI", help="Output tagged file I", metavar="FILE")
77 + parser.add_option("--outputFileII", dest="outFileII", help="Output tagged file II", metavar="FILE")
63 parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH") 78 parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
64 parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT") 79 parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
65 - parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE") 80 + parser.add_option("--infoPath", dest="infoPath", help="Path of GSE-GSM index file", metavar="PATH")
81 + parser.add_option("--infoFile", dest="idx", help="GSE-GSM index file", metavar="FILE")
82 + parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
66 parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False) 83 parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
67 parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False) 84 parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
68 parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False) 85 parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
...@@ -75,14 +92,25 @@ if __name__ == "__main__": ...@@ -75,14 +92,25 @@ if __name__ == "__main__":
75 parser.error("Any parameter given.") 92 parser.error("Any parameter given.")
76 sys.exit(1) 93 sys.exit(1)
77 94
95 +
78 print('-------------------------------- PARAMETERS --------------------------------') 96 print('-------------------------------- PARAMETERS --------------------------------')
79 - print("Path to read input files: " + options.inputPath) 97 +
80 - print("Mode name: " + str(options.modelName)) 98 + print("--inputPath Path of training data set : " + str(options.inputPath ))
81 - print("Model path: " + options.modelPath) 99 + print("--outputPath Output path to place output files: " + str(options.outputPath ))
82 - print("Path to place output files: " + options.outputPath) 100 + print("--outputFileI Output tagged file I : " + str(options.outFileI ))
83 - print("Filtering stop words: " + str(options.filterStopWords)) 101 + print("--outputFileII Output tagged file II : " + str(options.outFileII ))
84 - print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4)) 102 + print("--modelPath Path to read CRF model : " + str(options.modelPath ))
85 - print("Run variant: " + str(options.variant)) 103 + print("--modelName Model name : " + str(options.modelName ))
104 + print("--infoPath Path of GSE-GSM index file : " + str(options.infoPath ))
105 + print("--infoFile GSE-GSM index file : " + str(options.idx ))
106 + print("--variant Run variant : " + str(options.variant ))
107 + print("--S1 General features : " + str(options.S1 ))
108 + print("--S2 Inner/Complete word features : " + str(options.S2 ))
109 + print("--S3 Extended context features : " + str(options.S3 ))
110 + print("--S4 Semantic features : " + str(options.S4 ))
111 + print("--filteringStopWords Filtering stop words : " + str(options.filterStopWords ))
112 + print("--filterSymbols Filtering punctuation marks : " + str(options.filterSymbols ))
113 +
86 114
87 symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', 115 symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
88 '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] 116 '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
...@@ -92,7 +120,9 @@ if __name__ == "__main__": ...@@ -92,7 +120,9 @@ if __name__ == "__main__":
92 print('-------------------------------- PROCESSING --------------------------------') 120 print('-------------------------------- PROCESSING --------------------------------')
93 121
94 stopwords = [word for word in stopwords.words('english')] 122 stopwords = [word for word in stopwords.words('english')]
95 - 123 + # Read index
124 + idx = open(os.path.join(options.infoPath, options.idx), "r").readlines()
125 +
96 # Read CRF model 126 # Read CRF model
97 t0 = time() 127 t0 = time()
98 print('Reading CRF model...') 128 print('Reading CRF model...')
...@@ -108,8 +138,9 @@ if __name__ == "__main__": ...@@ -108,8 +138,9 @@ if __name__ == "__main__":
108 # For each file in dir 138 # For each file in dir
109 for file in files: 139 for file in files:
110 print("Preprocessing file..." + str(file)) 140 print("Preprocessing file..." + str(file))
111 - sentencesInputData = [] 141 + sentencesInputData = []
112 - sentencesOutputData = [] 142 + sentencesOutputDataI = []
143 + sentencesOutputDataII = []
113 with open(os.path.join(options.inputPath, file), "r") as iFile: 144 with open(os.path.join(options.inputPath, file), "r") as iFile:
114 lines = iFile.readlines() 145 lines = iFile.readlines()
115 for line in lines: 146 for line in lines:
...@@ -142,10 +173,12 @@ if __name__ == "__main__": ...@@ -142,10 +173,12 @@ if __name__ == "__main__":
142 173
143 # Tagging with CRF model 174 # Tagging with CRF model
144 print("Tagging file") 175 print("Tagging file")
176 + lidx = 0
145 for line, tagLine in zip(lines, y_pred): 177 for line, tagLine in zip(lines, y_pred):
146 Ltags = set(labels).intersection(set(tagLine)) 178 Ltags = set(labels).intersection(set(tagLine))
147 outputLine = '' 179 outputLine = ''
148 - line = line.strip('\n') 180 + line = line.strip('\n')
181 +
149 #print("\nLine: " + str(line)) 182 #print("\nLine: " + str(line))
150 #print ("CRF tagged line: " + str(tagLine)) 183 #print ("CRF tagged line: " + str(tagLine))
151 tb = 'O' 184 tb = 'O'
...@@ -153,20 +186,25 @@ if __name__ == "__main__": ...@@ -153,20 +186,25 @@ if __name__ == "__main__":
153 if len(tagLine)==1: 186 if len(tagLine)==1:
154 if tagLine[0] in labels: 187 if tagLine[0] in labels:
155 start = '<' + tagLine[0] + '> ' 188 start = '<' + tagLine[0] + '> '
156 - end = '<' + tagLine[0] + '/>' 189 + end = '</' + tagLine[0] + '/>'
157 - word = line.split('|')[0] + ' ' 190 + word = line.split('|')[0] + ' '
158 outputLine = start + word + end 191 outputLine = start + word + end
159 else: 192 else:
160 outputLine = line.split(' ')[0] 193 outputLine = line.split(' ')[0]
161 #print(outputLine + '\t' + ', '.join(Ltags)) 194 #print(outputLine + '\t' + ', '.join(Ltags))
162 - sentencesOutputData.append([outputLine, ', '.join(Ltags)]) 195 + sentencesOutputDataI.append([outputLine, ', '.join(Ltags)])
196 + sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + word.split('|')[0] + '\t' + tag)
163 continue 197 continue
164 198
199 + sentence = ''
200 + sb = False
165 for word,tag in zip(line.split(' '), tagLine): 201 for word,tag in zip(line.split(' '), tagLine):
166 # start tagging 202 # start tagging
167 - if tag in labels and tb == 'O': 203 + if tag in labels and tb != tag:
168 # start tagging 204 # start tagging
169 outputLine += '<' + tag + '> ' 205 outputLine += '<' + tag + '> '
206 + sb = True
207 + sentence = word.split('|')[0] + ' '
170 tb = tag 208 tb = tag
171 outputLine += word.split('|')[0] + ' ' 209 outputLine += word.split('|')[0] + ' '
172 i += 1 210 i += 1
...@@ -174,40 +212,38 @@ if __name__ == "__main__": ...@@ -174,40 +212,38 @@ if __name__ == "__main__":
174 # end tagging 212 # end tagging
175 elif tb in labels: 213 elif tb in labels:
176 if i+1==len(tagLine): 214 if i+1==len(tagLine):
177 - # end tagging 215 + # end sentence
178 outputLine += word.split('|')[0] + ' ' 216 outputLine += word.split('|')[0] + ' '
179 - outputLine += '<' + tag + '/> ' 217 + outputLine += '</' + tag + '/> '
218 + sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
219 + sb = False
180 tb = 'O' 220 tb = 'O'
181 i += 1 221 i += 1
182 continue 222 continue
183 - elif tagLine[i+1]=='O': 223 + elif tag!=tagLine[i+1]:
184 - # end tagging 224 + # start new tag
185 outputLine += word.split('|')[0] + ' ' 225 outputLine += word.split('|')[0] + ' '
186 - outputLine += '<' + tag + '/> ' 226 + outputLine += '</' + tag + '/> '
227 + sentencesOutputDataII.append(idx[lidx].replace('\n', '\t') + sentence + word.split('|')[0] + '\t' +tag)
228 + sb = False
187 tb = 'O' 229 tb = 'O'
188 i += 1 230 i += 1
189 continue 231 continue
190 # word tagged 232 # word tagged
191 outputLine += word.split('|')[0] + ' ' 233 outputLine += word.split('|')[0] + ' '
192 i += 1 234 i += 1
193 - #print(outputLine + '\t' + ', '.join(Ltags)) 235 + if sb:
194 - sentencesOutputData.append([outputLine, ', '.join(Ltags)]) 236 + sentence+= word.split('|')[0] + ' '
237 + #print(outputLine + '\t' + ', '.join(Ltags))
238 + sentencesOutputDataI.append([outputLine, ', '.join(Ltags)])
239 + lidx += 1
195 240
196 - print( DF(sentencesOutputData) ) 241 + #print( DF(sentencesOutputDataI) )
197 - 242 + #print( '\n'.join(sentencesOutputDataII) )
198 # Save tags 243 # Save tags
199 - ''' 244 + with open(os.path.join(options.outputPath, options.outFileII), "w") as oFile:
200 - with open(os.path.join(options.outputPath, file), "w") as oFile: 245 + for line in sentencesOutputDataII:
201 - for line in sentencesOutputData: 246 + #print(line)
202 oFile.write(line + '\n') 247 oFile.write(line + '\n')
203 248
204 print("Processing corpus done in: %fs" % (time() - t0)) 249 print("Processing corpus done in: %fs" % (time() - t0))
205 -'''
206 -
207 -
208 -
209 -
210 -
211 -
212 -
213 -
......
This diff could not be displayed because it is too large.
1 -------------------------------- PARAMETERS -------------------------------- 1 -------------------------------- PARAMETERS --------------------------------
2 -Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ 2 +--inputPath Path of training data set : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
3 -Mode name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 3 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
4 -Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models 4 +--outputFileI Output tagged file I : annot-input_bg_outputI.txt
5 -Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ 5 +--outputFileII Output tagged file II : annot-input_bg_outputII.txt
6 -Filtering stop words: False 6 +--modelPath Path to read CRF model : /home/egaytan/automatic-extraction-growth-conditions/CRF/models
7 -Levels: S1: FalseS2: FalseS3: FalseS4: False 7 +--modelName Model name : model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
8 -Run variant: None 8 +--infoPath Path of GSE-GSM index file : /home/egaytan/automatic-extraction-growth-conditions/predict-annot/mapping
9 +--infoFile GSE-GSM index file : bg_sentences_midx.txt
10 +--variant Run variant : 13
11 +--S1 General features : True
12 +--S2 Inner/Complete word features : False
13 +--S3 Extended context features : False
14 +--S4 Semantic features : True
15 +--filteringStopWords Filtering stop words : False
16 +--filterSymbols Filtering punctuation marks : False
9 Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False 17 Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
10 -------------------------------- PROCESSING -------------------------------- 18 -------------------------------- PROCESSING --------------------------------
11 Reading CRF model... 19 Reading CRF model...
12 -Reading CRF model done in: 0.008342s 20 +Reading CRF model done in: 0.008336s
13 Processing corpus... 21 Processing corpus...
14 Preprocessing file...annot-input_bg_v3.txt 22 Preprocessing file...annot-input_bg_v3.txt
15 Sentences input data: 14716 23 Sentences input data: 14716
16 Predicting tags with model 24 Predicting tags with model
17 -Prediction done in: 0.983480s 25 +Prediction done in: 1.688127s
18 Tagging file 26 Tagging file
19 - 0 1 27 +Processing corpus done in: 3.948320s
20 -0 <Gtype> antibody : Flag <Gtype/> Gtype
21 -1 <Gversion> ChIP-Seq <Gversion/> Gversion
22 -2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
23 -3 <Gtype> developmental stage : mixed population... Gtype
24 -4 DNA was isolated using the Qiagen Cell Lysis a...
25 -5 Escherichia coli
26 -6 Escherichia coli AB1157
27 -7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
28 -8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
29 -9 Genome _ build : NC _ 000913.3
30 -10 Genome _ build : NC _ 011916.1
31 -11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
32 -12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
33 -13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
34 -14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
35 -15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
36 -16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
37 -17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
38 -18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
39 -19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
40 -20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
41 -21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
42 -22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
43 -23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
44 -24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
45 -25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
46 -26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
47 -27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
48 -28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
49 -29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
50 -... ... ...
51 -14686 <Phase> ESBL019 Coliform <Phase/> Phase
52 -14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
53 -14688 ESBL019 Reverted
54 -14689 <Phase> ESBL019 Transition <Phase/> Phase
55 -14690 Escherichia coli
56 -14691 Four morphologic states of ESBL019 were used d...
57 -14692 <Gtype> morphology : Coliform <Gtype/> Gtype
58 -14693 <Gtype> morphology : Filamented <Gtype/> Gtype
59 -14694 morphology : Reverted -LRB- reverted back from...
60 -14695 morphology : Transition -LRB- from Coli into F...
61 -14696 RNA isolation was performed using an RNeasy mi...
62 -14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
63 -14698 The E. coli isolate ESBL019 was originally iso...
64 -14699 Escherichia coli
65 -14700 lexA 10 ' after UV vs. 0 ' , MG1655
66 -14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
67 -14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
68 -14703 lexA 20 ' after UV vs. 0 ' , MG1655
69 -14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
70 -14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
71 -14706 lexA 40 ' after UV vs. 0 ' , MG1655
72 -14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
73 -14708 lexA 5 ' after UV vs. 0 ' , MG1655
74 -14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
75 -14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
76 -14711 lexA 60 ' after UV vs. 0 ' , MG1655
77 -14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
78 -14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
79 -14714 lexA vs. wt , before UV treatment , MG1655
80 -14715 untreated cells , 25 ug total RNA
81 -
82 -[14716 rows x 2 columns]
......