Estefani Gaytan Nunez

upload

@@ -9,9 +9,11 @@ import random
 #
 # Input parameters
 # --inputPath=PATH    Path of input file
+# --inputFile=FILE    CoreNLP output file with tagged sentences
 # --outputPath=PATH   Path to place output files
 # --trainingFile=FILE Output training data set
 # --testFile=FILE     Output test data set
+# --index=N           Limit of CoreNLP output columns to include
 #
 # Output
 # training and test data set
@@ -23,7 +25,7 @@ import random
 # --trainingFile training-data-set-70_v4.txt
 # --testFile test-data-set-30_v4.txt
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets
-#
+# --index 5
 #
 # python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70_v4.txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5
 
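
For reference, a minimal sketch of the sentence-level 70/30 split this script is expected to perform (a hypothetical helper, grounded only in the script's `import random` and its documented outputs; the committed implementation may differ in details such as seeding):

    import random

    def split_sentences(sentences, ratio=0.7, seed=42):
        # Shuffle whole sentences so the tokens of one sentence stay together,
        # then cut at the given ratio (70% training / 30% test).
        shuffled = sentences[:]
        random.seed(seed)
        random.shuffle(shuffled)
        cut = int(len(shuffled) * ratio)
        return shuffled[:cut], shuffled[cut:]

    # train, test = split_sentences(sentences)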
@@ -11,6 +11,7 @@ from optparse import OptionParser
 # --outputFile=FILE   Output data set
 # --minWordLen        Minimum word length
 # --minSenLen         Minimum sentence length
+# --index             Limit of CoreNLP output columns to include
 #
 # Output
 # Tagged sentences reconstruction
@@ -23,6 +24,7 @@ from optparse import OptionParser
 # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input
 # --minWordLen 2
 # --minSenLen 1
+# --index 5
 #
 # python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 --index 5
 
@@ -39,7 +41,7 @@ if __name__ == "__main__":
     parser.add_option("--outputFile", dest="outputFile", help="Output data set", metavar="FILE")
     parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int")
     parser.add_option("--minSenLen", dest="sL", help="Minimum sentence length", type="int")
-
+    parser.add_option("--index", dest="index", help="Limit of CoreNLP output columns to include", metavar="N", type="int")
 
     (options, args) = parser.parse_args()
     if len(args) > 0:
@@ -58,23 +60,26 @@ if __name__ == "__main__":
     lista = []
     # First sentence
     sentence = ''
+    # Sentence counter
+    i = 0
     with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
         for line in input_file:
             if len(line.split('\t')) > 1:
                 w = line.split('\t')[1]
                 if w == "PGCGROWTHCONDITIONS":
+                    i = i + 1
                     if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL:
-                        print( "EXCLUDE: " + sentence.lstrip() )
+                        print( "EXCLUDE: line " + str(i) + ": " + sentence.lstrip() )
                     else:
                         # End of sentence
                         lista.append(sentence.lstrip())
                         # Count kept sentence
                         n = n+1
                         # New sentence
-                        sentence = ''
+                        sentence = ''
                 else:
                     # Build and save the tagged sentence
-                    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4]))
+                    sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index]))
 
     print("Number of sentences: " + str(n))
 
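
The --index option added above caps how many CoreNLP CoNLL columns are joined into each word|lemma|POS|NER token. A minimal sketch of that slicing on one illustrative tab-separated line (the sample line is made up, not from the corpus):

    # CoNLL-style columns: token index, word, lemma, POS, NER tag
    line = "1\tacetate\tacetate\tNN\tSupp"
    index = 5  # --index 5 keeps columns 1..4 after the token index
    token = '|'.join(line.strip().split('\t')[1:index])
    print(token)  # acetate|acetate|NN|Supp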
+# -*- coding: UTF-8 -*-
+
+import os
+from pandas import DataFrame as DF
+from optparse import OptionParser
+from time import time
+from collections import Counter
+
+import nltk
+import sklearn
+import scipy.stats
+import sys
+
+import joblib
+from sklearn.metrics import make_scorer
+from sklearn.model_selection import cross_val_score
+from sklearn.model_selection import RandomizedSearchCV
+
+import sklearn_crfsuite
+from sklearn_crfsuite import scorers
+from sklearn_crfsuite import metrics
+
+from nltk.corpus import stopwords
+
+import training_validation_v14 as training
+
+#-------------------------------------------------------------------------------
+# Objective
+# Tag the transformed file with a CRF model built with sklearn-crfsuite.
+#
+# Input parameters
+# --inputPath=PATH    Path of transformed files x|y|z
+# --modelPath         Path to the CRF model
+# --modelName         Model name (without the .mod extension, which is appended on load)
+# --outputPath=PATH   Output path to place output files
+# --filterStopWords   Filter stop words
+# --filterSymbols     Filter punctuation marks
+
+# Output
+# 1) Tagged files in transformed format
+
+# Examples
+# python3 tagging.py
+# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/
+# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+# --filterSymbols
+
+# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt
+
+__author__ = 'egaytan'
+##########################################
+#              MAIN PROGRAM              #
+##########################################
+
+if __name__ == "__main__":
+    # Defining parameters
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath", help="Path of transformed input files", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH")
+    parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH")
+    parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT")
+    parser.add_option("--variant", dest="variant", help="Run variant", metavar="FILE")
+    parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False)
+    parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False)
+    parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False)
+    parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False)
+    parser.add_option("--filterStopWords", dest="filterStopWords", help="Filter stop words", action="store_true", default=False)
+    parser.add_option("--filterSymbols", dest="filterSymbols", help="Filter punctuation marks", action="store_true", default=False)
+
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        parser.error("Unexpected arguments given.")
+        sys.exit(1)
+
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read input files: " + options.inputPath)
+    print("Model name: " + str(options.modelName))
+    print("Model path: " + options.modelPath)
+    print("Path to place output files: " + options.outputPath)
+    print("Filtering stop words: " + str(options.filterStopWords))
+    print("Levels: " + "S1: " + str(options.S1) + ", S2: " + str(options.S2) + ", S3: " + str(options.S3) + ", S4: " + str(options.S4))
+    print("Run variant: " + str(options.variant))
+
+    symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
+               '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
+
+    print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
+
+    print('-------------------------------- PROCESSING --------------------------------')
+
+    stopwords = stopwords.words('english')
+
+    # Read CRF model
+    t0 = time()
+    print('Reading CRF model...')
+    crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod'))
+    print("Reading CRF model done in: %fs" % (time() - t0))
+
+    # Reading sentences
+    print('Processing corpus...')
+    t0 = time()
+    labels = ['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']
+    # Walk directory to read files
+    for path, dirs, files in os.walk(options.inputPath):
+        # For each file in dir
+        for file in files:
+            print("Preprocessing file..." + str(file))
+            sentencesInputData = []
+            sentencesOutputData = []
+            with open(os.path.join(options.inputPath, file), "r") as iFile:
+                lines = iFile.readlines()
+                for line in lines:
+                    listLine = []
+                    for token in line.strip('\n').split():
+                        if options.filterStopWords:
+                            listToken = token.split('|')
+                            lemma = listToken[1]
+                            if lemma in stopwords:
+                                continue
+                        if options.filterSymbols:
+                            listToken = token.split('|')
+                            lemma = listToken[1]
+                            if lemma in symbols:
+                                if lemma == ',':
+                                    print("Comma , identified")
+                                continue
+                        listLine.append(token)
+                    sentencesInputData.append(listLine)
+            X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData]
+            print("Sentences input data: " + str(len(sentencesInputData)))
+
+
+            # Predicting tags
+            t1 = time()
+            print("Predicting tags with model")
+            y_pred = crf.predict(X_input)
+            print("Prediction done in: %fs" % (time() - t1))
+
+
+            # Tagging with CRF model
+            print("Tagging file")
+            for line, tagLine in zip(lines, y_pred):
+                Ltags = set(labels).intersection(set(tagLine))
+                outputLine = ''
+                line = line.strip('\n')
+                #print("\nLine: " + str(line))
+                #print("CRF tagged line: " + str(tagLine))
+                tb = 'O'
+                i = 0
+                if len(tagLine) == 1:
+                    if tagLine[0] in labels:
+                        start = '<' + tagLine[0] + '> '
+                        end = '<' + tagLine[0] + '/>'
+                        word = line.split('|')[0] + ' '
+                        outputLine = start + word + end
+                    else:
+                        outputLine = line.split(' ')[0]
+                    #print(outputLine + '\t' + ', '.join(Ltags))
+                    sentencesOutputData.append([outputLine, ', '.join(Ltags)])
+                    continue
+
+                for word, tag in zip(line.split(' '), tagLine):
+                    # start tagging
+                    if tag in labels and tb == 'O':
+                        outputLine += '<' + tag + '> '
+                        tb = tag
+                        outputLine += word.split('|')[0] + ' '
+                        i += 1
+                        continue
+                    # end tagging
+                    elif tb in labels:
+                        if i+1 == len(tagLine):
+                            outputLine += word.split('|')[0] + ' '
+                            outputLine += '<' + tag + '/> '
+                            tb = 'O'
+                            i += 1
+                            continue
+                        elif tagLine[i+1] == 'O':
+                            outputLine += word.split('|')[0] + ' '
+                            outputLine += '<' + tag + '/> '
+                            tb = 'O'
+                            i += 1
+                            continue
+                    # word tagged
+                    outputLine += word.split('|')[0] + ' '
+                    i += 1
+                #print(outputLine + '\t' + ', '.join(Ltags))
+                sentencesOutputData.append([outputLine, ', '.join(Ltags)])
+
+            print( DF(sentencesOutputData) )
+
+            # Save tags
+            '''
+            with open(os.path.join(options.outputPath, file), "w") as oFile:
+                for line in sentencesOutputData:
+                    oFile.write(line + '\n')
+
+            print("Processing corpus done in: %fs" % (time() - t0))
+            '''
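
The tagging loop above wraps each maximal run of a predicted label in <Label> ... <Label/> markers. A simplified standalone variant of that idea, with toy tokens and tags rather than real CRF output:

    def wrap(words, tags, labels):
        # Open a marker when a labelled run starts, close it when the run ends.
        out, open_tag = [], 'O'
        for k, (word, tag) in enumerate(zip(words, tags)):
            if tag in labels and open_tag == 'O':
                out.append('<' + tag + '>')
                open_tag = tag
            out.append(word.split('|')[0])
            run_ends = (k + 1 == len(tags)) or tags[k + 1] != tag
            if open_tag != 'O' and run_ends:
                out.append('<' + open_tag + '/>')
                open_tag = 'O'
        return ' '.join(out)

    # wrap(['growth', 'phase', ':', 'exponential'], ['O', 'O', 'O', 'Phase'], {'Phase'})
    # -> 'growth phase : <Phase> exponential <Phase/>'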
+# -*- coding: UTF-8 -*-
+
+import os
+from optparse import OptionParser
+from time import time
+from collections import Counter
+
+import nltk
+import sklearn
+import scipy.stats
+import sys
+
+#from sklearn.externals import joblib
+import joblib
+from sklearn.metrics import make_scorer
+#from sklearn.cross_validation import cross_val_score
+from sklearn.model_selection import cross_val_score
+#from sklearn.grid_search import RandomizedSearchCV
+from sklearn.model_selection import RandomizedSearchCV
+
+import sklearn_crfsuite
+from sklearn_crfsuite import scorers
+from sklearn_crfsuite import metrics
+
+from nltk.corpus import stopwords
+
+#################################
+-------------------------------- PARAMETERS --------------------------------
+Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/
+Model name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10
+Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models
+Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/
+Filtering stop words: False
+Levels: S1: False, S2: False, S3: False, S4: False
+Run variant: None
+Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False
+-------------------------------- PROCESSING --------------------------------
+Reading CRF model...
+Reading CRF model done in: 0.008342s
+Processing corpus...
+Preprocessing file...annot-input_bg_v3.txt
+Sentences input data: 14716
+Predicting tags with model
+Prediction done in: 0.983480s
+Tagging file
+ 0 1
+0 <Gtype> antibody : Flag <Gtype/> Gtype
+1 <Gversion> ChIP-Seq <Gversion/> Gversion
+2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype
+3 <Gtype> developmental stage : mixed population... Gtype
+4 DNA was isolated using the Qiagen Cell Lysis a...
+5 Escherichia coli
+6 Escherichia coli AB1157
+7 For analysis of ChIP-seq data , Hiseq 2500 Ill...
+8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype
+9 Genome _ build : NC _ 000913.3
+10 Genome _ build : NC _ 011916.1
+11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
+12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype
+13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype
+14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
+15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype
+16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
+17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype
+18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
+19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype
+20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
+21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype
+22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype
+23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
+24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype
+25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R...
+26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG
+27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG
+28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG
+29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG
+... ... ...
+14686 <Phase> ESBL019 Coliform <Phase/> Phase
+14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype
+14688 ESBL019 Reverted
+14689 <Phase> ESBL019 Transition <Phase/> Phase
+14690 Escherichia coli
+14691 Four morphologic states of ESBL019 were used d...
+14692 <Gtype> morphology : Coliform <Gtype/> Gtype
+14693 <Gtype> morphology : Filamented <Gtype/> Gtype
+14694 morphology : Reverted -LRB- reverted back from...
+14695 morphology : Transition -LRB- from Coli into F...
+14696 RNA isolation was performed using an RNeasy mi...
+14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype
+14698 The E. coli isolate ESBL019 was originally iso...
+14699 Escherichia coli
+14700 lexA 10 ' after UV vs. 0 ' , MG1655
+14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype
+14702 lexA 20 ' after NOuv vs. 0 ' , MG1655
+14703 lexA 20 ' after UV vs. 0 ' , MG1655
+14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u...
+14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype
+14706 lexA 40 ' after UV vs. 0 ' , MG1655
+14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype
+14708 lexA 5 ' after UV vs. 0 ' , MG1655
+14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype
+14710 lexA 60 ' after NOuv vs. 0 ' , MG1655
+14711 lexA 60 ' after UV vs. 0 ' , MG1655
+14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u...
+14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype
+14714 lexA vs. wt , before UV treatment , MG1655
+14715 untreated cells , 25 ug total RNA
+
+[14716 rows x 2 columns]