Showing 8 changed files with 887 additions and 5 deletions
... | @@ -9,9 +9,11 @@ import random | ... | @@ -9,9 +9,11 @@ import random |
9 | # | 9 | # |
10 | # Input parameters | 10 | # Input parameters |
11 | # --inputPath=PATH Path of inputfile | 11 | # --inputPath=PATH Path of inputfile |
12 | +# --inputFile CoreNLP output file with tagged sentences | ||
12 | # --outputPath=PATH Path to place output files | 13 | # --outputPath=PATH Path to place output files |
13 | # --trainingFile=testFile Output training data set | 14 | # --trainingFile=testFile Output training data set |
14 | # --testFile=testFile Output test data set | 15 | # --testFile=testFile Output test data set |
16 | +# --index Upper limit (exclusive) of CoreNLP output columns to include | ||
15 | # | 17 | # |
16 | # Output | 18 | # Output |
17 | # training and test data set | 19 | # training and test data set |
... | @@ -23,7 +25,7 @@ import random | ... | @@ -23,7 +25,7 @@ import random |
23 | # --trainingFile training-data-set-70_v4.txt | 25 | # --trainingFile training-data-set-70_v4.txt |
24 | # --testFile test-data-set-30_v4.txt | 26 | # --testFile test-data-set-30_v4.txt |
25 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets | 27 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets |
26 | -# | 28 | +# --index 5 |
27 | # | 29 | # |
28 | # python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5 | 30 | # python label-split_training_test_v1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/test-trainig --inputFile raw-metadata-senteneces_v2.txt.conll --trainingFile training-data-set-70._v4txt --testFile test-data-set-30_v4.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --index 5 |
29 | 31 | ... | ... |
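The hunk above only updates the header comments of label-split_training_test_v1.py, the script that splits the pipe-joined CoreNLP sentences into a 70% training and a 30% test set. The script body is not shown in this diff, so the snippet below is only a hypothetical sketch of that splitting step (the function name, the fixed seed and the sample tokens are illustrative, not the script's actual code):

import random

def split_70_30(sentences, seed=42):
    # Shuffle a copy of the tagged sentences and cut at the 70% boundary
    random.seed(seed)
    shuffled = list(sentences)
    random.shuffle(shuffled)
    cut = int(len(shuffled) * 0.7)
    return shuffled[:cut], shuffled[cut:]

train, test = split_70_30(['glucose|glucose|NN|Supp', 'LB|LB|NN|Med', 'aerobic|aerobic|JJ|Air'])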
... | @@ -11,6 +11,7 @@ from optparse import OptionParser | ... | @@ -11,6 +11,7 @@ from optparse import OptionParser |
11 | # --outputFile=File Output data set | 11 | # --outputFile=File Output data set |
12 | # --minWordLen Minimum word length | 12 | # --minWordLen Minimum word length |
13 | # --minSenLen Minimum sentence length | 13 | # --minSenLen Minimum sentence length |
14 | +# --index Upper limit (exclusive) of CoreNLP output columns to include | ||
14 | # | 15 | # |
15 | # Output | 16 | # Output |
16 | # Tagged sentences reconstruction | 17 | # Tagged sentences reconstruction |
... | @@ -23,6 +24,7 @@ from optparse import OptionParser | ... | @@ -23,6 +24,7 @@ from optparse import OptionParser |
23 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input | 24 | # --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input |
24 | # --minWordLen 2 | 25 | # --minWordLen 2 |
25 | # --minSenLen 1 | 26 | # --minSenLen 1 |
27 | +# --index 5 | ||
26 | # | 28 | # |
27 | #python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 | 29 | #python built_bg_sentences.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CoreNLP/output/annotation --inputFile bg_sentences_v2.txt.ner --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input --outputFile annot-input_bg.txt --minWordLen 2 --minSenLen 1 |
28 | 30 | ||
... | @@ -39,7 +41,7 @@ if __name__ == "__main__": | ... | @@ -39,7 +41,7 @@ if __name__ == "__main__": |
39 | parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE") | 41 | parser.add_option("--outputFile", dest="outputFile", help="File with training data set", metavar="FILE") |
40 | parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int") | 42 | parser.add_option("--minWordLen", dest="wL", help="Minimum word length", type="int") |
41 | parser.add_option("--minSenLen", dest="sL", help="Minimum word length", type="int") | 43 | parser.add_option("--minSenLen", dest="sL", help="Minimum word length", type="int") |
42 | - | 44 | + parser.add_option("--index", dest="index", help="Upper limit (exclusive) of CoreNLP output columns to include", metavar='N', type=int)
43 | 45 | ||
44 | (options, args) = parser.parse_args() | 46 | (options, args) = parser.parse_args() |
45 | if len(args) > 0: | 47 | if len(args) > 0: |
... | @@ -58,23 +60,26 @@ if __name__ == "__main__": | ... | @@ -58,23 +60,26 @@ if __name__ == "__main__": |
58 | lista = [] | 60 | lista = [] |
59 | #First sentence | 61 | #First sentence |
60 | sentence = '' | 62 | sentence = '' |
63 | + #count | ||
64 | + i = 0 | ||
61 | with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file: | 65 | with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file: |
62 | for line in input_file: | 66 | for line in input_file: |
63 | if len(line.split('\t')) > 1: | 67 | if len(line.split('\t')) > 1: |
64 | w = line.split('\t')[1] | 68 | w = line.split('\t')[1] |
65 | if w == "PGCGROWTHCONDITIONS": | 69 | if w == "PGCGROWTHCONDITIONS": |
70 | + i = i + 1 | ||
66 | if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL: | 71 | if len( sentence.lstrip().split(' ') ) <= options.sL and len(sentence.lstrip().split(' ')[0].split('|')[0]) <= options.wL: |
67 | - print( "EXCLUDE: " + sentence.lstrip() ) | 72 | + print( "EXCLUDE: " + str(i) + " line " + sentence.lstrip() )
68 | else: | 73 | else: |
69 | #End of sentence | 74 | #End of sentence |
70 | lista.append(sentence.lstrip()) | 75 | lista.append(sentence.lstrip()) |
71 | #New sentence | 76 | #New sentence |
72 | n = n+1 | 77 | n = n+1 |
73 | #New sentence | 78 | #New sentence |
74 | - sentence = '' | 79 | + sentence = '' |
75 | else: | 80 | else: |
76 | #Building and save tagging sentence | 81 | #Building and save tagging sentence |
77 | - sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:4])) | 82 | + sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:options.index])) |
78 | 83 | ||
79 | print("Number of sentences: " + str(n)) | 84 | print("Number of sentences: " + str(n)) |
80 | 85 | ... | ... |
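The functional change in built_bg_sentences.py replaces the hard-coded column slice [1:4] with [1:options.index], so --index now controls how many tab-separated CoreNLP columns are kept in each pipe-joined token. A minimal sketch of that behaviour (the exact CoreNLP column layout below is an assumption based on how the script indexes the line):

def build_token(conll_line, index=5):
    # CoreNLP .conll line: token number, word, lemma, POS, NER, ... (tab-separated)
    fields = conll_line.rstrip('\n').split('\t')
    return '|'.join(fields[1:index])

print(build_token('3\tglucose\tglucose\tNN\tO'))   # -> glucose|glucose|NN|O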
predict-annot/bin/tagging/tagging.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from pandas import DataFrame as DF | ||
5 | +from optparse import OptionParser | ||
6 | +from time import time | ||
7 | +from collections import Counter | ||
8 | + | ||
9 | +import nltk | ||
10 | +import sklearn | ||
11 | +import scipy.stats | ||
12 | +import sys | ||
13 | + | ||
14 | +import joblib | ||
15 | +from sklearn.metrics import make_scorer | ||
16 | +from sklearn.model_selection import cross_val_score | ||
17 | +from sklearn.model_selection import RandomizedSearchCV | ||
18 | + | ||
19 | +import sklearn_crfsuite | ||
20 | +from sklearn_crfsuite import scorers | ||
21 | +from sklearn_crfsuite import metrics | ||
22 | + | ||
23 | +from nltk.corpus import stopwords | ||
24 | + | ||
25 | +import training_validation_v14 as training | ||
26 | + | ||
27 | +#------------------------------------------------------------------------------- | ||
28 | +# Objective | ||
29 | +# Tagging transformed file with CRF model with sklearn-crfsuite. | ||
30 | +# | ||
31 | +# Input parameters | ||
32 | +# --inputPath=PATH Path of transformed files x|y|z | ||
33 | +# --modelPath Path to CRF model | ||
34 | +# --modelName Model name | ||
35 | +# --outputPath=PATH Output path to place output files | ||
36 | +# --filterStopWords Filtering stop words | ||
37 | +# --filterSymbols Filtering punctuation marks | ||
38 | + | ||
39 | +# Output | ||
40 | +# 1) Tagged files in transformed format | ||
41 | + | ||
42 | +# Examples | ||
43 | +# python3 tagging.py | ||
44 | +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | ||
45 | +# --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod | ||
46 | +# --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models/ | ||
47 | +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | ||
48 | +# --filterSymbols | ||
49 | + | ||
50 | +# python3 tagging.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ --modelName model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.mod --modelPath /home/egaytan/automatic-extraction-growth-conditions/CRF/models --outputPath /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ --filterSymbols > output_tagging_report.txt | ||
51 | + | ||
52 | +__author__ = 'egaytan' | ||
53 | + | ||
54 | +########################################## | ||
55 | +# MAIN PROGRAM # | ||
56 | +########################################## | ||
57 | + | ||
58 | +if __name__ == "__main__": | ||
59 | + # Defining parameters | ||
60 | + parser = OptionParser() | ||
61 | + parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH") | ||
62 | + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") | ||
63 | + parser.add_option("--modelPath", dest="modelPath", help="Path to read CRF model", metavar="PATH") | ||
64 | + parser.add_option("--modelName", dest="modelName", help="Model name", metavar="TEXT") | ||
65 | + parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE") | ||
66 | + parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False) | ||
67 | + parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False) | ||
68 | + parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False) | ||
69 | + parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False) | ||
70 | + parser.add_option("--filterStopWords", dest="filterStopWords", help="Filtering stop words", action="store_true", default=False) | ||
71 | + parser.add_option("--filterSymbols", dest="filterSymbols", help="Filtering punctuation marks", action="store_true", default=False) | ||
72 | + | ||
73 | + (options, args) = parser.parse_args() | ||
74 | + if len(args) > 0: | ||
75 | + parser.error("Unexpected positional arguments given.") | ||
76 | + sys.exit(1) | ||
77 | + | ||
78 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
79 | + print("Path to read input files: " + options.inputPath) | ||
80 | + print("Mode name: " + str(options.modelName)) | ||
81 | + print("Model path: " + options.modelPath) | ||
82 | + print("Path to place output files: " + options.outputPath) | ||
83 | + print("Filtering stop words: " + str(options.filterStopWords)) | ||
84 | + print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4)) | ||
85 | + print("Run variant: " + str(options.variant)) | ||
86 | + | ||
87 | + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
88 | + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | ||
89 | + | ||
90 | + print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols)) | ||
91 | + | ||
92 | + print('-------------------------------- PROCESSING --------------------------------') | ||
93 | + | ||
94 | + stopwords = [word for word in stopwords.words('english')] | ||
95 | + | ||
96 | + # Read CRF model | ||
97 | + t0 = time() | ||
98 | + print('Reading CRF model...') | ||
99 | + crf = joblib.load(os.path.join(options.modelPath, options.modelName + '.mod')) | ||
100 | + print("Reading CRF model done in: %fs" % (time() - t0)) | ||
101 | + | ||
102 | + # Reading sentences | ||
103 | + print('Processing corpus...') | ||
104 | + t0 = time() | ||
105 | + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']) | ||
106 | + # Walk directory to read files | ||
107 | + for path, dirs, files in os.walk(options.inputPath): | ||
108 | + # For each file in dir | ||
109 | + for file in files: | ||
110 | + print("Preprocessing file..." + str(file)) | ||
111 | + sentencesInputData = [] | ||
112 | + sentencesOutputData = [] | ||
113 | + with open(os.path.join(options.inputPath, file), "r") as iFile: | ||
114 | + lines = iFile.readlines() | ||
115 | + for line in lines: | ||
116 | + listLine = [] | ||
117 | + for token in line.strip('\n').split(): | ||
118 | + if options.filterStopWords: | ||
119 | + listToken = token.split('|') | ||
120 | + lemma = listToken[1] | ||
121 | + if lemma in stopwords: | ||
122 | + continue | ||
123 | + if options.filterSymbols: | ||
124 | + listToken = token.split('|') | ||
125 | + lemma = listToken[1] | ||
126 | + if lemma in symbols: | ||
127 | + if lemma == ',': | ||
128 | + print("Coma , identificada") | ||
129 | + continue | ||
130 | + listLine.append(token) | ||
131 | + sentencesInputData.append(listLine) | ||
132 | + X_input = [training.sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesInputData] | ||
133 | + print("Sentences input data: " + str(len(sentencesInputData))) | ||
134 | + | ||
135 | + | ||
136 | + # Predicting tags | ||
137 | + t1 = time() | ||
138 | + print("Predicting tags with model") | ||
139 | + y_pred = crf.predict(X_input) | ||
140 | + print("Prediction done in: %fs" % (time() - t1)) | ||
141 | + | ||
142 | + | ||
143 | + # Tagging with CRF model | ||
144 | + print("Tagging file") | ||
145 | + for line, tagLine in zip(lines, y_pred): | ||
146 | + Ltags = set(labels).intersection(set(tagLine)) | ||
147 | + outputLine = '' | ||
148 | + line = line.strip('\n') | ||
149 | + #print("\nLine: " + str(line)) | ||
150 | + #print ("CRF tagged line: " + str(tagLine)) | ||
151 | + tb = 'O' | ||
152 | + i = 0 | ||
153 | + if len(tagLine)==1: | ||
154 | + if tagLine[0] in labels: | ||
155 | + start = '<' + tagLine[0] + '> ' | ||
156 | + end = '<' + tagLine[0] + '/>' | ||
157 | + word = line.split('|')[0] + ' ' | ||
158 | + outputLine = start + word + end | ||
159 | + else: | ||
160 | + outputLine = line.split(' ')[0] | ||
161 | + #print(outputLine + '\t' + ', '.join(Ltags)) | ||
162 | + sentencesOutputData.append([outputLine, ', '.join(Ltags)]) | ||
163 | + continue | ||
164 | + | ||
165 | + for word,tag in zip(line.split(' '), tagLine): | ||
166 | + # start tagging | ||
167 | + if tag in labels and tb == 'O': | ||
168 | + # start tagging | ||
169 | + outputLine += '<' + tag + '> ' | ||
170 | + tb = tag | ||
171 | + outputLine += word.split('|')[0] + ' ' | ||
172 | + i += 1 | ||
173 | + continue | ||
174 | + # end tagging | ||
175 | + elif tb in labels: | ||
176 | + if i+1==len(tagLine): | ||
177 | + # end tagging | ||
178 | + outputLine += word.split('|')[0] + ' ' | ||
179 | + outputLine += '<' + tag + '/> ' | ||
180 | + tb = 'O' | ||
181 | + i += 1 | ||
182 | + continue | ||
183 | + elif tagLine[i+1]=='O': | ||
184 | + # end tagging | ||
185 | + outputLine += word.split('|')[0] + ' ' | ||
186 | + outputLine += '<' + tag + '/> ' | ||
187 | + tb = 'O' | ||
188 | + i += 1 | ||
189 | + continue | ||
190 | + # word tagged | ||
191 | + outputLine += word.split('|')[0] + ' ' | ||
192 | + i += 1 | ||
193 | + #print(outputLine + '\t' + ', '.join(Ltags)) | ||
194 | + sentencesOutputData.append([outputLine, ', '.join(Ltags)]) | ||
195 | + | ||
196 | + print( DF(sentencesOutputData) ) | ||
197 | + | ||
198 | + # Save tags | ||
199 | + ''' | ||
200 | + with open(os.path.join(options.outputPath, file), "w") as oFile: | ||
201 | + for line in sentencesOutputData: | ||
202 | + oFile.write(line + '\n') | ||
203 | + | ||
204 | + print("Processing corpus done in: %fs" % (time() - t0)) | ||
205 | +''' | ||
206 | + | ||
207 | + | ||
208 | + | ||
209 | + | ||
210 | + | ||
211 | + | ||
212 | + | ||
213 | + |
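The tagging loop above turns the per-token CRF predictions into inline spans, opening a <Label> marker when a labelled token follows an 'O' stretch and closing it with <Label/> when the prediction returns to 'O' or the line ends. A simplified, hypothetical version of that idea (it ignores the pipe-separated token format and the single-token special case handled in the real script):

def wrap_spans(words, tags, labels):
    out, open_tag = [], None
    for word, tag in zip(words, tags):
        if tag in labels and open_tag is None:
            out.append('<' + tag + '>')        # open a new span
            open_tag = tag
        elif open_tag is not None and tag not in labels:
            out.append('<' + open_tag + '/>')  # close the current span
            open_tag = None
        out.append(word)
    if open_tag is not None:
        out.append('<' + open_tag + '/>')      # close a span that reaches the end of the line
    return ' '.join(out)

print(wrap_spans(['antibody', ':', 'Flag'], ['Gtype', 'Gtype', 'Gtype'], {'Gtype'}))
# -> <Gtype> antibody : Flag <Gtype/>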
predict-annot/bin/tagging/tlibs.py
0 → 100644
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os | ||
4 | +from optparse import OptionParser | ||
5 | +from time import time | ||
6 | +from collections import Counter | ||
7 | + | ||
8 | +import nltk | ||
9 | +import sklearn | ||
10 | +import scipy.stats | ||
11 | +import sys | ||
12 | + | ||
13 | +#from sklearn.externals import joblib | ||
14 | +import joblib | ||
15 | +from sklearn.metrics import make_scorer | ||
16 | +#from sklearn.cross_validation import cross_val_score | ||
17 | +from sklearn.model_selection import cross_val_score | ||
18 | +#from sklearn.grid_search import RandomizedSearchCV | ||
19 | +from sklearn.model_selection import RandomizedSearchCV | ||
20 | + | ||
21 | +import sklearn_crfsuite | ||
22 | +from sklearn_crfsuite import scorers | ||
23 | +from sklearn_crfsuite import metrics | ||
24 | + | ||
25 | +from nltk.corpus import stopwords | ||
26 | + | ||
27 | +################################# |
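tlibs.py only gathers the imports shared by the tagging and training scripts; the commented-out lines record the migration from the deprecated sklearn.externals.joblib, sklearn.cross_validation and sklearn.grid_search modules to their current locations. A short, illustrative sketch of the joblib persistence these scripts rely on (the path and the unfitted model are placeholders, not the repository's actual file names):

import joblib
from sklearn_crfsuite import CRF

crf = CRF(algorithm='lbfgs', max_iterations=100, all_possible_transitions=True)
# after training: persist the fitted model (training_validation_v14 writes into CRF/models/)
joblib.dump(crf, 'model_example.mod')
# before tagging: reload it (tagging.py reads from --modelPath/--modelName)
crf = joblib.load('model_example.mod')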
1 | +# -*- coding: UTF-8 -*- | ||
2 | + | ||
3 | +import os # Access the operating system | ||
4 | +#from itertools import chain # Not used | ||
5 | +from optparse import OptionParser # Command-line option parsing | ||
6 | +from time import time # Return the time in seconds since the epoch as a float | ||
7 | +from collections import Counter # Dict subclass for counting hashable objects | ||
8 | +#import re # Not used | ||
9 | + | ||
10 | +import nltk # Natural Language Toolkit platform to work with human language data | ||
11 | +import sklearn # Free software machine learning | ||
12 | +import scipy.stats # library of statistical functions | ||
13 | +import sys # to exit from Python. | ||
14 | + | ||
15 | +import joblib # provide lightweight pipelining | ||
16 | +from sklearn.metrics import make_scorer # Make a scorer from a performance metric or loss function | ||
17 | +from sklearn.model_selection import cross_val_score # Evaluate a score by cross-validation | ||
18 | +from sklearn.model_selection import RandomizedSearchCV # Randomized search on hyper parameters | ||
19 | + | ||
20 | +import sklearn_crfsuite # Thin CRFsuite | ||
21 | +from sklearn_crfsuite import scorers # Added scorers.sequence_accuracy | ||
22 | +from sklearn_crfsuite import metrics # Add flat recall score to metrics | ||
23 | + | ||
24 | +from pandas import DataFrame as DF # Construct dataframe objects | ||
25 | +from nltk.corpus import stopwords # To exclude stop words | ||
26 | + | ||
27 | +#------------------------------------------------------------------------------- | ||
28 | +# Objective | ||
29 | +# Training and evaluation of CRFs with sklearn-crfsuite. | ||
30 | +# | ||
31 | +# Input parameters | ||
32 | +# (1) --inputPath Path of training and test data set | ||
33 | +# (2) --outputPath Output path to place output files | ||
34 | +# (3) --trainingFile File with training data set | ||
35 | +# (4) --testFile File with test data set | ||
36 | +# (5) --reportName Run name used in the report and model file names | ||
37 | +# (6) --variant Feature variant (10-13) | ||
38 | +# (7) --nrules Number of crf transitions | ||
39 | +# (8) --S1 Inner word features set | ||
40 | +# (9) --S2 Complete word features | ||
41 | +# (10) --S3 Extended context features | ||
42 | +# (11) --S4 Semantic features | ||
43 | +# (12) --excludeStopWords Exclude stop words | ||
44 | +# (13) --excludeSymbols Exclude punctuation marks | ||
45 | + | ||
46 | +# Output | ||
47 | +# 1) Best model | ||
48 | +# 2) Report | ||
49 | + | ||
50 | +# Examples | ||
51 | +# python3 training_validation_v14.0.1.py | ||
52 | +# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/input/ | ||
53 | +# --trainingFile training-data-set-70-NER.txt | ||
54 | +# --testFile test-data-set-30-NER.txt | ||
55 | +# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ | ||
56 | +# --nrules 500 | ||
57 | +# --reportName Run1 | ||
58 | +# --variant 11 | ||
59 | +# --S1 | ||
60 | +# --S2 | ||
61 | +# --S3 | ||
62 | +# --S4 | ||
63 | + | ||
64 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/CRF/bin/training/training_validation_v14.0.1.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/data-sets --trainingFile training-data-set-70-NER.txt --testFile test-data-set-30-NER.txt --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/ --nrules 500 --reportName Run1 --variant 10 > ../../outputs/enero/Run1_v10.txt | ||
65 | + | ||
66 | +################################################################## | ||
67 | +# FEATURES # | ||
68 | +################################################################## | ||
69 | + | ||
70 | +#================== COMPLETE WORD FEATURES ======================# | ||
71 | + | ||
72 | +def isGreek(word): | ||
73 | + ## Complete word are greek letters | ||
74 | + alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω', | ||
75 | + 'α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω'] | ||
76 | + if word in alphabet: | ||
77 | + return True | ||
78 | + else: | ||
79 | + return False | ||
80 | + | ||
81 | +#================ INNER OF THE WORD FEATURES ====================# | ||
82 | + | ||
83 | +def hGreek(word): | ||
84 | + ## Check whether the word contains at least one Greek letter | ||
85 | + alphabet = ['Α','Β','Γ','Δ','Ε','Ζ','Η','Θ','Ι','Κ','Λ','Μ','Ν','Ξ','Ο','Π','Ρ','Σ','Τ','Υ','Φ','Χ','Ψ','Ω','α','β','γ','δ','ε','ζ','η','θ','ι','κ','λ','μ','ν','ξ','ο','π','ρ','ς','σ','τ','υ','φ','χ','ψ','ω'] | ||
86 | + # collect the Greek letters present in the word | ||
87 | + matches = [letter for letter in word if letter in alphabet] | ||
88 | + if (len(matches) > 0): | ||
89 | + return(True) | ||
90 | + else: return(False) | ||
91 | + ## At least a greek letter | ||
92 | + | ||
93 | +def hNumber(word): | ||
94 | + ## Has at least one digit | ||
95 | + for l in word: | ||
96 | + if l.isdigit(): | ||
97 | + return True | ||
98 | + return False | ||
99 | + | ||
100 | +def hUpper(word): | ||
101 | + ## At least an upper letter | ||
102 | + for l in word: | ||
103 | + if l.isupper(): return True | ||
104 | + return False | ||
105 | + | ||
106 | +def hLower(word): | ||
107 | + ## At least a lower letter | ||
108 | + for l in word: | ||
109 | + if l.islower(): return True | ||
110 | + return False | ||
111 | + | ||
112 | +#============================FEATURES===========================# | ||
113 | + | ||
114 | +def word2features(sent, i, S1, S2, S3, S4, v): #SA, v | ||
115 | + ## Getting word features | ||
116 | + | ||
117 | + ## Saving CoreNLP annotations | ||
118 | + listElem = sent[i].split('|') | ||
119 | + ## Split CoreNLP output by columns | ||
120 | + word = listElem[0] | ||
121 | + lemma = listElem[1] | ||
122 | + postag = listElem[2] | ||
123 | + ner = listElem[3] | ||
124 | + | ||
125 | + #=========================== G =============================# | ||
126 | + ## NAME LEVEL G | ||
127 | + ## FEATURE TYPE General features | ||
128 | + | ||
129 | + ## Adding to features dictionary | ||
130 | + features = { | ||
131 | + ## basal features | ||
132 | + 'lemma': lemma, | ||
133 | + 'postag': postag | ||
134 | + } | ||
135 | + | ||
136 | + ## Anterior lemma and postag | ||
137 | + ## needs more than one word in the sentence | ||
138 | + if i > 0: | ||
139 | + ## Split CoreNLP output by columns | ||
140 | + listElem = sent[i - 1].split('|') | ||
141 | + | ||
142 | + ## Saving CoreNLP annotations | ||
143 | + lemma0 = listElem[1] | ||
144 | + postag0 = listElem[2] | ||
145 | + ## Adding features to dictionary | ||
146 | + features.update({ | ||
147 | + #LemaG anterior | ||
148 | + '-1:lemma': lemma0, | ||
149 | + #Postag anterior | ||
150 | + '-1:postag': postag0, | ||
151 | + }) | ||
152 | + | ||
153 | + ## Posterior lemma and postag | ||
154 | + ## is not the last word | ||
155 | + if i < len(sent) - 1: | ||
156 | + ## Posterior word | ||
157 | + listElem = sent[i + 1].split('|') | ||
158 | + ## Saving CoreNLP annotations | ||
159 | + lemma2 = listElem[1] | ||
160 | + postag2 = listElem[2] | ||
161 | + ## Adding to features dictionary | ||
162 | + features.update({ | ||
163 | + #LemaG posterior | ||
164 | + '+1:lemma': lemma2, | ||
165 | + #Postag posterior | ||
166 | + '+1:postag': postag2, | ||
167 | + }) | ||
168 | + | ||
169 | + #=========================== S1 =============================# | ||
170 | + ## NAME LEVEL S1 | ||
171 | + ## FEATURE TYPE Inner word features | ||
172 | + | ||
173 | + if S1: | ||
174 | + ## Adding features to dictionary | ||
175 | + features.update({ | ||
176 | + 'hUpper' : hUpper(word), | ||
177 | + 'hLower' : hLower(word), | ||
178 | + 'hGreek' : hGreek(word), | ||
179 | + 'symb' : word.isalnum() | ||
180 | + }) | ||
181 | + #========== Variants of inner words features ============# | ||
182 | + if v == 10: | ||
183 | + #word first character | ||
184 | + features['word[:1]']= word[:1] | ||
185 | + | ||
186 | + #word second character | ||
187 | + if len(word)>1: | ||
188 | + features['word[:2]']= word[:2] | ||
189 | + | ||
190 | + if v == 11: | ||
191 | + #lemma and postag first character | ||
192 | + features['lemma[:1]']= lemma[:1] | ||
193 | + features['postag[:1]']= postag[:1] | ||
194 | + | ||
195 | + #lemma and postag secondChar | ||
196 | + if len(lemma)>1: | ||
197 | + features['lemma[:2]']= lemma[:2] | ||
198 | + if len(postag)>1: | ||
199 | + features['postag[:2]']= postag[:2] | ||
200 | + | ||
201 | + if v == 12: | ||
202 | + #word first character | ||
203 | + features['word[:1]']= word[:1] | ||
204 | + | ||
205 | + #word second character | ||
206 | + if len(word)>1: | ||
207 | + features['word[:2]']= word[:2] | ||
208 | + | ||
209 | + #postag first character | ||
210 | + features['postag[:1]']= postag[:1] | ||
211 | + | ||
212 | + #postag second character | ||
213 | + if len(postag)>1: | ||
214 | + features['postag[:2]']= postag[:2] | ||
215 | + | ||
216 | + if v == 13: | ||
217 | + #lemma first character | ||
218 | + features['lemma[:1]']= lemma[:1] | ||
219 | + | ||
220 | + #lemma second character | ||
221 | + if len(lemma)>1: | ||
222 | + features['lemma[:2]']= lemma[:2] | ||
223 | + | ||
224 | + #=========================== S2 =============================# | ||
225 | + ## NAME LEVEL S2 | ||
226 | + ## FEATURE TYPE Complete word features | ||
227 | + | ||
228 | + if S2: | ||
229 | + #Add features to dictionary | ||
230 | + features.update({ | ||
231 | + 'word' : word, | ||
232 | + 'isUpper' : word.isupper(), | ||
233 | + 'isLower' : word.islower(), | ||
234 | + 'isGreek' : isGreek(word), | ||
235 | + 'isNumber' : word.isdigit() | ||
236 | + }) | ||
237 | + ## Anterior word | ||
238 | + ## sentence needs more than one word | ||
239 | + if i > 0: | ||
240 | + ## Split CoreNLP output by columns | ||
241 | + listElem = sent[i - 1].split('|') | ||
242 | + ## Saving CoreNLP annotations | ||
243 | + word0 = listElem[0] | ||
244 | + features['-1:word']= word0 | ||
245 | + | ||
246 | + ## Posterior word | ||
247 | + ## is not the last word | ||
248 | + if i < len(sent)-1: | ||
249 | + ## Split CoreNLP output by columns | ||
250 | + listElem = sent[i + 1].split('|') | ||
251 | + ## Saving CoreNLP annotations | ||
252 | + word2 = listElem[0] | ||
253 | + features['+1:word']= word2 | ||
254 | + | ||
255 | + #=========================== S3 =============================# | ||
256 | + ## NAME LEVEL S3 | ||
257 | + ## FEATURE TYPE Extended context features | ||
258 | + if S3: | ||
259 | + ## more than two words in sentence | ||
260 | + if i > 1: | ||
261 | + ## Split CoreNLP output by columns | ||
262 | + listElem = sent[i - 2].split('|') | ||
263 | + ## Saving CoreNLP annotations | ||
264 | + ## two anterior lemma and postag | ||
265 | + lemma01 = listElem[1] | ||
266 | + postag01 = listElem[2] | ||
267 | + features['-2:lemma']= lemma01 | ||
268 | + features['-2:postag']= postag01 | ||
269 | + | ||
270 | + ## is not the penultimate word | ||
271 | + if i < len(sent) - 2: | ||
272 | + ## Split CoreNLP output by columns | ||
273 | + listElem = sent[i + 2].split('|') | ||
274 | + ## Saving CoreNLP annotations | ||
275 | + lemma02 = listElem[1] | ||
276 | + postag02 = listElem[2] | ||
277 | + ## two posterior lemma and postag | ||
278 | + features['+2:lemma']= lemma02 | ||
279 | + features['+2:postag']= postag02 | ||
280 | + | ||
281 | + #=========================== S4 =============================# | ||
282 | + ## NAME LEVEL S4 | ||
283 | + ## FEATURE TYPE NER | ||
284 | + if S4: | ||
285 | + ## more than one word in sentence | ||
286 | + if i > 0: | ||
287 | + ## Split CoreNLP output by columns | ||
288 | + listElem = sent[i - 1].split('|') | ||
289 | + ## =============== Anterior ner ====================## | ||
290 | + ## Saving CoreNLP annotations according column position | ||
291 | + ner0 = listElem[3] | ||
292 | + ## Adding to features dictionary | ||
293 | + features['-1:ner'] = ner0 | ||
294 | + | ||
295 | + ## is not the last word | ||
296 | + if i < len(sent) - 1: | ||
297 | + ## Split CoreNLP output by columns | ||
298 | + listElem = sent[i + 1].split('|') | ||
299 | + ## ============= Posterior ner ====================## | ||
300 | + ## Saving CoreNLP annotations according column position | ||
301 | + ner2 = listElem[3] | ||
302 | + ## Adding to features dictionary | ||
303 | + features['+1:ner'] = ner2 | ||
304 | + | ||
305 | + if i > 1: | ||
306 | + ## Split CoreNLP output by columns | ||
307 | + listElem = sent[i - 2].split('|') | ||
308 | + ## Saving CoreNLP annotations | ||
309 | + ## =============== 2 Anterior ner =================## | ||
310 | + ner01 = listElem[3] | ||
311 | + features['-2:ner']= ner01 | ||
312 | + | ||
313 | + ## is not the penultimate word | ||
314 | + if i < len(sent) - 2: | ||
315 | + ## Split CoreNLP output by columns | ||
316 | + listElem = sent[i + 2].split('|') | ||
317 | + ## Saving CoreNLP annotations | ||
318 | + ner02 = listElem[3] | ||
319 | + ## ============= 2 Posterior ner =================## | ||
320 | + features['+2:ner']= ner02 | ||
321 | + | ||
322 | + return features | ||
323 | + | ||
324 | +def sent2features(sent, S1, S2, S3, S4, v): | ||
325 | + ## Iterate over the sentence, collecting features for each word | ||
326 | + return [word2features(sent, i, S1, S2, S3, S4, v) for i in range(len(sent))] | ||
327 | + | ||
328 | +def sent2labels(sent): | ||
329 | + ## The tag is the last pipe-separated field of each token | ||
330 | + return [elem.split('|')[-1] for elem in sent] | ||
331 | + | ||
332 | +def sent2tokens(sent): | ||
333 | + return [token for token, postag, label in sent] | ||
334 | + | ||
335 | +def print_transitions(trans_features, f): | ||
336 | + for (label_from, label_to), weight in trans_features: | ||
337 | + f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight)) | ||
338 | + | ||
339 | +def print_state_features(state_features, f): | ||
340 | + for (attr, label), weight in state_features: | ||
341 | + f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8"))) | ||
342 | + | ||
343 | + | ||
344 | +__author__ = 'egaytan' | ||
345 | + | ||
346 | +################################################################## | ||
347 | +# MAIN PROGRAM # | ||
348 | +################################################################## | ||
349 | + | ||
350 | +if __name__ == "__main__": | ||
351 | + ## Defining parameters | ||
352 | + parser = OptionParser() | ||
353 | + parser.add_option("--inputPath", dest="inputPath", help="Path of training data set", metavar="PATH") | ||
354 | + parser.add_option("--outputPath", dest="outputPath", help="Output path to place output files", metavar="PATH") | ||
355 | + parser.add_option("--trainingFile", dest="trainingFile", help="File with training data set", metavar="FILE") | ||
356 | + parser.add_option("--testFile", dest="testFile", help="File with test data set", metavar="FILE") | ||
357 | + parser.add_option("--reportName", dest="reportName", help="Report number run", metavar="FILE") | ||
358 | + parser.add_option("--variant", dest="variant", help="Report file", metavar="FILE") | ||
359 | + parser.add_option("--S1", dest="S1", help="General features", action="store_true", default=False) | ||
360 | + parser.add_option("--S2", dest="S2", help="Inner/Complete word features", action="store_true", default=False) | ||
361 | + parser.add_option("--S3", dest="S3", help="Extended context features", action="store_true", default=False) | ||
362 | + parser.add_option("--S4", dest="S4", help="Semantic features", action="store_true", default=False) | ||
363 | + parser.add_option("--excludeStopWords", dest="excludeStopWords",help="Exclude stop words", action="store_true", default=False) | ||
364 | + parser.add_option("--excludeSymbols", dest="excludeSymbols", help="Exclude punctuation marks", action="store_true", default=False) | ||
365 | + parser.add_option("--nrules", dest="nrules", help="Number of crf rules on report", type="int") | ||
366 | + | ||
367 | + (options, args) = parser.parse_args() | ||
368 | + if len(args) > 0: | ||
369 | + parser.error("Unexpected positional arguments given.") | ||
370 | + sys.exit(1) | ||
371 | + | ||
372 | + print('-------------------------------- PARAMETERS --------------------------------') | ||
373 | + print("Path of test and training data sets: " + options.inputPath) | ||
374 | + print("Path of outputs: " + options.outputPath) | ||
375 | + print("File with training data set: " + str(options.trainingFile)) | ||
376 | + print("File with test data set: " + str(options.testFile)) | ||
377 | + print("reportName: " + str(options.reportName)) | ||
378 | + print("Exclude stop words: " + str(options.excludeStopWords)) | ||
379 | + print("Levels: " + "S1: " + str(options.S1) + "S2: " + str(options.S2) + "S3: " + str(options.S3) + "S4: " + str(options.S4)) | ||
380 | + print("Run variant: " + str(options.variant)) | ||
381 | + print("Number of rules on report file: " + str(options.nrules)) | ||
382 | + | ||
383 | + symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', | ||
384 | + '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...'] | ||
385 | + print("Exclude symbols: " + str(options.excludeSymbols)) | ||
386 | + | ||
387 | + print('-------------------------------- PROCESSING --------------------------------') | ||
388 | + print('Reading corpus...') | ||
389 | + t0 = time() | ||
390 | + | ||
391 | + sentencesTrainingData = [] | ||
392 | + sentencesTestData = [] | ||
393 | + | ||
394 | + stopwords = [word for word in stopwords.words('english')] | ||
395 | + | ||
396 | + with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile: | ||
397 | + for line in iFile.readlines(): | ||
398 | + listLine = [] | ||
399 | + line = line.strip('\n') | ||
400 | + for token in line.split(): | ||
401 | + if options.excludeStopWords: | ||
402 | + listToken = token.split('|') | ||
403 | + lemma = listToken[1] | ||
404 | + if lemma in stopwords: | ||
405 | + continue | ||
406 | + if options.excludeSymbols: | ||
407 | + listToken = token.split('|') | ||
408 | + lemma = listToken[1] | ||
409 | + if lemma in symbols: | ||
410 | + continue | ||
411 | + listLine.append(token) | ||
412 | + sentencesTrainingData.append(listLine) | ||
413 | + print(" Sentences training data: " + str(len(sentencesTrainingData))) | ||
414 | + | ||
415 | + with open(os.path.join(options.inputPath, options.testFile), "r") as iFile: | ||
416 | + for line in iFile.readlines(): | ||
417 | + listLine = [] | ||
418 | + line = line.strip('\n') | ||
419 | + for token in line.split(): | ||
420 | + if options.excludeStopWords: | ||
421 | + listToken = token.split('|') | ||
422 | + lemma = listToken[1] | ||
423 | + if lemma in stopwords: | ||
424 | + continue | ||
425 | + if options.excludeSymbols: | ||
426 | + listToken = token.split('|') | ||
427 | + lemma = listToken[1] | ||
428 | + if lemma in symbols: | ||
429 | + continue | ||
430 | + listLine.append(token) | ||
431 | + sentencesTestData.append(listLine) | ||
432 | + print(" Sentences test data: " + str(len(sentencesTestData))) | ||
433 | + | ||
434 | + print("Reading corpus done in: %fs" % (time() - t0)) | ||
435 | + | ||
436 | + print('-------------------------------- FEATURES --------------------------------') | ||
437 | + | ||
438 | + Dtraining = sent2features(sentencesTrainingData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2] | ||
439 | + Dtest = sent2features(sentencesTestData[0], options.S1, options.S2, options.S3, options.S4, int(options.variant))[2] | ||
440 | + print('-------------------------- Features Training ---------------------------') | ||
441 | + print(DF(list(Dtraining.items()))) | ||
442 | + print('--------------------------- Features Test -----------------------------') | ||
443 | + print(DF(list(Dtest.items()))) | ||
444 | + | ||
445 | + t0 = time() | ||
446 | + | ||
447 | + X_train = [sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesTrainingData] | ||
448 | + y_train = [sent2labels(s) for s in sentencesTrainingData] | ||
449 | + | ||
450 | + X_test = [sent2features(s, options.S1, options.S2, options.S3, options.S4, options.variant) for s in sentencesTestData] | ||
451 | + # print X_test | ||
452 | + y_test = [sent2labels(s) for s in sentencesTestData] | ||
453 | + | ||
454 | + ''' | ||
455 | + Fixed parameters | ||
456 | + crf = sklearn_crfsuite.CRF( | ||
457 | + algorithm='lbfgs', | ||
458 | + c1=0.1, | ||
459 | + c2=0.1, | ||
460 | + max_iterations=100, | ||
461 | + all_possible_transitions=True | ||
462 | + ) | ||
463 | + ''' | ||
464 | + # Hyperparameter Optimization | ||
465 | + crf = sklearn_crfsuite.CRF( | ||
466 | + algorithm='lbfgs', | ||
467 | + max_iterations=100, | ||
468 | + all_possible_transitions=True | ||
469 | + ) | ||
470 | + params_space = { | ||
471 | + 'c1': scipy.stats.expon(scale=0.5), | ||
472 | + 'c2': scipy.stats.expon(scale=0.05), | ||
473 | + } | ||
474 | + | ||
475 | + # Original: labels = list(crf.classes_) | ||
476 | + # Original: labels.remove('O') | ||
477 | + labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Strain', 'Substrain', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Air', 'Vess', 'pH']) | ||
478 | + | ||
479 | + # use the same metric for evaluation | ||
480 | + f1_scorer = make_scorer(metrics.flat_f1_score, average='weighted', labels=labels) | ||
481 | + | ||
482 | + # search | ||
483 | + rs = RandomizedSearchCV(crf, params_space, | ||
484 | + cv=5, | ||
485 | + verbose=3, | ||
486 | + n_jobs=-1, | ||
487 | + n_iter=100, | ||
488 | + scoring=f1_scorer, | ||
489 | + random_state=42) | ||
490 | + | ||
491 | + rs.fit(X_train, y_train) | ||
492 | + | ||
493 | + # Fixed parameters | ||
494 | + # crf.fit(X_train, y_train) | ||
495 | + | ||
496 | + # Best hyperparameters | ||
497 | + # crf = rs.best_estimator_ | ||
498 | + | ||
499 | + nameReport = str(options.reportName) + '_v'+ str(options.variant) + '.txt' | ||
500 | + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile: | ||
501 | + oFile.write("********** TRAINING AND TESTING REPORT **********\n") | ||
502 | + oFile.write("Training file: " + options.trainingFile + '\n') | ||
503 | + oFile.write('\n') | ||
504 | + oFile.write('best params:' + str(rs.best_params_) + '\n') | ||
505 | + oFile.write('best CV score:' + str(rs.best_score_) + '\n') | ||
506 | + oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000)) | ||
507 | + | ||
508 | + print("Training done in: %fs" % (time() - t0)) | ||
509 | + t0 = time() | ||
510 | + | ||
511 | + # Update best crf | ||
512 | + crf = rs.best_estimator_ | ||
513 | + | ||
514 | + # Saving model | ||
515 | + print(" Saving training model...") | ||
516 | + t1 = time() | ||
517 | + nameModel = 'model_' + str(options.reportName) + '_v'+ str(options.variant) + '_S1_' + str(options.S1) + '_S2_' + str(options.S2) + '_S3_' + str(options.S3) + '_S4_' + str(options.S4) + '_' + str(options.reportName) + '_v' + str(options.variant) +'.mod' | ||
518 | + joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel)) | ||
519 | + print(" Saving training model done in: %fs" % (time() - t1)) | ||
520 | + | ||
521 | + # Evaluation against test data | ||
522 | + y_pred = crf.predict(X_test) | ||
523 | + print("*********************************") | ||
524 | + print("Prediction done in: %fs" % (time() - t0)) | ||
525 | + | ||
526 | + with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile: | ||
527 | + oFile.write('\n') | ||
528 | + oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels))) | ||
529 | + oFile.write('\n') | ||
530 | + # labels = list(crf.classes_) | ||
531 | + sorted_labels = sorted( | ||
532 | + labels, | ||
533 | + key=lambda name: (name[1:], name[0]) | ||
534 | + ) | ||
535 | + oFile.write(metrics.flat_classification_report( y_test, y_pred, labels=sorted_labels, digits=3)) | ||
536 | + oFile.write('\n') | ||
537 | + | ||
538 | + oFile.write("\nTop likely transitions:\n") | ||
539 | + print_transitions(Counter(crf.transition_features_).most_common(options.nrules), oFile) | ||
540 | + oFile.write('\n') | ||
541 | + | ||
542 | + oFile.write("\nTop unlikely transitions:\n") | ||
543 | + print_transitions(Counter(crf.transition_features_).most_common()[-options.nrules:], oFile) | ||
544 | + oFile.write('\n') | ||
545 | + | ||
546 | + oFile.write("\nTop positive:\n") | ||
547 | + print_state_features(Counter(crf.state_features_).most_common(options.nrules), oFile) | ||
548 | + oFile.write('\n') | ||
549 | + | ||
550 | + oFile.write("\nTop negative:\n") | ||
551 | + print_state_features(Counter(crf.state_features_).most_common()[-options.nrules:], oFile) | ||
552 | + oFile.write('\n') | ||
553 | + |
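training_validation_v14.0.1.py builds one feature dictionary per pipe-joined token (word|lemma|POS|NER|tag), optionally enriched by the S1-S4 feature levels, and then tunes the CRF's c1/c2 regularization with RandomizedSearchCV. A small usage sketch of the feature-extraction side, assuming the module is importable under the name used in tagging.py (the sentence below is illustrative, not taken from the corpus):

import training_validation_v14 as training

sent = ['acetate|acetate|NN|O|Supp', 'and|and|CC|O|O', 'glucose|glucose|NN|O|Supp']
X = training.sent2features(sent, True, True, False, False, 10)   # S1 + S2, variant 10
y = training.sent2labels(sent)                                    # ['Supp', 'O', 'Supp']
print(sorted(X[0].keys()), y)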
This diff could not be displayed because it is too large.
1 | +-------------------------------- PARAMETERS -------------------------------- | ||
2 | +Path to read input files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/input/ | ||
3 | +Mode name: model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10 | ||
4 | +Model path: /home/egaytan/automatic-extraction-growth-conditions/CRF/models | ||
5 | +Path to place output files: /home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/ | ||
6 | +Filtering stop words: False | ||
7 | +Levels: S1: FalseS2: FalseS3: FalseS4: False | ||
8 | +Run variant: None | ||
9 | +Filtering symbols ['.', ',', ':', ';', '?', '!', "'", '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{', '}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']: False | ||
10 | +-------------------------------- PROCESSING -------------------------------- | ||
11 | +Reading CRF model... | ||
12 | +Reading CRF model done in: 0.008342s | ||
13 | +Processing corpus... | ||
14 | +Preprocessing file...annot-input_bg_v3.txt | ||
15 | +Sentences input data: 14716 | ||
16 | +Predicting tags with model | ||
17 | +Prediction done in: 0.983480s | ||
18 | +Tagging file | ||
19 | + 0 1 | ||
20 | +0 <Gtype> antibody : Flag <Gtype/> Gtype | ||
21 | +1 <Gversion> ChIP-Seq <Gversion/> Gversion | ||
22 | +2 Cultures of Caulobacter -LRB- TLS1631-TLS1633 ... Gtype | ||
23 | +3 <Gtype> developmental stage : mixed population... Gtype | ||
24 | +4 DNA was isolated using the Qiagen Cell Lysis a... | ||
25 | +5 Escherichia coli | ||
26 | +6 Escherichia coli AB1157 | ||
27 | +7 For analysis of ChIP-seq data , Hiseq 2500 Ill... | ||
28 | +8 For analysis of IDAP-seq data , Hiseq 2500 Ill... Gtype | ||
29 | +9 Genome _ build : NC _ 000913.3 | ||
30 | +10 Genome _ build : NC _ 011916.1 | ||
31 | +11 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype | ||
32 | +12 <Gtype> genotype : AB1157 ybbD : : parS scramb... Gtype | ||
33 | +13 <Gtype> genotype : AB1157 ybbD : : parS site 1... Gtype | ||
34 | +14 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype | ||
35 | +15 <Gtype> genotype : AB1157 ybbD : : parS site 2... Gtype | ||
36 | +16 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype | ||
37 | +17 <Gtype> genotype : AB1157 ybbD : : parS site 3... Gtype | ||
38 | +18 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype | ||
39 | +19 <Gtype> genotype : AB1157 ybbD : : parS site 4... Gtype | ||
40 | +20 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype | ||
41 | +21 <Gtype> genotype : AB1157 ybbD : : parS site 5... Gtype | ||
42 | +22 <Gtype> genotype : AB1157 ybbD : : parS site 6... Gtype | ||
43 | +23 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype | ||
44 | +24 <Gtype> genotype : AB1157 ybbD : : parS site 7... Gtype | ||
45 | +25 Hiseq 2500 Illumina short reads -LRB- 50 bp -R... | ||
46 | +26 LELab _ ChIP _ seq _ TLS1637 _ anti _ FLAG | ||
47 | +27 LELab _ ChIP _ seq _ TLS1638 _ anti _ FLAG | ||
48 | +28 LELab _ ChIP _ seq _ TLS1639 _ anti _ FLAG | ||
49 | +29 LELab _ ChIP _ seq _ TLS1640 _ anti _ FLAG | ||
50 | +... ... ... | ||
51 | +14686 <Phase> ESBL019 Coliform <Phase/> Phase | ||
52 | +14687 <Gtype> ESBL019 Filamented <Gtype/> Gtype | ||
53 | +14688 ESBL019 Reverted | ||
54 | +14689 <Phase> ESBL019 Transition <Phase/> Phase | ||
55 | +14690 Escherichia coli | ||
56 | +14691 Four morphologic states of ESBL019 were used d... | ||
57 | +14692 <Gtype> morphology : Coliform <Gtype/> Gtype | ||
58 | +14693 <Gtype> morphology : Filamented <Gtype/> Gtype | ||
59 | +14694 morphology : Reverted -LRB- reverted back from... | ||
60 | +14695 morphology : Transition -LRB- from Coli into F... | ||
61 | +14696 RNA isolation was performed using an RNeasy mi... | ||
62 | +14697 <Gtype> strain : beta-lactamase -LRB- ESBL -RR... Gtype | ||
63 | +14698 The E. coli isolate ESBL019 was originally iso... | ||
64 | +14699 Escherichia coli | ||
65 | +14700 lexA 10 ' after UV vs. 0 ' , MG1655 | ||
66 | +14701 <Gtype> lexA 10 min after UV treatment , 25 ug... Gtype | ||
67 | +14702 lexA 20 ' after NOuv vs. 0 ' , MG1655 | ||
68 | +14703 lexA 20 ' after UV vs. 0 ' , MG1655 | ||
69 | +14704 lexA 20 min after NOuv , 25 ug total RNA , 2 u... | ||
70 | +14705 <Gtype> lexA 20 min after UV treatment , 25 ug... Gtype | ||
71 | +14706 lexA 40 ' after UV vs. 0 ' , MG1655 | ||
72 | +14707 <Gtype> lexA 40 min after UV treatment , 25 ug... Gtype | ||
73 | +14708 lexA 5 ' after UV vs. 0 ' , MG1655 | ||
74 | +14709 <Gtype> lexA 5 min after UV treatment , 25 ug ... Gtype | ||
75 | +14710 lexA 60 ' after NOuv vs. 0 ' , MG1655 | ||
76 | +14711 lexA 60 ' after UV vs. 0 ' , MG1655 | ||
77 | +14712 lexA 60 min after NOuv , 25 ug total RNA , 2 u... | ||
78 | +14713 <Gtype> lexA 60 min after UV treatment , 25 ug... Gtype | ||
79 | +14714 lexA vs. wt , before UV treatment , MG1655 | ||
80 | +14715 untreated cells , 25 ug total RNA | ||
81 | + | ||
82 | +[14716 rows x 2 columns] |