Final version for abstracts

iarroyof
Commit a5eecdbce5dda28d8c29fe188fcee125a2d25451 a5eecdbc 1 parent 3493a6f2
Showing 20 changed files with 233 additions and 57 deletions
filter_abstracts_binClass.py
filter_abstracts_oneClass.py
model_binClass/svd_model.pkl
model_binClass/svm_model.paper.pkl
model_binClass/svm_model.pkl
model_binClass/tfidf_model.paper.pkl
model_binClass/tfidf_model.pkl
model_oneClass/svd_model.pkl
model_oneClass/svm_model.pkl
model_oneClass/tfidf_model.pkl
model_params_binClass.conf
model_params_oneClass.conf
oneClass_trainUseful_out/useful.out
oneClass_trainUseful_out/useless.out
oneClass_trainUseless_out/useful.out
oneClass_trainUseless_out/useless.out
outRNAseq_binClass/useful.out
outRNAseq_binClass/useless.out
outRNAseq_oneClass/useful.out
outRNAseq_oneClass/useless.out
--- a/filter_abstracts_binClass.py
View file @a5eecdb
+++ b/filter_abstracts_binClass.py
View file @a5eecdb
- #from pdb import set_trace as st
- from sklearn.cross_validation import train_test_split as splitt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD
- from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import GridSearchCV
 from sklearn import metrics
 from sklearn.svm import SVC
@@ -12,9 +9,6 @@ import csv
 import os
 from sklearn.externals import joblib
 from time import time
- from scipy.stats import randint as sp_randint
- from scipy.stats import expon
- from sklearn.preprocessing import label_binarize
 
 
 def get_abstracts(file_name, label):
@@ -75,22 +69,21 @@ parser = argparse.ArgumentParser(
 parser.add_argument("--input", help="Input file containing the abstracts to"
                                 "be predited.")
 parser.add_argument("--classA", help="Input file containing the abstracts of"
-                                 "class A to be learned.")
+                                 " class useless to be learned.")
 parser.add_argument("--classB", help="Input file containing the abstracts of"
-                                 "class B to be learned.")
+                                 " class USEFUL to be learned.")
 parser.add_argument("--out", help="Path to the output directory "
                      "(default='./filter_output')", default="filter_output")
 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
-         "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
+         "(default='./model_binClass/svm_model.pkl')", default="model_binClass/svm_model.pkl")
 
 args = parser.parse_args()
 
 labels = {0: 'useless', 1: 'useful'}
 vectorizer = TfidfVectorizer(binary=True)
- print(vectorizer)
 
 if args.classA and args.classB and not args.input:
-     f0 = open("model_params.conf")
+     f0 = open("model_params_binClass.conf")
     n_iter_search = 10
     params = [p for p in csv.DictReader(f0)]
     f0.close()
@@ -115,38 +108,38 @@ if args.classA and args.classB and not args.input:
     svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
     svd_model = svd.fit(X)
     X = svd_model.transform(X)
-     #y = [x['topic'] for x in abstracs]
     y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]    
 
-     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
- 
-     clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
-     clf = GridSearchCV(clf, cv=3,
-         param_grid=model_params,
-     # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+     clf = SVC()
+     clf = GridSearchCV(clf, cv=3, param_grid=model_params,
                                  n_jobs=-1, scoring='f1')
     start = time()
     clf.fit(X, y)
 
-     #clf.fit(X_train, y_train)
     print("GridSearch took %.2f seconds for %d candidates"
       " parameter settings." % ((time() - start), n_iter_search))
 
+     print()
+     print("The best model parameters:")
+     print(vectorizer)
+     print(svd)
     print(clf.best_estimator_)
     print()
+     print("The best F1 score:")
     print(clf.best_score_)
-     #print(metrics.f1_score(clf.predict(X_test), y_test))
 
-     #joblib.dump(clf, 'model/svm_model.pkl')
-     joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-     joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
-     joblib.dump(svd_model, 'model/svd_model.pkl')
+     joblib.dump(clf.best_estimator_, 'model_binClass/svm_model.pkl')
+     joblib.dump(tfidf_model, 'model_binClass/tfidf_model.pkl')
+     joblib.dump(svd_model, 'model_binClass/svd_model.pkl')
 
 else:
 
     clf = joblib.load(args.svcmodel)
-     vectorizer = joblib.load('model/tfidf_model.pkl')
-     svd = joblib.load('model/svd_model.pkl')
+     vectorizer = joblib.load('model_binClass/tfidf_model.pkl')
+     svd = joblib.load('model_binClass/svd_model.pkl')
+     print(vectorizer)
+     print(svd)
+     print(clf)
     abstracs = get_abstracts(file_name=args.input, label='unknown')
     X = vectorizer.transform([x['body'] for x in abstracs])
     X = svd.transform(X)
@@ -162,3 +155,5 @@ else:
                 f0.write("%d\t%s\n" % (a['pmid'], a['body']))
             elif c == 1:
                 f1.write("%d\t%s\n" % (a['pmid'], a['body']))
+ 
+     print ("FINISHED!!")
--- a/filter_abstracts_oneClass.py
View file @a5eecdb
+++ b/filter_abstracts_oneClass.py
View file @a5eecdb
- #from pdb import set_trace as st
- from sklearn.cross_validation import train_test_split as splitt
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.decomposition import TruncatedSVD
- from sklearn.model_selection import RandomizedSearchCV
 from sklearn.model_selection import GridSearchCV
 from sklearn import metrics
- from sklearn.svm import SVC
+ 
+ from sklearn.svm import OneClassSVM
 import numpy as np
 import argparse
 import csv
 import os
 from sklearn.externals import joblib
 from time import time
- from scipy.stats import randint as sp_randint
- from scipy.stats import expon
- from sklearn.preprocessing import label_binarize
 
 
 def get_abstracts(file_name, label):
@@ -75,22 +70,22 @@ parser = argparse.ArgumentParser(
 parser.add_argument("--input", help="Input file containing the abstracts to"
                                 "be predited.")
 parser.add_argument("--classA", help="Input file containing the abstracts of"
-                                 "class A to be learned.")
+                                 " class USEFUL to be learned.")
 parser.add_argument("--classB", help="Input file containing the abstracts of"
-                                 "class B to be learned.")
+                                 " class useless to be learned.")
 parser.add_argument("--out", help="Path to the output directory "
                      "(default='./filter_output')", default="filter_output")
 parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
-         "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
+         "(default='./model_oneClass/svm_model.pkl')", default="model_oneClass/svm_model.pkl")
 
 args = parser.parse_args()
 
 labels = {0: 'useless', 1: 'useful'}
 vectorizer = TfidfVectorizer(binary=True)
- print(vectorizer)
+ 
 
 if args.classA and args.classB and not args.input:
-     f0 = open("model_params.conf")
+     f0 = open("model_params_oneClass.conf")
     n_iter_search = 10
     params = [p for p in csv.DictReader(f0)]
     f0.close()
@@ -115,50 +110,52 @@ if args.classA and args.classB and not args.input:
     svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
     svd_model = svd.fit(X)
     X = svd_model.transform(X)
-     #y = [x['topic'] for x in abstracs]
-     y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]    
- 
-     #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
+     y = [-1 if x['topic'] == 'useless' else 1 for x in abstracs]    
 
-     clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
-     clf = GridSearchCV(clf, cv=3,
-         param_grid=model_params,
-     # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+     clf = OneClassSVM()
+     clf = GridSearchCV(clf, cv=3, param_grid=model_params,
                                  n_jobs=-1, scoring='f1')
     start = time()
     clf.fit(X, y)
 
-     #clf.fit(X_train, y_train)
     print("GridSearch took %.2f seconds for %d candidates"
       " parameter settings." % ((time() - start), n_iter_search))
 
+     print()
+     print("The best model parameters:")
+     print(vectorizer)
+     print(svd)
     print(clf.best_estimator_)
     print()
+     print("The best F1 score:")
     print(clf.best_score_)
-     #print(metrics.f1_score(clf.predict(X_test), y_test))
 
-     #joblib.dump(clf, 'model/svm_model.pkl')
-     joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
-     joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
-     joblib.dump(svd_model, 'model/svd_model.pkl')
+     joblib.dump(clf.best_estimator_, 'model_oneClass/svm_model.pkl')
+     joblib.dump(tfidf_model, 'model_oneClass/tfidf_model.pkl')
+     joblib.dump(svd_model, 'model_oneClass/svd_model.pkl')
 
 else:
 
     clf = joblib.load(args.svcmodel)
-     vectorizer = joblib.load('model/tfidf_model.pkl')
-     svd = joblib.load('model/svd_model.pkl')
+     vectorizer = joblib.load('model_oneClass/tfidf_model.pkl')
+     svd = joblib.load('model_oneClass/svd_model.pkl')
+     print(vectorizer)
+     print(svd)
+     print(clf)
     abstracs = get_abstracts(file_name=args.input, label='unknown')
     X = vectorizer.transform([x['body'] for x in abstracs])
     X = svd.transform(X)
     classes = clf.predict(X)
- 
+     
     if not os.path.exists(args.out):
         os.makedirs(args.out)
     # Writing predictions to output files
     with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
                     open(args.out + "/" + labels[1] + ".out", 'w') as f1:
         for c, a in zip(classes, abstracs):
-             if c == 0:
+             if c == 1:
                 f0.write("%d\t%s\n" % (a['pmid'], a['body']))
-             elif c == 1:
+             elif c == -1:
                 f1.write("%d\t%s\n" % (a['pmid'], a['body']))
+ 
+     print("FINISHED!!")
--- a/model_binClass/svd_model.pkl 0 → 100644
View file @a5eecdb
+++ b/model_binClass/svd_model.pkl 0 → 100644
View file @a5eecdb
--- a/model_binClass/svm_model.paper.pkl 0 → 100644
View file @a5eecdb
+++ b/model_binClass/svm_model.paper.pkl 0 → 100644
View file @a5eecdb
--- a/model_binClass/svm_model.pkl 0 → 100644
View file @a5eecdb
+++ b/model_binClass/svm_model.pkl 0 → 100644
View file @a5eecdb
--- a/model_binClass/tfidf_model.paper.pkl 0 → 100644
View file @a5eecdb
+++ b/model_binClass/tfidf_model.paper.pkl 0 → 100644
View file @a5eecdb
--- a/model_binClass/tfidf_model.pkl 0 → 100644
View file @a5eecdb
+++ b/model_binClass/tfidf_model.pkl 0 → 100644
View file @a5eecdb
--- a/model_oneClass/svd_model.pkl 0 → 100644
View file @a5eecdb
+++ b/model_oneClass/svd_model.pkl 0 → 100644
View file @a5eecdb
--- a/model_oneClass/svm_model.pkl 0 → 100644
View file @a5eecdb
+++ b/model_oneClass/svm_model.pkl 0 → 100644
View file @a5eecdb
--- a/model_oneClass/tfidf_model.pkl 0 → 100644
View file @a5eecdb
+++ b/model_oneClass/tfidf_model.pkl 0 → 100644
View file @a5eecdb
--- a/model_params_binClass.conf 0 → 100644
View file @a5eecdb
+++ b/model_params_binClass.conf 0 → 100644
View file @a5eecdb
+ kernel,degree,coef0,C,gamma
+ linear,1,0.5,100,0.0
+ linear,1,0.5,10,0.0
+ linear,1,0.5,50,0.0
+ linear,1,0.5,100,0.0
+ linear,1,0.5,5,0.0
+ linear,1,0.5,150,0.0
+ linear,1,0.5,200,0.0
+ linear,1,0.5,300,0.0
+ linear,1,0.5,400,0.0
+ linear,1,0.5,1.0,0.0
+ linear,1,0.5,5.0,0.0
--- a/model_params_oneClass.conf 0 → 100644
View file @a5eecdb
+++ b/model_params_oneClass.conf 0 → 100644
View file @a5eecdb
+ kernel,degree,coef0,nu,gamma
+ linear,1,0.5,1.0,0.0
+ linear,1,0.5,0.9,0.0
+ linear,1,0.5,0.8,0.0
+ linear,1,0.5,0.7,0.0
+ linear,1,0.5,0.6,0.0
+ linear,1,0.5,0.5,0.0
+ linear,1,0.5,0.4,0.0
+ linear,1,0.5,0.3,0.0
+ linear,1,0.5,0.2,0.0
+ linear,1,0.5,0.1,0.0
+ rbf,1,0.5,1.0,2.0
+ rbf,1,0.5,0.9,0.0001
+ rbf,1,0.5,0.8,0.0001
+ rbf,1,0.5,0.7,0.0001
+ rbf,1,0.5,0.6,0.001
+ rbf,1,0.5,0.5,0.001
+ rbf,1,0.5,0.4,0.001
+ rbf,1,0.5,0.7,0.0001
+ rbf,1,0.5,0.4,0.0001
+ rbf,1,0.5,0.5,0.0001
--- a/oneClass_trainUseful_out/useful.out 0 → 100644
View file @a5eecdb
+++ b/oneClass_trainUseful_out/useful.out 0 → 100644
View file @a5eecdb
+ 29476659	 In Escherichia coli, one Sigma factor recognizes the majority of promoters, and six "alternative" Sigma factors recognize specific subsets of promoters. The alternative Sigma factor FliA (σ28 ) recognizes promoters upstream of many flagellar genes. We previously showed that most E. coli FliA binding sites are located inside genes. However, it was unclear whether these intragenic binding sites represent active promoters. Here, we construct and assay transcriptional promoter-lacZ fusions for all 52 putative FliA promoters previously identified by ChIP-seq. These experiments, coupled with integrative analysis of published genome-scale transcriptional datasets, strongly suggest that most intragenic FliA binding sites are active promoters that transcribe highly unstable RNAs. Additionally, we show that widespread intragenic FliA-dependent transcription may be a conserved phenomenon, but that specific promoters are not themselves conserved. We conclude that intragenic FliA-dependent promoters and the resulting RNAs are unlikely to have important regulatory functions. Nonetheless, one intragenic FliA promoter is broadly conserved, and constrains evolution of the overlapping protein-coding gene. Thus, our data indicate that intragenic regulatory elements can influence bacterial protein evolution, and suggest that the impact of intragenic regulatory sequences on genome evolution should be considered more broadly. This article is protected by copyright. All rights reserved. 
+ 29468196	 The RNA polymerase (RNAP) of Escherichia coli K-12 is a complex enzyme consisting of the core enzyme with the subunit structure α2ββ'ω and one of the σ subunits with promoter recognition properties. The smallest subunit, omega (the rpoZ gene  product), participates in subunit assembly by supporting the folding of the largest subunit, β', but its functional role remains unsolved except for its involvement in ppGpp binding and stringent response. As an initial approach for elucidation of its functional role, we performed in this study ChIP-chip (chromatin immunoprecipitation with microarray technology) analysis of wild-type  and rpoZ-defective mutant strains. The altered distribution of RpoZ-defective RNAP was identified mostly within open reading frames, in particular, of the genes inside prophages. For the genes that exhibited increased or decreased distribution of RpoZ-defective RNAP, the level of transcripts increased or decreased, respectively, as detected by reverse transcription-quantitative PCR (qRT-PCR). In parallel, we analyzed, using genomic SELEX (systemic evolution of ligands by exponential enrichment), the distribution of constitutive promoters that are recognized by RNAP RpoD holoenzyme alone and of general silencer H-NS within prophages. Since all 10 prophages in E. coli K-12 carry only a small number of promoters, the altered occupancy of RpoZ-defective RNAP and of transcripts might represent transcription initiated from as-yet-unidentified host promoters. The genes that exhibited transcription enhanced by RpoZ-defective RNAP are located in the regions of low-level H-NS binding. By using phenotype microarray (PM) assay, alterations of some phenotypes were detected for the rpoZ-deleted mutant, indicating the involvement of RpoZ in regulation of some genes. Possible mechanisms of altered distribution of RNAP inside prophages are discussed. 
IMPORTANCE The 91-amino-acid-residue small-subunit omega (the rpoZ gene product) of Escherichia coli RNA polymerase plays a structural role in the formation of RNA polymerase (RNAP) as a chaperone in folding the largest subunit  (β', of 1,407 residues in length), but except for binding of the stringent signal ppGpp, little is known of its role in the control of RNAP function. After analysis of genomewide distribution of wild-type and RpoZ-defective RNAP by the ChIP-chip method, we found alteration of the RpoZ-defective RNAP inside open reading frames, in particular, of the genes within prophages. For a set of the genes that exhibited altered occupancy of the RpoZ-defective RNAP, transcription  was found to be altered as observed by qRT-PCR assay. All the observations here described indicate the involvement of RpoZ in recognition of some of the prophage genes. This study advances understanding of not only the regulatory role of omega subunit in the functions of RNAP but also the regulatory interplay between prophages and the host E. coli for adjustment of cellular physiology to a variety of environments in nature. 
+ 29463657	 Transposon-directed insertion site sequencing (TraDIS) is a high-throughput method coupling transposon mutagenesis with short-fragment DNA sequencing. It is  commonly used to identify essential genes. Single gene deletion libraries are considered the gold standard for identifying essential genes. Currently, the TraDIS method has not been benchmarked against such libraries, and therefore, it  remains unclear whether the two methodologies are comparable. To address this, a  high-density transposon library was constructed in Escherichia coli K-12. Essential genes predicted from sequencing of this library were compared to existing essential gene databases. To decrease false-positive identification of essential genes, statistical data analysis included corrections for both gene length and genome length. Through this analysis, new essential genes and genes previously incorrectly designated essential were identified. We show that manual  analysis of TraDIS data reveals novel features that would not have been detected  by statistical analysis alone. Examples include short essential regions within genes, orientation-dependent effects, and fine-resolution identification of genome and protein features. Recognition of these insertion profiles in transposon mutagenesis data sets will assist genome annotation of less well characterized genomes and provides new insights into bacterial physiology and biochemistry.IMPORTANCE Incentives to define lists of genes that are essential for bacterial survival include the identification of potential targets for antibacterial drug development, genes required for rapid growth for exploitation  in biotechnology, and discovery of new biochemical pathways. To identify essential genes in Escherichia coli, we constructed a transposon mutant library of unprecedented density. Initial automated analysis of the resulting data revealed many discrepancies compared to the literature. 
We now report more extensive statistical analysis supported by both literature searches and detailed inspection of high-density TraDIS sequencing data for each putative essential gene for the E. coli model laboratory organism. This paper is important because it provides a better understanding of the essential genes of E. coli, reveals the limitations of relying on automated analysis alone, and provides a new standard for the analysis of TraDIS data. 
+ 29433444	 BACKGROUND: Due to the DNA triplet code, it is possible that the sequences of two or more protein-coding genes overlap to a large degree. However, such non-trivial overlaps are usually excluded by genome annotation pipelines and, thus, only a few overlapping gene pairs have been described in bacteria. In contrast, transcriptome and translatome sequencing reveals many signals originated from the antisense strand of annotated genes, of which we analyzed an example gene pair in more detail. RESULTS: A small open reading frame of Escherichia coli O157:H7 strain Sakai (EHEC), designated laoB (L-arginine responsive overlapping gene), is embedded in  reading frame -2 in the antisense strand of ECs5115, encoding a CadC-like transcriptional regulator. This overlapping gene shows evidence of transcription  and translation in Luria-Bertani (LB) and brain-heart infusion (BHI) medium based on RNA sequencing (RNAseq) and ribosomal-footprint sequencing (RIBOseq). The transcriptional start site is 289 base pairs (bp) upstream of the start codon and transcription termination is 155 bp downstream of the stop codon. Overexpression  of LaoB fused to an enhanced green fluorescent protein (EGFP) reporter was possible. The sequence upstream of the transcriptional start site displayed strong promoter activity under different conditions, whereas promoter activity was significantly decreased in the presence of L-arginine. A strand-specific translationally arrested mutant of laoB provided a significant growth advantage in competitive growth experiments in the presence of L-arginine compared to the wild type, which returned to wild type level after complementation of laoB in trans. A phylostratigraphic analysis indicated that the novel gene is restricted  to the Escherichia/Shigella clade and might have originated recently by overprinting leading to the expression of part of the antisense strand of ECs5115. 
CONCLUSIONS: Here, we present evidence of a novel small protein-coding gene laoB  encoded in the antisense frame -2 of the annotated gene ECs5115. Clearly, laoB is evolutionarily young and it originated in the Escherichia/Shigella clade by overprinting, a process which may cause the de novo evolution of bacterial genes  like laoB. 
+ 29394395	 Two major transcriptional regulators of carbon metabolism in bacteria are Cra and CRP. CRP is considered to be the main mediator of catabolite repression. Unlike for CRP, in vivo DNA binding information of Cra is scarce. Here we generate and integrate ChIP-exo and RNA-seq data to identify 39 binding sites for Cra and 97 regulon genes that are regulated by Cra in Escherichia coli. An integrated metabolic-regulatory network was formed by including experimentally-derived regulatory information and a genome-scale metabolic network reconstruction. Applying analysis methods of systems biology to this integrated network showed that Cra enables optimal bacterial growth on poor carbon sources by redirecting and repressing glycolysis flux, by activating the glyoxylate shunt pathway, and by activating the respiratory pathway. In these regulatory mechanisms, the overriding regulatory activity of Cra over CRP is fundamental. Thus, elucidation  of interacting transcriptional regulation of core carbon metabolism in bacteria by two key transcription factors was possible by combining genome-wide experimental measurement and simulation with a genome-scale metabolic model. 
+ 29150605	 CsrA is a post-transcriptional regulatory protein that is widely distributed among bacteria. This protein influences bacterial lifestyle decisions by binding  to the 5' untranslated and/or early coding regions of mRNA targets, causing changes in translation initiation, RNA stability, and/or transcription elongation. Here, we assess the contribution of CsrA to gene expression in Escherichia coli on a global scale. UV crosslinking immunoprecipitation and sequencing (CLIP-seq) identify RNAs that interact directly with CsrA in vivo, while ribosome profiling and RNA-seq uncover the impact of CsrA on translation, RNA abundance, and RNA stability. This combination of approaches reveals unprecedented detail about the regulatory role of CsrA, including novel binding targets and physiological roles, such as in envelope function and iron homeostasis. Our findings highlight the integration of CsrA throughout the E. coli regulatory network, where it orchestrates vast effects on gene expression. 
+ 28911122	 ChIP-exo/nexus experiments rely on innovative modifications of the commonly used  ChIP-seq protocol for high resolution mapping of transcription factor binding sites. Although many aspects of the ChIP-exo data analysis are similar to those of ChIP-seq, these high throughput experiments pose a number of unique quality control and analysis challenges. We develop a novel statistical quality control pipeline and accompanying R/Bioconductor package, ChIPexoQual, to enable exploration and analysis of ChIP-exo and related experiments. ChIPexoQual evaluates a number of key issues including strand imbalance, library complexity,  and signal enrichment of data. Assessment of these features are facilitated through diagnostic plots and summary statistics computed over regions of the genome with varying levels of coverage. We evaluated our QC pipeline with both large collections of public ChIP-exo/nexus data and multiple, new ChIP-exo datasets from Escherichia coli. ChIPexoQual analysis of these datasets resulted in guidelines for using these QC metrics across a wide range of sequencing depths and provided further insights for modelling ChIP-exo data. 
+ 28902868	 In the past, short protein-coding genes were often disregarded by genome annotation pipelines. Transcriptome sequencing (RNAseq) signals outside of annotated genes have usually been interpreted to indicate either ncRNA or pervasive transcription. Therefore, in addition to the transcriptome, the translatome (RIBOseq) of the enteric pathogen Escherichia coli O157:H7 strain Sakai was determined at two optimal growth conditions and a severe stress condition combining low temperature and high osmotic pressure. All intergenic open reading frames potentially encoding a protein of ≥ 30 amino acids were investigated with regard to coverage by transcription and translation signals and their translatability expressed by the ribosomal coverage value. This led to discovery of 465 unique, putative novel genes not yet annotated in this E. coli strain, which are evenly distributed over both DNA strands of the genome. For 255 of the novel genes, annotated homologs in other bacteria were found, and a machine-learning algorithm, trained on small protein-coding E. coli genes, predicted that 89% of these translated open reading frames represent bona fide genes. The remaining 210 putative novel genes without annotated homologs were compared to the 255 novel genes with homologs and to 250 short annotated genes of this E. coli strain. All three groups turned out to be similar with respect to their translatability distribution, fractions of differentially regulated genes,  secondary structure composition, and the distribution of evolutionary constraint, suggesting that both novel groups represent legitimate genes. However, the machine-learning algorithm only recognized a small fraction of the 210 genes without annotated homologs. It is possible that these genes represent a novel group of genes, which have unusual features dissimilar to the genes of the machine-learning algorithm training set. 
+ 28842878	 The advent of Chromatin Immunoprecipitation sequencing (ChIP-Seq) has allowed the identification of genomic regions bound by a DNA binding protein in-vivo on a genome-wide scale. The impact of the DNA binding protein on gene expression can be addressed using transcriptome experiments in appropriate genetic settings. Overlaying the above two sources of data enables us to dissect the direct and indirect effects of a DNA binding protein on gene expression. Application of these techniques to Nucleoid Associated Proteins (NAPs) and Global Transcription  Factors (GTFs) has underscored the complex relationship between DNA-protein interactions and gene expression change, highlighting the role of combinatorial control. Here, we demonstrate the usage of ChIP-Seq to infer binding properties and transcriptional effects of NAPs such as Fis and HNS, and the GTF CRP in the model organism Escherichia coli K12 MG1655 (E. coli). 
+ 28489862	 Uropathogenic Escherichia coli (UPEC) is the cause of ~75% of all urinary tract infections (UTIs) and is increasingly associated with multidrug resistance. This  includes UPEC strains from the recently emerged and globally disseminated sequence type 131 (ST131), which is now the dominant fluoroquinolone-resistant UPEC clone worldwide. Most ST131 strains are motile and produce H4-type flagella. Here, we applied a combination of saturated Tn5 mutagenesis and transposon directed insertion site sequencing (TraDIS) as a high throughput genetic screen and identified 30 genes associated with enhanced motility of the reference ST131  strain EC958. This included 12 genes that repress motility of E. coli K-12, four  of which (lrhA, ihfA, ydiV, lrp) were confirmed in EC958. Other genes represented novel factors that impact motility, and we focused our investigation on characterisation of the mprA, hemK and yjeA genes. Mutation of each of these genes in EC958 led to increased transcription of flagellar genes (flhD and fliC), increased expression of the FliC flagellin, enhanced flagella synthesis and a hyper-motile phenotype. Complementation restored all of these properties to wild-type level. We also identified Tn5 insertions in several intergenic regions  (IGRs) on the EC958 chromosome that were associated with enhanced motility; this  included flhDC and EC958_1546. In both of these cases, the Tn5 insertions were associated with increased transcription of the downstream gene(s), which resulted in enhanced motility. The EC958_1546 gene encodes a phage protein with similarity to esterase/deacetylase enzymes involved in the hydrolysis of sialic acid derivatives found in human mucus. We showed that over-expression of EC958_1546 led to enhanced motility of EC958 as well as the UPEC strains CFT073 and UTI89, demonstrating its activity affects the motility of different UPEC strains. 
Overall, this study has identified and characterised a number of novel factors associated with enhanced UPEC motility. 
+ 28245801	 BACKGROUND: While NGS allows rapid global detection of transcripts, it remains difficult to distinguish ncRNAs from short mRNAs. To detect potentially translated RNAs, we developed an improved protocol for bacterial ribosomal footprinting (RIBOseq). This allowed distinguishing ncRNA from mRNA in EHEC. A high ratio of ribosomal footprints per transcript (ribosomal coverage value, RCV) is expected to indicate a translated RNA, while a low RCV should point to a non-translated RNA. RESULTS: Based on their low RCV, 150 novel non-translated EHEC transcripts were identified as putative ncRNAs, representing both antisense and intergenic transcripts, 74 of which had expressed homologs in E. coli MG1655. Bioinformatics analysis predicted statistically significant target regulons for 15 of the intergenic transcripts; experimental analysis revealed 4-fold or higher differential expression of 46 novel ncRNA in different growth media. Out of 329 annotated EHEC ncRNAs, 52 showed an RCV similar to protein-coding genes, of those, 16 had RIBOseq patterns matching annotated genes in other enterobacteriaceae, and 11 seem to possess a Shine-Dalgarno sequence, suggesting  that such ncRNAs may encode small proteins instead of being solely non-coding. To support that the RIBOseq signals are reflecting translation, we tested the ribosomal-footprint covered ORF of ryhB and found a phenotype for the encoded peptide in iron-limiting condition. CONCLUSION: Determination of the RCV is a useful approach for a rapid first-step  differentiation between bacterial ncRNAs and small mRNAs. Further, many known ncRNAs may encode proteins as well. 
+ 28061857	 BACKGROUND: Enteric Escherichia coli survives the highly acidic environment of the stomach through multiple acid resistance (AR) mechanisms. The most effective  system, AR2, decarboxylates externally-derived glutamate to remove cytoplasmic protons and excrete GABA. The first described system, AR1, does not require an external amino acid. Its mechanism has not been determined. The regulation of the multiple AR systems and their coordination with broader cellular metabolism has not been fully explored. RESULTS: We utilized a combination of ChIP-Seq and gene expression analysis to experimentally map the regulatory interactions of four TFs: nac, ntrC, ompR, and  csiR. Our data identified all previously in vivo confirmed direct interactions and revealed several others previously inferred from gene expression data. Our data demonstrate that nac and csiR directly modulate AR, and leads to a regulatory network model in which all four TFs participate in coordinating acid resistance, glutamate metabolism, and nitrogen metabolism. This model predicts a  novel mechanism for AR1 by which the decarboxylation enzymes of AR2 are used with internally derived glutamate. This hypothesis makes several testable predictions  that we confirmed experimentally. CONCLUSIONS: Our data suggest that the regulatory network underlying AR is complex and deeply interconnected with the regulation of GABA and glutamate metabolism, nitrogen metabolism. These connections underlie and experimentally validated model of AR1 in which the decarboxylation enzymes of AR2 are used with  internally derived glutamate. 
+ 27900321	 The regulatory protein, GalR, is known for controlling transcription of genes related to D-galactose metabolism in Escherichia coli. Here, using a combination  of experimental and bioinformatic approaches, we identify novel GalR binding sites upstream of several genes whose function is not directly related to D-galactose metabolism. Moreover, we do not observe regulation of these genes by  GalR under standard growth conditions. Thus, our data indicate a broader regulatory role for GalR, and suggest that regulation by GalR is modulated by other factors. Surprisingly, we detect regulation of 158 transcripts by GalR, with few regulated genes being associated with a nearby GalR binding site. Based  on our earlier observation of long-range interactions between distally bound GalR dimers, we propose that GalR indirectly regulates the transcription of many genes by inducing large-scale restructuring of the chromosome. 
+ 27492737	 Conjugation plays an important role in the horizontal movement of DNA between bacterial species and even genera. Large conjugative plasmids in Gram-negative bacteria are associated with multi-drug resistance and have been implicated in the spread of these phenotypes to pathogenic organisms. A/C plasmids often carry  genes that confer resistance to multiple classes of antibiotics. Recently, transcription factors were characterized that regulate A/C conjugation. In this work, we expanded the regulon of the negative regulator Acr2. We developed an A/C variant, pARK01, by precise removal of resistance genes carried by the plasmid in order to make it more genetically tractable. Using pARK01, we conducted RNA-Seq and ChAP-Seq experiments to characterize the regulon of Acr2, an H-NS-like protein. We found that Acr2 binds several loci on the plasmid. We showed, in vitro, that Acr2 can bind specific promoter regions directly and identify key amino acids which are important for this binding. This study further characterizes Acr2 and suggests its role in modulating gene expression of multiple plasmid and chromosomal loci. 
+ 26911138	 BACKGROUND: Genomes of E. coli, including that of the human pathogen Escherichia  coli O157:H7 (EHEC) EDL933, still harbor undetected protein-coding genes which, apparently, have escaped annotation due to their small size and non-essential function. To find such genes, global gene expression of EHEC EDL933 was examined, using strand-specific RNAseq (transcriptome), ribosomal footprinting (translatome) and mass spectrometry (proteome). RESULTS: Using the above methods, 72 short, non-annotated protein-coding genes were detected. All of these showed signals in the ribosomal footprinting assay indicating mRNA translation. Seven were verified by mass spectrometry. Fifty-seven genes are annotated in other enterobacteriaceae, mainly as hypothetical genes; the remaining 15 genes constitute novel discoveries. In addition, protein structure and function were predicted computationally and compared between EHEC-encoded proteins and 100-times randomly shuffled proteins.  Based on this comparison, 61 of the 72 novel proteins exhibit predicted structural and functional features similar to those of annotated proteins. Many of the novel genes show differential transcription when grown under eleven diverse growth conditions suggesting environmental regulation. Three genes were found to confer a phenotype in previous studies, e.g., decreased cattle colonization. CONCLUSIONS: These findings demonstrate that ribosomal footprinting can be used to detect novel protein coding genes, contributing to the growing body of evidence that hypothetical genes are not annotation artifacts and opening an additional way to study their functionality. All 72 genes are taxonomically restricted and, therefore, appear to have evolved relatively recently de novo. 
+ 26789284	 Bacteria can acquire new traits through horizontal gene transfer. Inappropriate expression of transferred genes, however, can disrupt the physiology of the host  bacteria. To reduce this risk, Escherichia coli expresses the nucleoid-associated protein, H-NS, which preferentially binds to horizontally transferred genes to control their expression. Once expression is optimized, the horizontally transferred genes may actually contribute to E. coli survival in new habitats. Therefore, we investigated whether and how H-NS contributes to this optimization  process. A comparison of H-NS binding profiles on common chromosomal segments of  three E. coli strains belonging to different phylogenetic groups indicated that the positions of H-NS-bound regions have been conserved in E. coli strains. The sequences of the H-NS-bound regions appear to have diverged more so than H-NS-unbound regions only when H-NS-bound regions are located upstream or in coding regions of genes. Because these regions generally contain regulatory elements for gene expression, sequence divergence in these regions may be associated with alteration of gene expression. Indeed, nucleotide substitutions in H-NS-bound regions of the ybdO promoter and coding regions have diversified the potential for H-NS-independent negative regulation among E. coli strains. The ybdO expression in these strains was still negatively regulated by H-NS, which reduced the effect of H-NS-independent regulation under normal growth conditions. Hence, we propose that, during E. coli evolution, the conservation of H-NS binding sites resulted in the diversification of the regulation of horizontally transferred genes, which may have facilitated E. coli adaptation to new ecological niches. 
+ 26673755	 The two-component signal transduction system BarA-UvrY of Escherichia coli and its orthologs globally regulate metabolism, motility, biofilm formation, stress resistance, virulence of pathogens and quorum sensing by activating the transcription of genes for regulatory sRNAs, e.g. CsrB and CsrC in E. coli. These sRNAs act by sequestering the RNA binding protein CsrA (RsmA) away from lower affinity mRNA targets. In this study, we used ChIP-exo to identify, at single nucleotide resolution, genomic sites for UvrY (SirA) binding in E. coli and Salmonella enterica. The csrB and csrC genes were the strongest targets of crosslinking, which required UvrY phosphorylation by the BarA sensor kinase. Crosslinking occurred at two sites, an inverted repeat sequence far upstream of the promoter and a site near the -35 sequence. DNAse I footprinting revealed specific binding of UvrY in vitro only to the upstream site, indicative of additional binding requirements and/or indirect binding to the downstream site. Additional genes, including cspA, encoding the cold-shock RNA-binding protein CspA, showed weaker crosslinking and modest or negligible regulation by UvrY. We  conclude that the global effects of UvrY/SirA on gene expression are primarily mediated by activating csrB and csrC transcription. We also used in vivo crosslinking and other experimental approaches to reveal new features of csrB/csrC regulation by the DeaD and SrmB RNA helicases, IHF, ppGpp and DksA. Finally, the phylogenetic distribution of BarA-UvrY was analyzed and found to be  uniquely characteristic of γ-Proteobacteria and strongly anti-correlated with fliW, which encodes a protein that binds to CsrA and antagonizes its activity in  Bacillus subtilis. We propose that BarA-UvrY and orthologous TCS transcribe sRNA  antagonists of CsrA throughout the γ-Proteobacteria, but rarely or never perform  this function in other species. 
+ 26670385	 Iron, a major protein cofactor, is essential for most organisms. Despite the well-known effects of O2 on the oxidation state and solubility of iron, the impact of O2 on cellular iron homeostasis is not well understood. Here we report  that in Escherichia coli K-12, the lack of O2 dramatically changes expression of  genes controlled by the global regulators of iron homeostasis, the transcription  factor Fur and the small RNA RyhB. Using chromatin immunoprecipitation sequencing (ChIP-seq), we found anaerobic conditions promote Fur binding to more locations across the genome. However, by expression profiling, we discovered that the major effect of anaerobiosis was to increase the magnitude of Fur regulation, leading to increased expression of iron storage proteins and decreased expression of most iron uptake pathways and several Mn-binding proteins. This change in the pattern  of gene expression also correlated with an unanticipated decrease in Mn in anaerobic cells. Changes in the genes posttranscriptionally regulated by RyhB under aerobic and anaerobic conditions could be attributed to O2-dependent changes in transcription of the target genes: aerobic RyhB targets were enriched  in iron-containing proteins associated with aerobic energy metabolism, whereas anaerobic RyhB targets were enriched in iron-containing anaerobic respiratory functions. Overall, these studies showed that anaerobiosis has a larger impact on iron homeostasis than previously anticipated, both by expanding the number of direct Fur target genes and the magnitude of their regulation and by altering the expression of genes predicted to be posttranscriptionally regulated by the small  RNA RyhB under iron-limiting conditions. IMPORTANCE: Microbes and host cells engage in an "arms race" for iron, an essential nutrient that is often scarce in  the environment. 
Studies of iron homeostasis have been key to understanding the control of iron acquisition and the downstream pathways that enable microbes to compete for this valuable resource. Here we report that O2 availability affects the gene expression programs of two Escherichia coli master regulators that function in iron homeostasis: the transcription factor Fur and the small RNA regulator RyhB. Fur appeared to be more active under anaerobic conditions, suggesting a change in the set point for iron homeostasis. RyhB preferentially targeted iron-containing proteins of respiration-linked pathways, which are differentially expressed under aerobic and anaerobic conditions. Such findings may be relevant to the success of bacteria within their hosts since zones of reduced O2 may actually reduce bacterial iron demands, making it easier to win the arms race for iron. 
+ 26307168	 Repeated extragenic palindromes (REPs) in the enterobacterial genomes are usually composed of individual palindromic units separated by linker sequences. A total of 355 annotated REPs are distributed along the Escherichia coli genome. RNA sequence (RNAseq) analysis showed that almost 80% of the REPs in E. coli are transcribed. The DNA sequence of REP325 showed that it is a cluster of six repeats, each with two palindromic units capable of forming cruciform structures  in supercoiled DNA. Here, we report that components of the REP325 element and at  least one of its RNA products play a role in bacterial nucleoid DNA condensation. These RNA not only are present in the purified nucleoid but bind to the bacterial nucleoid-associated HU protein as revealed by RNA IP followed by microarray analysis (RIP-Chip) assays. Deletion of REP325 resulted in a dramatic increase of the nucleoid size as observed using transmission electron microscopy (TEM), and expression of one of the REP325 RNAs, nucleoid-associated noncoding RNA 4 (naRNA4), from a plasmid restored the wild-type condensed structure. Independently, chromosome conformation capture (3C) analysis demonstrated physical connections among various REP elements around the chromosome. These connections are dependent in some way upon the presence of HU and the REP325 element; deletion of HU genes and/or the REP325 element removed the connections.  Finally, naRNA4 together with HU condensed DNA in vitro by connecting REP325 or other DNA sequences that contain cruciform structures in a pairwise manner as observed by atomic force microscopy (AFM). On the basis of our results, we propose molecular models to explain connections of remote cruciform structures mediated by HU and naRNA4. IMPORTANCE: Nucleoid organization in bacteria is being  studied extensively, and several models have been proposed. However, the molecular nature of the structural organization is not well understood. 
Here we characterized the role of a novel nucleoid-associated noncoding RNA, naRNA4, in nucleoid structures both in vivo and in vitro. We propose models to explain how naRNA4 together with nucleoid-associated protein HU connects remote DNA elements  for nucleoid condensation. We present the first evidence of a noncoding RNA together with a nucleoid-associated protein directly condensing nucleoid DNA. 
+ 26125937	 Adherent-invasive Escherichia coli (AIEC) strains are detected more frequently within mucosal lesions of patients with Crohn's disease (CD). The AIEC phenotype  consists of adherence and invasion of intestinal epithelial cells and survival within macrophages of these bacteria in vitro. Our aim was to identify candidate  transcripts that distinguish AIEC from non-invasive E. coli (NIEC) strains and might be useful for rapid and accurate identification of AIEC by culture-independent technology. We performed comparative RNA-Sequence (RNASeq) analysis using AIEC strain LF82 and NIEC strain HS during exponential and stationary growth. Differential expression analysis of coding sequences (CDS) homologous to both strains demonstrated 224 and 241 genes with increased and decreased expression, respectively, in LF82 relative to HS. Transition metal transport and siderophore metabolism related pathway genes were up-regulated, while glycogen metabolic and oxidation-reduction related pathway genes were down-regulated, in LF82. Chemotaxis related transcripts were up-regulated in LF82 during the exponential phase, but flagellum-dependent motility pathway genes were down-regulated in LF82 during the stationary phase. CDS that mapped only to the LF82 genome accounted for 747 genes. We applied an in silico subtractive genomics approach to identify CDS specific to AIEC by incorporating the genomes of 10 other previously phenotyped NIEC. From this analysis, 166 CDS mapped to the LF82  genome and lacked homology to any of the 11 human NIEC strains. We compared these CDS across 13 AIEC, but none were homologous in each. Four LF82 gene loci belonging to clustered regularly interspaced short palindromic repeats region (CRISPR)--CRISPR-associated (Cas) genes were identified in 4 to 6 AIEC and absent from all non-pathogenic bacteria. As previously reported, AIEC strains were enriched for pdu operon genes. One CDS, encoding an excisionase, was shared by 9  AIEC strains. 
Reverse transcription quantitative polymerase chain reaction assays for 6 genes were conducted on fecal and ileal RNA samples from 22 patients with inflammatory bowel disease (IBD) and 32 patients without IBD (non-IBD). The expression of Cas loci was detected in a higher proportion of CD than non-IBD fecal and ileal RNA samples (p < 0.05). These results support a comparative genomic/transcriptomic approach towards identifying candidate AIEC signature transcripts. 
+ 26020590	 In bacteria, selective promoter recognition by RNA polymerase is achieved by its  association with σ factors, accessory subunits able to direct RNA polymerase "core enzyme" (E) to different promoter sequences. Using Chromatin Immunoprecipitation-sequencing (ChIP-seq), we searched for promoters bound by the σ(S)-associated RNA polymerase form (Eσ(S)) during transition from exponential to stationary phase. We identified 63 binding sites for Eσ(S) overlapping known or putative promoters, often located upstream of genes (encoding either ORFs or non-coding RNAs) showing at least some degree of dependence on the σ(S)-encoding  rpoS gene. Eσ(S) binding did not always correlate with an increase in transcription level, suggesting that, at some σ(S)-dependent promoters, Eσ(S) might remain poised in a pre-initiation state upon binding. A large fraction of Eσ(S)-binding sites corresponded to promoters recognized by RNA polymerase associated with σ(70) or other σ factors, suggesting a considerable overlap in promoter recognition between different forms of RNA polymerase. In particular, Eσ(S) appears to contribute significantly to transcription of genes encoding proteins involved in LPS biosynthesis and in cell surface composition. Finally, our results highlight a direct role of Eσ(S) in the regulation of non coding RNAs, such as OmrA/B, RyeA/B and SibC. 
+ 25735747	 DNA-binding motifs that are recognized by transcription factors (TFs) have been well studied; however, challenges remain in determining the in vivo architecture  of TF-DNA complexes on a genome-scale. Here, we determined the in vivo architecture of Escherichia coli arginine repressor (ArgR)-DNA complexes using high-throughput sequencing of exonuclease-treated chromatin-immunoprecipitated DNA (ChIP-exo). The ChIP-exo has a unique peak-pair pattern indicating 5' and 3'  ends of ArgR-binding region. We identified 62 ArgR-binding loci, which were classified into three groups, comprising single, double and triple peak-pairs. Each peak-pair has a unique 93 base pair (bp)-long (±2 bp) ArgR-binding sequence  containing two ARG boxes (39 bp) and residual sequences. Moreover, the three ArgR-binding modes defined by the position of the two ARG boxes indicate that DNA bends centered between the pair of ARG boxes facilitate the non-specific contacts between ArgR subunits and the residual sequences. Additionally, our approach may  also reveal other fundamental structural features of TF-DNA interactions that have implications for studying genome-scale transcriptional regulatory networks. 
+ 25275371	 Flagellar synthesis is a highly regulated process in all motile bacteria. In Escherichia coli and related species, the transcription factor FlhDC is the master regulator of a multi-tiered transcription network. FlhDC activates transcription of a number of genes, including some flagellar genes and the gene encoding the alternative Sigma factor FliA. Genes whose expression is required late in flagellar assembly are primarily transcribed by FliA, imparting temporal  regulation of transcription and coupling expression to flagellar assembly. In this study, we use ChIP-seq and RNA-seq to comprehensively map the E. coli FlhDC  and FliA regulons. We define a surprisingly restricted FlhDC regulon, including two novel regulated targets and two binding sites not associated with detectable  regulation of surrounding genes. In contrast, we greatly expand the known FliA regulon. Surprisingly, 30 of the 52 FliA binding sites are located inside genes.  Two of these intragenic promoters are associated with detectable noncoding RNAs,  while the others either produce highly unstable RNAs or are inactive under these  conditions. Together, our data redefine the E. coli flagellar regulatory network, and provide new insight into the temporal orchestration of gene expression that coordinates the flagellar assembly process. 
+ 25049088	 To further an improved understanding of the mechanisms used by bacterial cells to survive extreme exposure to ionizing radiation (IR), we broadly screened nonessential Escherichia coli genes for those involved in IR resistance by using  transposon-directed insertion sequencing (TraDIS). Forty-six genes were identified, most of which become essential upon heavy IR exposure. Most of these  were subjected to direct validation. The results reinforced the notion that survival after high doses of ionizing radiation does not depend on a single mechanism or process, but instead is multifaceted. Many identified genes affect either DNA repair or the cellular response to oxidative damage. However, contributions by genes involved in cell wall structure/function, cell division, and intermediary metabolism were also evident. About half of the identified genes have not previously been associated with IR resistance or recovery from IR exposure, including eight genes of unknown function. 
+ 24927582	 The molecular mechanisms of ethanol toxicity and tolerance in bacteria, although  important for biotechnology and bioenergy applications, remain incompletely understood. Genetic studies have identified potential cellular targets for ethanol and have revealed multiple mechanisms of tolerance, but it remains difficult to separate the direct and indirect effects of ethanol. We used adaptive evolution to generate spontaneous ethanol-tolerant strains of Escherichia coli, and then characterized mechanisms of toxicity and resistance using genome-scale DNAseq, RNAseq, and ribosome profiling coupled with specific assays of ribosome and RNA polymerase function. Evolved alleles of metJ, rho, and rpsQ recapitulated most of the observed ethanol tolerance, implicating translation and transcription as key processes affected by ethanol. Ethanol induced miscoding errors during protein synthesis, from which the evolved rpsQ allele protected cells by increasing ribosome accuracy. Ribosome profiling and RNAseq analyses established that ethanol negatively affects transcriptional and translational processivity. Ethanol-stressed cells exhibited ribosomal stalling at internal AUG codons, which may be ameliorated by the adaptive inactivation of  the MetJ repressor of methionine biosynthesis genes. Ethanol also caused aberrant intragenic transcription termination for mRNAs with low ribosome density, which was reduced in a strain with the adaptive rho mutation. Furthermore, ethanol inhibited transcript elongation by RNA polymerase in vitro. We propose that ethanol-induced inhibition and uncoupling of mRNA and protein synthesis through direct effects on ribosomes and RNA polymerase conformations are major contributors to ethanol toxicity in E. coli, and that adaptive mutations in metJ, rho, and rpsQ help protect these central dogma processes in the presence of ethanol. 
+ 24272778	 Escherichia coli AraC is a well-described transcription activator of genes involved in arabinose metabolism. Using complementary genomic approaches, chromatin immunoprecipitation (ChIP)-chip, and transcription profiling, we identify direct regulatory targets of AraC, including five novel target genes: ytfQ, ydeN, ydeM, ygeA, and polB. Strikingly, only ytfQ has an established connection to arabinose metabolism, suggesting that AraC has a broader function than previously described. We demonstrate arabinose-dependent repression of ydeNM by AraC, in contrast to the well-described arabinose-dependent activation of other target genes. We also demonstrate unexpected read-through of transcription  at the Rho-independent terminators downstream of araD and araE, leading to significant increases in the expression of polB and ygeA, respectively. AraC is highly conserved in the related species Salmonella enterica. We use ChIP sequencing (ChIP-seq) and RNA sequencing (RNA-seq) to map the AraC regulon in S.  enterica. A comparison of the E. coli and S. enterica AraC regulons, coupled with a bioinformatic analysis of other related species, reveals a conserved regulatory network across the family Enterobacteriaceae comprised of 10 genes associated with arabinose transport and metabolism. 
+ 24146625	 Despite the importance of maintaining redox homeostasis for cellular viability, how cells control redox balance globally is poorly understood. Here we provide new mechanistic insight into how the balance between reduced and oxidized electron carriers is regulated at the level of gene expression by mapping the regulon of the response regulator ArcA from Escherichia coli, which responds to the quinone/quinol redox couple via its membrane-bound sensor kinase, ArcB. Our genome-wide analysis reveals that ArcA reprograms metabolism under anaerobic conditions such that carbon oxidation pathways that recycle redox carriers via respiration are transcriptionally repressed by ArcA. We propose that this strategy favors use of catabolic pathways that recycle redox carriers via fermentation akin to lactate production in mammalian cells. Unexpectedly, bioinformatic analysis of the sequences bound by ArcA in ChIP-seq revealed that most ArcA binding sites contain additional direct repeat elements beyond the two  required for binding an ArcA dimer. DNase I footprinting assays suggest that non-canonical arrangements of cis-regulatory modules dictate both the length and  concentration-sensitive occupancy of DNA sites. We propose that this plasticity in ArcA binding site architecture provides both an efficient means of encoding binding sites for ArcA, σ(70)-RNAP and perhaps other transcription factors within the same narrow sequence space and an effective mechanism for global control of carbon metabolism to maintain redox homeostasis. 
+ 24146601	 Chromatin immunoprecipitation followed by high throughput sequencing (ChIP-Seq) has been successfully used for genome-wide profiling of transcription factor binding sites, histone modifications, and nucleosome occupancy in many model organisms and humans. Because the compact genomes of prokaryotes harbor many binding sites separated by only a few base pairs, applications of ChIP-Seq in this  domain have not reached their full potential. Applications in prokaryotic genomes are further hampered by the fact that well studied data analysis methods for ChIP-Seq do not result in a resolution required for deciphering the locations of  nearby binding events. We generated single-end tag (SET) and paired-end tag (PET) ChIP-Seq data for σ⁷⁰ factor in Escherichia coli (E. coli). Direct comparison of  these datasets revealed that although PET assay enables higher resolution identification of binding events, standard ChIP-Seq analysis methods are not equipped to utilize PET-specific features of the data. To address this problem, we developed dPeak as a high resolution binding site identification (deconvolution) algorithm. dPeak implements a probabilistic model that accurately describes the ChIP-Seq data generation process for both the SET and PET assays. For SET data, dPeak outperforms or performs comparably to the state-of-the-art high-resolution ChIP-Seq peak deconvolution algorithms such as PICS, GPS, and GEM. When coupled with PET data, dPeak significantly outperforms SET-based analysis with any of the current state-of-the-art methods. Experimental validations of a subset of dPeak predictions from σ⁷⁰ PET ChIP-Seq data indicate  that dPeak can estimate locations of binding events with as high as 2 to 21 bp resolution. Applications of dPeak to σ⁷⁰ ChIP-Seq data in E. coli under aerobic and anaerobic conditions reveal closely located promoters that are differentially occupied and further illustrate the importance of high resolution analysis of ChIP-Seq data. 
+ 23818864	 FNR is a well-studied global regulator of anaerobiosis, which is widely conserved across bacteria. Despite the importance of FNR and anaerobiosis in microbial lifestyles, the factors that influence its function on a genome-wide scale are poorly understood. Here, we report a functional genomic analysis of FNR action. We find that FNR occupancy at many target sites is strongly influenced by nucleoid-associated proteins (NAPs) that restrict access to many FNR binding sites. At a genome-wide level, only a subset of predicted FNR binding sites were  bound under anaerobic fermentative conditions and many appeared to be masked by the NAPs H-NS, IHF and Fis. Similar assays in cells lacking H-NS and its paralog  StpA showed increased FNR occupancy at sites bound by H-NS in WT strains, indicating that large regions of the genome are not readily accessible for FNR binding. Genome accessibility may also explain our finding that genome-wide FNR occupancy did not correlate with the match to consensus at binding sites, suggesting that significant variation in ChIP signal was attributable to cross-linking or immunoprecipitation efficiency rather than differences in binding affinities for FNR sites. Correlation of FNR ChIP-seq peaks with transcriptomic data showed that less than half of the FNR-regulated operons could be attributed to direct FNR binding. Conversely, FNR bound some promoters without regulating expression presumably requiring changes in activity of condition-specific transcription factors. Such combinatorial regulation may allow Escherichia coli to respond rapidly to environmental changes and confer an ecological advantage in the anaerobic but nutrient-fluctuating environment of the mammalian gut. 
+ 23632166	 To fit within the confines of the cell, bacterial chromosomes are highly condensed into a structure called the nucleoid. Despite the high degree of compaction in the nucleoid, the genome remains accessible to essential biological processes, such as replication and transcription. Here, we present the first high-resolution chromosome conformation capture-based molecular analysis of the spatial organization of the Escherichia coli nucleoid during rapid growth in rich medium and following an induced amino acid starvation that promotes the stringent response. Our analyses identify the presence of origin and terminus domains in exponentially growing cells. Moreover, we observe an increased number of interactions within the origin domain and significant clustering of SeqA-binding  sequences, suggesting a role for SeqA in clustering of newly replicated chromosomes. By contrast, 'histone-like' protein (i.e. Fis, IHF and H-NS) -binding sites did not cluster, and their role in global nucleoid organization does not manifest through the mediation of chromosomal contacts. Finally, genes that were downregulated after induction of the stringent response were spatially  clustered, indicating that transcription in E. coli occurs at transcription foci. 
+ 23586855	 BACKGROUND: ChIP-chip and ChIP-seq are widely used methods to map protein-DNA interactions on a genomic scale in vivo. Waldminghaus and Skarstad recently reported, in this journal, a modified method for ChIP-chip. Based on a comparison of our previously-published ChIP-chip data for Escherichia coli σ32 with their own data, Waldminghaus and Skarstad concluded that many of the σ32 targets identified in our earlier work are false positives. In particular, we identified  many non-canonical σ32 targets that are located inside genes or are associated with genes that show no detectable regulation by σ32. Waldminghaus and Skarstad propose that such non-canonical sites are artifacts, identified due to flaws in the standard ChIP methodology. Waldminghaus and Skarstad suggest specific changes to the standard ChIP procedure that reportedly eliminate the claimed artifacts. RESULTS: We reanalyzed our published ChIP-chip datasets for σ32 and the datasets  generated by Waldminghaus and Skarstad to assess data quality and reproducibility. We also performed targeted ChIP/qPCR for σ32 and an unrelated transcription factor, AraC, using the standard ChIP method and the modified ChIP  method proposed by Waldminghaus and Skarstad. Furthermore, we determined the association of core RNA polymerase with disputed σ32 promoters, with and without  overexpression of σ32. 
We show that (i) our published σ32 ChIP-chip datasets have a consistently higher dynamic range than those of Waldminghaus and Skarstad, (ii) our published σ32 ChIP-chip datasets are highly reproducible, whereas those of Waldminghaus and Skarstad are not, (iii) non-canonical σ32 target regions are enriched in a σ32 ChIP in a heat shock-dependent manner, regardless of the ChIP method used, (iv) association of core RNA polymerase with some disputed σ32 target genes is induced by overexpression of σ32, (v) σ32 targets disputed by Waldminghaus and Skarstad are predominantly those that are most weakly bound, and (vi) the modifications to the ChIP method proposed by Waldminghaus and Skarstad reduce enrichment of all protein-bound genomic regions. CONCLUSIONS: The modifications to the ChIP-chip method suggested by Waldminghaus  and Skarstad reduce rather than increase the quality of ChIP data. Hence, the non-canonical σ32 targets identified in our previous study are likely to be genuine. We propose that the failure of Waldminghaus and Skarstad to identify many of these σ32 targets is due predominantly to the lower data quality in their study. We conclude that surprising ChIP-chip results are not artifacts to be ignored, but rather indications that our understanding of DNA-binding proteins is incomplete. 
+ 23071782	 The phosphate starvation response in bacteria has been studied extensively for the past few decades and the phosphate-limiting signal is known to be mediated via the PhoBR two-component system. However, the global DNA binding profile of the response regulator PhoB and the PhoB downstream responses are currently unclear. In this study, chromatin immunoprecipitation for PhoB was combined with  high-density tiling array (ChIP-chip) as well as gene expression microarray to reveal the first global down-stream responses of the responding regulator, PhoB in E. coli. Based on our ChIP-chip experimental data, forty-three binding sites were identified throughout the genome and the known PhoB binding pattern was updated by identifying the conserved pattern from these sites. From the gene expression microarray data analysis, 287 differentially expressed genes were identified in the presence of PhoB activity. By comparing the results obtained from our ChIP-chip and microarray experiments, we were also able to identify genes that were directly or indirectly affected through PhoB regulation. Nineteen out of these 287 differentially expressed genes were identified as the genes directly regulated by PhoB. Seven of the 19 directly regulated genes (including phoB) are transcriptional regulators. These transcriptional regulators then further pass the signal of phosphate starvation down to the remaining differentially expressed genes. Our results unveiled the genome-wide binding profile of PhoB and the downstream responses under phosphate starvation. We also  present the hierarchical structure of the phosphate sensing regulatory network. The data suggest that PhoB plays protective roles in membrane integrity and oxidative stress reduction during phosphate starvation. 
+ 22180530	 IHF and HU are two heterodimeric nucleoid-associated proteins (NAP) that belong to the same protein family but interact differently with the DNA. IHF is a sequence-specific DNA-binding protein that bends the DNA by over 160°. HU is the  most conserved NAP, which binds non-specifically to duplex DNA with a particular  preference for targeting nicked and bent DNA. Despite their importance, the in vivo interactions of the two proteins to the DNA remain to be described at a high resolution and on a genome-wide scale. Further, the effects of these proteins on  gene expression on a global scale remain contentious. Finally, the contrast between the functions of the homo- and heterodimeric forms of proteins deserves the attention of further study. Here we present a genome-scale study of HU- and IHF binding to the Escherichia coli K12 chromosome using ChIP-seq. We also perform microarray analysis of gene expression in single- and double-deletion mutants of each protein to identify their regulons. The sequence-specific binding profile of IHF encompasses ∼30% of all operons, though the expression of <10% of  these is affected by its deletion suggesting combinatorial control or a molecular backup. The binding profile for HU is reflective of relatively non-specific binding to the chromosome, however, with a preference for A/T-rich DNA. The HU regulon comprises highly conserved genes including those that are essential and possibly supercoiling sensitive. Finally, by performing ChIP-seq experiments, where possible, of each subunit of IHF and HU in the absence of the other subunit, we define genome-wide maps of DNA binding of the proteins in their hetero- and homodimeric forms. 
+ 22082910	 Although metabolic networks have been reconstructed on a genome scale, the corresponding reconstruction and integration of governing transcriptional regulatory networks has not been fully achieved. Here we reconstruct such an integrated network for amino acid metabolism in Escherichia coli. Analysis of ChIP-chip and gene expression data for the transcription factors ArgR, Lrp and TrpR showed that 19 out of 20 amino acid biosynthetic pathways are either directly or indirectly controlled by these regulators. Classifying the regulated  genes into three functional categories of transport, biosynthesis and metabolism  leads to the elucidation of regulatory motifs that constitute the integrated network's basic building blocks. The regulatory logic of these motifs was determined on the basis of relationships between transcription factor binding and changes in the amount of transcript in response to exogenous amino acids. Remarkably, the resulting logic shows how amino acids are differentiated as signaling and nutrient molecules, revealing the overarching regulatory principles of the amino acid stimulon. 
+ 21572102	 The PurR transcription factor plays a critical role in transcriptional regulation of purine metabolism in enterobacteria. Here, we elucidate the role of PurR under exogenous adenine stimulation at the genome-scale using high-resolution chromatin immunoprecipitation (ChIP)-chip and gene expression data obtained under in vivo conditions. Analysis of microarray data revealed that adenine stimulation led to  changes in transcript level of about 10% of Escherichia coli genes, including the purine biosynthesis pathway. The E. coli strain lacking the purR gene showed that a total of 56 genes are affected by the deletion. From the ChIP-chip analysis, we determined that over 73% of genes directly regulated by PurR were enriched in the biosynthesis, utilization and transport of purine and pyrimidine nucleotides, and 20% of them were functionally unknown. Compared to the functional diversity of the regulon of the other general transcription factors in E. coli, the functions  and size of the PurR regulon are limited. 
+ 21097887	 Nucleoid-associated proteins (NAPs) are global regulators of gene expression in Escherichia coli, which affect DNA conformation by bending, wrapping and bridging the DNA. Two of these--H-NS and Fis--bind to specific DNA sequences and structures. Because of their importance to global gene expression, the binding of these NAPs to the DNA was previously investigated on a genome-wide scale using ChIP-chip. However, variation in their binding profiles across the growth phase and the genome-scale nature of their impact on gene expression remain poorly understood. Here, we present a genome-scale investigation of H-NS and Fis binding to the E. coli chromosome using chromatin immunoprecipitation combined with high-throughput sequencing (ChIP-seq). By performing our experiments under multiple time-points during growth in rich media, we show that the binding regions of the two proteins are mutually exclusive under our experimental conditions. H-NS binds to significantly longer tracts of DNA than Fis, consistent with the linear spread of H-NS binding from high- to surrounding lower-affinity sites; the length of binding regions is associated with the degree of transcriptional repression imposed by H-NS. For Fis, a majority of binding events do not lead to differential expression of the proximal gene; however, it has a significant indirect effect on gene expression partly through its effects on the  expression of other transcription factors. We propose that direct transcriptional regulation by Fis is associated with the interaction of tandem arrays of Fis molecules to the DNA and possible DNA bending, particularly at operon-upstream regions. Our study serves as a proof-of-principle for the use of ChIP-seq for global DNA-binding proteins in bacteria, which should become significantly more economical and feasible with the development of multiplexing techniques. 
+ 19706412	 The transcription termination factor Rho is a global regulator of RNA polymerase  (RNAP). Although individual Rho-dependent terminators have been studied extensively, less is known about the sites of RNAP regulation by Rho on a genome-wide scale. Using chromatin immunoprecipitation and microarrays (ChIP-chip), we examined changes in the distribution of Escherichia coli RNAP in  response to the Rho-specific inhibitor bicyclomycin (BCM). We found approximately 200 Rho-terminated loci that were divided evenly into 2 classes: intergenic (at the ends of genes) and intragenic (within genes). The intergenic class contained  noncoding RNAs such as small RNAs (sRNAs) and transfer RNAs (tRNAs), establishing a previously unappreciated role of Rho in termination of stable RNA synthesis. The intragenic class of terminators included a previously uncharacterized set of  short antisense transcripts, as judged by a shift in the distribution of RNAP in  BCM-treated cells that was opposite to the direction of the corresponding gene. These Rho-terminated antisense transcripts point to a role of noncoding transcription in E. coli gene regulation that may resemble the ubiquitous noncoding transcription recently found to play myriad roles in eukaryotic gene regulation. 
+ 19647521	 Protein-DNA interactions are fundamental to core biological processes, including  transcription, DNA replication, and chromosomal organization. We have developed in vivo protein occupancy display (IPOD), a technology that reveals protein occupancy across an entire bacterial chromosome at the resolution of individual binding sites. Application to Escherichia coli reveals thousands of protein occupancy peaks, highly enriched within and in close proximity to noncoding regulatory regions. In addition, we discovered extensive (>1 kilobase) protein occupancy domains (EPODs), some of which are localized to highly expressed genes, enriched in RNA-polymerase occupancy. However, the majority are localized to transcriptionally silent loci dominated by conserved hypothetical ORFs. These regions are highly enriched in both predicted and experimentally determined binding sites of nucleoid proteins and exhibit extreme biophysical characteristics such as high intrinsic curvature. Our observations implicate these transcriptionally silent EPODs as the elusive organizing centers, long proposed to topologically isolate chromosomal domains. 
+ 19052235	 Broad-acting transcription factors (TFs) in bacteria form regulons. Here, we present a 4-step method to fully reconstruct the leucine-responsive protein (Lrp) regulon in Escherichia coli K-12 MG 1655 that regulates nitrogen metabolism. Step 1 is composed of obtaining high-resolution ChIP-chip data for Lrp, the RNA polymerase and expression profiles under multiple environmental conditions. We identified 138 unique and reproducible Lrp-binding regions and classified their binding state under different conditions. In the second step, the analysis of these data revealed 6 distinct regulatory modes for individual ORFs. In the third step, we used the functional assignment of the regulated ORFs to reconstruct 4 types of regulatory network motifs around the metabolites that are affected by the corresponding gene products. In the fourth step, we determined how leucine, as a signaling molecule, shifts the regulatory motifs for particular metabolites. The physiological structure that emerges shows the regulatory motifs for different amino acid fall into the traditional classification of amino acid families, thus elucidating the structure and physiological functions of the Lrp-regulon. The same procedure can be applied to other broad-acting TFs, opening the way to full bottom-up reconstruction of the transcriptional regulatory network in bacterial cells. 
+ 18370100	 Interactions between cis-acting elements and proteins play a key role in transcriptional regulation of all known organisms. To better understand these interactions, researchers developed a method that couples chromatin immunoprecipitation with microarrays (also known as ChIP-chip), which is capable  of providing a whole-genome map of protein-DNA interactions. This versatile and high-throughput strategy is initiated by formaldehyde-mediated cross-linking of DNA and proteins, followed by cell lysis, DNA fragmentation, and immunopurification. The immunoprecipitated DNA fragments are then purified from the proteins by reverse-cross-linking followed by amplification, labeling, and hybridization to a whole-genome tiling microarray against a reference sample. The enriched signals obtained from the microarray then are normalized by the reference sample and used to generate the whole-genome map of protein-DNA interactions. The protocol described here has been used for discovering the genomewide distribution of RNA polymerase and several transcription factors of Escherichia coli. 
+ 18340041	 We determined the genome-wide distribution of the nucleoid-associated protein Fis in Escherichia coli using chromatin immunoprecipitation coupled with high-resolution whole genome-tiling microarrays. We identified 894 Fis-associated regions across the E. coli genome. A significant number of these binding sites were found within open reading frames (33%) and between divergently transcribed transcripts (5%). Analysis indicates that A-tracts and AT-tracts are an important signal for preferred Fis-binding sites, and that A(6)-tracts in particular constitute a high-affinity signal that dictates Fis phasing in stretches of DNA containing multiple and variably spaced A-tracts and AT-tracts. Furthermore, we find evidence for an average of two Fis-binding regions per supercoiling domain in the chromosome of exponentially growing cells. Transcriptome analysis shows that approximately 21% of genes are affected by the deletion of fis; however, the changes in magnitude are small. To address the differential Fis bindings under growth environment perturbation, ChIP-chip analysis was performed using cells grown under aerobic and anaerobic growth conditions. Interestingly, the Fis-binding regions are almost identical in aerobic and anaerobic growth conditions-indicating that the E. coli genome topology mediated by Fis is superficially identical in the two conditions. These novel results provide new insight into how Fis modulates DNA topology at a genome scale and thus advance our understanding of the architectural bases of the E. coli nucleoid. 
--- a/oneClass_trainUseful_out/useless.out 0 → 100644
View file @a5eecdb
+++ b/oneClass_trainUseful_out/useless.out 0 → 100644
View file @a5eecdb
--- a/oneClass_trainUseless_out/useful.out 0 → 100644
View file @a5eecdb
+++ b/oneClass_trainUseless_out/useful.out 0 → 100644
View file @a5eecdb
+ 29091192	 Objectives: Polymyxins remain one of the last-resort drugs to treat infections caused by MDR Gram-negative pathogens. Here, we determined the mechanisms by which chromosomally encoded resistance to colistin and polymyxin B can arise in the MDR uropathogenic Escherichia coli ST131 reference strain EC958. Methods: Two complementary approaches, saturated transposon mutagenesis and spontaneous mutation induction with high concentrations of colistin and polymyxin B, were employed to select for mutations associated with resistance to polymyxins. Mutants were identified using transposon-directed insertion-site sequencing or Illumina WGS. A resistance phenotype was confirmed by MIC and further investigated using RT-PCR. Competitive growth assays were used to measure fitness cost. Results: A transposon insertion at nucleotide 41 of the pmrB gene (EC958pmrB41-Tn5) enhanced its transcript level, resulting in a 64- and 32-fold increased MIC of colistin and polymyxin B, respectively. Three spontaneous mutations, also located within the pmrB gene, conferred resistance to both colistin and polymyxin B with a corresponding increase in transcription of the pmrCAB genes. All three mutations incurred a fitness cost in the absence of colistin and polymyxin B. Conclusions: This study identified the pmrB gene as the main chromosomal target for induction of colistin and polymyxin B resistance in E. coli. 
+ 29066548	 Uropathogenic Escherichia coli (UPEC) is a major cause of urinary tract and bloodstream infections and possesses an array of virulence factors for colonization, survival, and persistence. One such factor is the polysaccharide K  capsule. Among the different K capsule types, the K1 serotype is strongly associated with UPEC infection. In this study, we completely sequenced the K1 UPEC urosepsis strain PA45B and employed a novel combination of a lytic K1 capsule-specific phage, saturated Tn5 transposon mutagenesis, and high-throughput transposon-directed insertion site sequencing (TraDIS) to identify the complement of genes required for capsule production. Our analysis identified known genes involved in capsule biosynthesis, as well as two additional regulatory genes (mprA and lrhA) that we characterized at the molecular level. Mutation of mprA resulted in protection against K1 phage-mediated killing, a phenotype restored by complementation. We also identified a significantly increased unidirectional Tn5  insertion frequency upstream of the lrhA gene and showed that strong expression of LrhA induced by a constitutive Pcl promoter led to loss of capsule production. Further analysis revealed loss of MprA or overexpression of LrhA affected the transcription of capsule biosynthesis genes in PA45B and increased sensitivity to killing in whole blood. Similar phenotypes were also observed in UPEC strains UTI89 (K1) and CFT073 (K2), demonstrating that the effects were neither strain nor capsule type specific. Overall, this study defined the genome of a UPEC urosepsis isolate and identified and characterized two new regulatory factors that affect UPEC capsule production. IMPORTANCE: Urinary tract infections (UTIs) are among the most common bacterial infections in humans and are primarily caused by uropathogenic Escherichia coli (UPEC). 
Many UPEC strains express a polysaccharide K capsule that provides protection against host innate immune factors and contributes to survival and persistence during infection. The K1 serotype is one example of a polysaccharide capsule type and is strongly associated with UPEC strains that cause UTIs, bloodstream infections, and meningitis. The number of UTIs caused by antibiotic-resistant UPEC is steadily increasing, highlighting the need to better understand factors (e.g., the capsule) that contribute to UPEC pathogenesis. This study describes the original  and novel application of lytic capsule-specific phage killing, saturated Tn5 transposon mutagenesis, and high-throughput transposon-directed insertion site sequencing to define the entire complement of genes required for capsule production in UPEC. Our comprehensive approach uncovered new genes involved in the regulation of this key virulence determinant. 
+ 28911122	 ChIP-exo/nexus experiments rely on innovative modifications of the commonly used  ChIP-seq protocol for high resolution mapping of transcription factor binding sites. Although many aspects of the ChIP-exo data analysis are similar to those of ChIP-seq, these high throughput experiments pose a number of unique quality control and analysis challenges. We develop a novel statistical quality control pipeline and accompanying R/Bioconductor package, ChIPexoQual, to enable exploration and analysis of ChIP-exo and related experiments. ChIPexoQual evaluates a number of key issues including strand imbalance, library complexity,  and signal enrichment of data. Assessment of these features is facilitated through diagnostic plots and summary statistics computed over regions of the genome with varying levels of coverage. We evaluated our QC pipeline with both large collections of public ChIP-exo/nexus data and multiple, new ChIP-exo datasets from Escherichia coli. ChIPexoQual analysis of these datasets resulted in guidelines for using these QC metrics across a wide range of sequencing depths and provided further insights for modelling ChIP-exo data. 
+ 28439033	 Upon oxygen limitation, the Bacillus subtilis ResE sensor kinase and its cognate  ResD response regulator play primary roles in the transcriptional activation of genes functioning in anaerobic respiration. The nitric oxide (NO)-sensitive NsrR  repressor controls transcription to support nitrate respiration. In addition, the ferric uptake repressor (Fur) can modulate transcription under anaerobic conditions. However, whether these controls are direct or indirect has been investigated only in a gene-specific manner. To gain a genomic view of anaerobic  gene regulation, we determined the genome-wide in vivo DNA binding of ResD, NsrR, and Fur transcription factors (TFs) using in situ DNase I footprinting combined with chromatin affinity precipitation sequencing (ChAP-seq; genome footprinting by high-throughput sequencing [GeF-seq]). A significant number of sites were targets of ResD and NsrR, and a majority of them were also bound by Fur. The binding of multiple TFs to overlapping targets affected each individual TF's binding, which led to combinatorial transcriptional control. ResD bound to both the promoters and the coding regions of genes under its positive control. Other genes showing enrichment of ResD at only the promoter regions are targets of direct ResD-dependent repression or antirepression. The results support previous  findings of ResD as an RNA polymerase (RNAP)-binding protein and indicated that ResD can associate with the transcription elongation complex. The data set allowed us to reexamine consensus sequence motifs of Fur, ResD, and NsrR and uncovered evidence that multiple TGW (where W is A or T) sequences surrounded by  an A- and T-rich sequence are often found at sites where all three TFs competitively bind. IMPORTANCE: Bacteria encounter oxygen fluctuation in their natural environment as well as in host organisms. Hence, understanding how bacteria respond to oxygen limitation will impact environmental and human health. 
ResD, NsrR, and Fur control transcription under anaerobic conditions. This work using in situ DNase I footprinting uncovered the genome-wide binding profile of the three transcription factors (TFs). Binding of the TFs is often competitive or cooperative depending on the promoters and the presence of other TFs, indicating  that transcriptional regulation by multiple TFs is much more complex than we originally thought. The results from this study provide a more complete picture of anaerobic gene regulation governed by ResD, NsrR, and Fur and contribute to our further understanding of anaerobic physiology. 
+ 28039131	 Human enteric pathogens, such as Salmonella spp. and verotoxigenic Escherichia coli, are increasingly recognized as causes of gastroenteritis outbreaks associated with the consumption of fruits and vegetables. Persistence in plants represents an important part of the life cycle of these pathogens. The identification of the full complement of Salmonella genes involved in the colonization of the model plant (tomato) was carried out using transposon insertion sequencing analysis. With this approach, 230,000 transposon insertions  were screened in tomato pericarps to identify loci with reduction in fitness, followed by validation of the screen results using competition assays of the isogenic mutants against the wild type. A comparison with studies in animals revealed a distinct plant-associated set of genes, which only partially overlaps  with the genes required to elicit disease in animals. De novo biosynthesis of amino acids was critical to persistence within tomatoes, while amino acid scavenging was prevalent in animal infections. Fitness reduction of the Salmonella amino acid synthesis mutants was generally more severe in the tomato rin mutant, which hyperaccumulates certain amino acids, suggesting that these nutrients remain unavailable to Salmonella spp. within plants. Salmonella lipopolysaccharide (LPS) was required for persistence in both animals and plants, exemplifying some shared pathogenesis-related mechanisms in animal and plant hosts. Similarly to phytopathogens, Salmonella spp. required biosynthesis of amino acids, LPS, and nucleotides to colonize tomatoes. 
Overall, however, it appears that while Salmonella shares some strategies with phytopathogens and taps into its animal virulence-related functions, colonization of tomatoes represents  a distinct strategy, highlighting this pathogen's flexible metabolism. IMPORTANCE: Outbreaks of gastroenteritis caused by human pathogens have been increasingly associated with foods of plant origin, with tomatoes being one of the common culprits. Recent studies also suggest that these human pathogens can use plants as alternate hosts as a part of their life cycle. While dual (animal/plant) lifestyles of other members of the Enterobacteriaceae family are well known, the  strategies with which Salmonella colonizes plants are only partially understood.  Therefore, we undertook a high-throughput characterization of the functions required for Salmonella persistence within tomatoes. The results of this study were compared with what is known about genes required for Salmonella virulence in animals and interactions of plant pathogens with their hosts to determine whether Salmonella repurposes its virulence repertoire inside plants or whether it behaves more as a phytopathogen during plant colonization. Even though Salmonella utilized some of its virulence-related genes in tomatoes, plant colonization required a distinct set of functions. 
+ 27492287	 DNA of viral origin represents a ubiquitous element of bacterial genomes. Its integration into host regulatory circuits is a pivotal driver of microbial evolution but requires the stringent regulation of phage gene activity. In this study, we describe the nucleoid-associated protein CgpS, which represents an essential protein functioning as a xenogeneic silencer in the Gram-positive Corynebacterium glutamicum. CgpS is encoded by the cryptic prophage CGP3 of the C. glutamicum strain ATCC 13032 and was first identified by DNA affinity chromatography using an early phage promoter of CGP3. Genome-wide profiling of CgpS binding using chromatin affinity purification and sequencing (ChAP-Seq) revealed its association with AT-rich DNA elements, including the entire CGP3 prophage region (187 kbp), as well as several other elements acquired by horizontal gene transfer. Countersilencing of CgpS resulted in a significantly increased induction frequency of the CGP3 prophage. In contrast, a strain lacking the CGP3 prophage was not affected and displayed stable growth. In a bioinformatics approach, cgpS orthologs were identified primarily in actinobacterial genomes as well as several phage and prophage genomes. Sequence analysis of 618 orthologous proteins revealed a strong conservation of the secondary structure, supporting an ancient function of these xenogeneic silencers in phage-host interaction. 
+ 26862720	 The DNA adenine methyltransferase identification (DamID) assay is a powerful method to detect protein-DNA interactions both locally and genome-wide. It is an  alternative approach to chromatin immunoprecipitation (ChIP). An expressed fusion protein consisting of the protein of interest and the E. coli DNA adenine methyltransferase can methylate the adenine base in GATC motifs near the sites of protein-DNA interactions. Adenine-methylated DNA fragments can then be specifically amplified and detected. The original DamID assay detects the genomic locations of methylated DNA fragments by hybridization to DNA microarrays, which  is limited by the availability of microarrays and the density of predetermined probes. In this paper, we report the detailed protocol of integrating high throughput DNA sequencing into DamID (DamID-seq). The large number of short reads generated from DamID-seq enables detecting and localizing protein-DNA interactions genome-wide with high precision and sensitivity. We have used the DamID-seq assay to study genome-nuclear lamina (NL) interactions in mammalian cells, and have noticed that DamID-seq provides a high resolution and a wide dynamic range in detecting genome-NL interactions. The DamID-seq approach enables probing NL associations within gene structures and allows comparing genome-NL interaction maps with other functional genomic data, such as ChIP-seq and RNA-seq. 
+ 26537891	 BACKGROUND: FNR homologues constitute an important class of transcription factors that control a wide range of anaerobic physiological functions in a number of bacterial species. Since FNR homologues are some of the most pervasive transcription factors, an understanding of their involvement in regulating anaerobic gene expression in different species sheds light on evolutionary similarity and differences. To address this question, we used a combination of high throughput RNA-Seq and ChIP-Seq analysis to define the extent of the FnrL regulon in Rhodobacter capsulatus and related our results to that of FnrL in Rhodobacter sphaeroides and FNR in Escherichia coli. RESULTS: Our RNA-seq results show that FnrL affects the expression of 807 genes,  which accounts for over 20 % of the Rba. capsulatus genome. ChIP-seq results indicate that 42 of these genes are directly regulated by FnrL. Importantly, this includes genes involved in the synthesis of the anoxygenic photosystem. Similarly, FnrL in Rba. sphaeroides affects 24 % of its genome, however, only 171 genes are differentially expressed in common between two Rhodobacter species, suggesting significant divergence in regulation. CONCLUSIONS: We show that FnrL in Rba. capsulatus activates photosynthesis while  in Rba. sphaeroides FnrL regulation is reported to involve repression of the photosystem. This analysis highlights important differences in transcriptional control of photosynthetic events and other metabolic processes controlled by FnrL orthologues in closely related Rhodobacter species. Furthermore, we also show that the E. coli FNR regulon has limited transcriptional overlap with the FnrL regulons from either Rhodobacter species. 
+ 26483520	 An ability to sense and respond to changes in extracellular phosphate is critical for the survival of most bacteria. For Caulobacter crescentus, which typically lives in phosphate-limited environments, this process is especially crucial. Like many bacteria, Caulobacter responds to phosphate limitation through a conserved two-component signaling pathway called PhoR-PhoB, but the direct regulon of PhoB  in this organism is unknown. Here we used chromatin immunoprecipitation-DNA sequencing (ChIP-Seq) to map the global binding patterns of the phosphate-responsive transcriptional regulator PhoB under phosphate-limited and -replete conditions. Combined with genome-wide expression profiling, our work demonstrates that PhoB is induced to regulate nearly 50 genes under phosphate-starved conditions. The PhoB regulon is comprised primarily of genes known or predicted to help Caulobacter scavenge for and import inorganic phosphate, including 15 different membrane transporters. We also investigated the regulatory role of PhoU, a widely conserved protein proposed to coordinate phosphate import with expression of the PhoB regulon by directly modulating the histidine kinase PhoR. However, our studies show that it likely does not play such a role in Caulobacter, as PhoU depletion has no significant effect on PhoB-dependent gene expression. Instead, cells lacking PhoU exhibit striking accumulation of large polyphosphate granules, suggesting that PhoU participates in controlling intracellular phosphate metabolism. IMPORTANCE: The transcription factor PhoB is widely conserved throughout the bacterial kingdom, where it helps  organisms respond to phosphate limitation by driving the expression of a battery  of genes. Most of what is known about PhoB and its target genes is derived from studies of Escherichia coli. Our work documents the PhoB regulon in Caulobacter crescentus, and comparison to the regulon in E. 
coli reveals significant differences, highlighting the evolutionary plasticity of transcriptional responses driven by highly conserved transcription factors. We also demonstrated  that the conserved protein PhoU, which is implicated in bacterial persistence, does not regulate PhoB activity, as previously suggested. Instead, our results favor a model in which PhoU affects intracellular phosphate accumulation, possibly through the high-affinity phosphate transporter. 
+ 28348816	 Uropathogenic Escherichia coli (UPEC) is the causative agent of urinary tract infections. Nitric oxide (NO) is a toxic water-soluble gas that is encountered by UPEC in the urinary tract. Therefore, UPEC probably requires mechanisms to detoxify NO in the host environment. Thus far, flavohaemoglobin (Hmp), an NO denitrosylase, is the only demonstrated NO detoxification system in UPEC. Here we show that, in E. coli strain CFT073, the NADH-dependent NO reductase flavorubredoxin (FlRd) also plays a major role in NO scavenging. We generated a mutant that lacks all known and candidate NO detoxification pathways (Hmp, FlRd and the respiratory nitrite reductase, NrfA). When grown and assayed anaerobically, this mutant expresses an NO-inducible NO scavenging activity, pointing to the existence of a novel detoxification mechanism. Expression of this activity is inducible by both NO and nitrate, and the enzyme is membrane-associated. Genome-wide transcriptional profiling of UPEC grown under anaerobic conditions in the presence of nitrate (as a source of NO) highlighted various aspects of the response of the pathogen to nitrate and NO. Several virulence-associated genes are upregulated, suggesting that host-derived NO is a  potential regulator of UPEC virulence. Chromatin immunoprecipitation and sequencing was used to evaluate the NsrR regulon in CFT073. We identified 49 NsrR binding sites in promoter regions in the CFT073 genome, 29 of which were not previously identified in E. coli K-12. NsrR may regulate some CFT073 genes that do not have homologues in E. coli K-12. 
+ 26389830	 The alternative sigma factor σE functions to maintain bacterial homeostasis and membrane integrity in response to extracytoplasmic stress by regulating thousands of genes both directly and indirectly. The transcriptional regulatory network governed by σE in Salmonella and E. coli has been examined using microarray, however a genome-wide analysis of σE-binding sites in Salmonella has not yet been reported. We infected macrophages with Salmonella Typhimurium over a select time  course. Using chromatin immunoprecipitation followed by high-throughput DNA sequencing (ChIP-seq), 31 σE-binding sites were identified. Seventeen sites were  new, which included outer membrane proteins, a quorum-sensing protein, a cell division factor, and a signal transduction modulator. The consensus sequence identified for σE in vivo binding was similar to the one previously reported, except for a conserved G and A between the -35 and -10 regions. One third of the  σE-binding sites did not contain the consensus sequence, suggesting there may be  alternative mechanisms by which σE modulates transcription. By dissecting direct  and indirect modes of σE-mediated regulation, we found that σE activates gene expression through recognition of both canonical and reversed consensus sequence. New σE regulated genes (greA, luxS, ompA and ompX) are shown to be involved in heat shock and oxidative stress responses. 
+ 26070154	 In bacteria the concurrence of DNA replication and transcription leads to potentially deleterious encounters between the two machineries, which can occur in either the head-on (lagging strand genes) or co-directional (leading strand genes) orientations. These conflicts lead to replication fork stalling and can destabilize the genome. Both eukaryotic and prokaryotic cells possess resolution  factors that reduce the severity of these encounters. Though Escherichia coli accessory helicases have been implicated in the mitigation of head-on conflicts,  direct evidence of these proteins mitigating co-directional conflicts is lacking. Furthermore, the endogenous chromosomal regions where these helicases act, and the mechanism of recruitment, have not been identified. We show that the essential Bacillus subtilis accessory helicase PcrA aids replication progression  through protein coding genes of both head-on and co-directional orientations, as  well as rRNA and tRNA genes. ChIP-Seq experiments show that co-directional conflicts at highly transcribed rRNA, tRNA, and head-on protein coding genes are  major targets of PcrA activity on the chromosome. Partial depletion of PcrA renders cells extremely sensitive to head-on conflicts, linking the essential function of PcrA to conflict resolution. Furthermore, ablating PcrA's ATPase/helicase activity simultaneously increases its association with conflict regions, while incapacitating its ability to mitigate conflicts, and leads to cell death. In contrast, disruption of PcrA's C-terminal RNA polymerase interaction domain does not impact its ability to mitigate conflicts between replication and transcription, its association with conflict regions, or cell survival. Altogether, this work establishes PcrA as an essential factor involved  in mitigating transcription-replication conflicts and identifies chromosomal regions where it routinely acts. 
As both conflicts and accessory helicases are found in all domains of life, these results are broadly relevant. 
+ 25089258	 CarD is an essential mycobacterial protein that binds the RNA polymerase (RNAP) and affects the transcriptional profile of Mycobacterium smegmatis and Mycobacterium tuberculosis (6). We predicted that CarD was directly regulating RNAP function but our prior experiments had not determined at what stage of transcription CarD was functioning and at which genes CarD interacted with the RNAP. To begin to address these open questions, we performed Chromatin Immunoprecipitation sequencing (ChIP-seq) to survey the distribution of CarD throughout the M. smegmatis chromosome. The distribution of RNAP subunits β and σA were also profiled. We expected that RNAP β would be present throughout transcribed regions and RNAP σA would be predominantly enriched at promoters based on work in Escherichia coli (3), however this had yet to be determined in mycobacteria. The ChIP-seq analyses revealed that CarD was never present on the genome in the absence of RNAP, was primarily associated with promoter regions, and was highly correlated with the distribution of RNAP σA. The colocalization of σA and CarD led us to propose that in vivo, CarD associates with RNAP initiation  complexes at most promoters and is therefore a global regulator of transcription  initiation. Here we describe in detail the data from the ChIP-seq experiments associated with the study published by Srivastava and colleagues in the Proceedings of the National Academy of Science in 2013 (5) as well as discuss the findings from this dataset in relation to both CarD and mycobacterial transcription as a whole. The ChIP-seq data have been deposited in the Gene Expression Omnibus (GEO) database, www.ncbi.nlm.nih.gov/geo (accession no. GSE48164). 
+ 24650566	 Inferring gene regulatory networks from gene expression data at whole genome level is still an arduous challenge, especially in higher organisms where the number of genes is large but the number of experimental samples is small. It is reported that the accuracy of current methods at genome scale significantly drops from Escherichia coli to Saccharomyces cerevisiae due to the increase in number of genes. This limits the applicability of current methods to more complex genomes, like human and mouse. Least absolute shrinkage and selection operator (LASSO) is widely used for gene regulatory network inference from gene expression profiles. However, the accuracy of LASSO on large genomes is not satisfactory. In this study, we apply two extended models of LASSO, L0 and L1/2 regularization models to infer gene regulatory network from both high-throughput gene expression data and transcription factor binding data in mouse embryonic stem cells (mESCs). We find that both the L0 and L1/2 regularization models significantly outperform  LASSO in network inference. Incorporating interactions between transcription factors and their targets remarkably improved the prediction accuracy. Current study demonstrates the efficiency and applicability of these two models for gene  regulatory network inference from integrative omics data in large genomes. The applications of the two models will facilitate biologists to study the gene regulation of higher model organisms in a genome-wide scale. 
+ 24565265	 BACKGROUND: Chromatin immunoprecipitation (ChIP) experiments are now the most comprehensive experimental approaches for mapping the binding of transcription factors (TFs) to their target genes. However, ChIP data alone is insufficient for identifying functional binding target genes of TFs for two reasons. First, there  is an inherent high false positive/negative rate in ChIP-chip or ChIP-seq experiments. Second, binding signals in the ChIP data do not necessarily imply functionality. METHODS: It is known that ChIP-chip data and TF knockout (TFKO) data reveal complementary information on gene regulation. While ChIP-chip data can provide TF-gene binding pairs, TFKO data can provide TF-gene regulation pairs. Therefore, we propose a novel network approach for identifying functional TF-gene binding pairs by integrating the ChIP-chip data with the TFKO data. In our method, a TF-gene binding pair from the ChIP-chip data is regarded to be functional if it also has high confident curated TFKO TF-gene regulatory relation or deduced hypostatic TF-gene regulatory relation. RESULTS AND CONCLUSIONS: We first validated our method on a gathered ground truth set. Then we applied our method to the ChIP-chip data to identify functional TF-gene binding pairs. The biological significance of our identified functional TF-gene binding pairs was shown by assessing their functional enrichment, the prevalence of protein-protein interaction, and expression coherence. Our results  outperformed the results of three existing methods across all measures. And our identified functional targets of TFs also showed statistical significance over the randomly assigned TF-gene pairs. We also showed that our method is dataset independent and can apply to ChIP-seq data and the E. coli genome. Finally, we provided an example showing the biological applicability of our notion. 
+ 24053571	 BACKGROUND: Studies of protein association with DNA on a genome wide scale are possible through methods like ChIP-Chip or ChIP-Seq. Massive problems with false  positive signals in our own experiments motivated us to revise the standard ChIP-Chip protocol. Analysis of chromosome wide binding of the alternative sigma  factor σ³² in Escherichia coli with this new protocol resulted in detection of only a subset of binding sites found in a previous study by Wade and colleagues.  We suggested that the remainder of binding sites detected in the previous study are likely to be false positives. In a recent article the Wade group claimed that our conclusion is wrong and that the disputed sites are genuine σ³² binding sites. They further claimed that the non-detection of these sites in our study was due to low data quality. RESULTS/DISCUSSION: We respond to the criticism of Wade and colleagues and discuss some general questions of ChIP-based studies. We outline why the quality  of our data is sufficient to derive meaningful results. Specific points are: (i)  the modifications we introduced into the standard ChIP-Chip protocol do not necessarily result in a low dynamic range, (ii) correlation between ChIP-Chip replicates should not be calculated based on the whole data set as done in transcript analysis, (iii) control experiments are essential for identifying false positives. Suggestions are made how ChIP-based methods could be further optimized and which alternative approaches can be used to strengthen conclusions. CONCLUSION: We appreciate the ongoing discussion about the ChIP-Chip method and hope that it helps other scientists to analyze and interpret their results. The modifications we introduced into the ChIP-Chip protocol are a first step towards  reducing false positive signals but there is certainly potential for further optimization. 
The discussion about the σ³² binding sites in question highlights the need for alternative approaches and further investigation of appropriate methods for verification. 
+ 23717649	 Fis, one of the most important nucleoid-associated proteins, functions as a global regulator of transcription in bacteria that has been comprehensively studied in Escherichia coli K12. Fis also influences the virulence of Salmonella  enterica and pathogenic E. coli by regulating their virulence genes, however, the relevant mechanism is unclear. In this report, using combined RNA-seq and chromatin immunoprecipitation (ChIP)-seq technologies, we first identified 1646 Fis-regulated genes and 885 Fis-binding targets in the S. enterica serovar Typhimurium, and found a Fis regulon different from that in E. coli. Fis has been reported to contribute to the invasion ability of S. enterica. By using cell infection assays, we found it also enhances the intracellular replication ability of S. enterica within macrophage cell, which is of central importance for the pathogenesis of infections. Salmonella pathogenicity islands (SPI)-1 and SPI-2 are crucial for the invasion and survival of S. enterica in host cells. Using mutation and overexpression experiments, real-time PCR analysis, and electrophoretic mobility shift assays, we demonstrated that Fis regulates 63 of the 94 Salmonella pathogenicity island (SPI)-1 and SPI-2 genes, by three regulatory modes: i) binds to SPI regulators in the gene body or in upstream regions; ii) binds to SPI genes directly to mediate transcriptional activation of themselves and downstream genes; iii) binds to gene encoding OmpR which affects SPI gene expression by controlling SPI regulators SsrA and HilD. Our results provide new insights into the impact of Fis on SPI genes and the pathogenicity of S. enterica. 
+ 23580539	 Accurate identification of the DNA-binding sites of transcription factors and other DNA-binding proteins on the genome is crucial to understanding their molecular interactions with DNA. Here, we describe a new method: Genome Footprinting by high-throughput sequencing (GeF-seq), which combines in vivo DNase I digestion of genomic DNA with ChIP coupled with high-throughput sequencing. We have determined the in vivo binding sites of a Bacillus subtilis global regulator, AbrB, using GeF-seq. This method shows that exact DNA-binding sequences, which were protected from in vivo DNase I digestion, were resolved at  a comparable resolution to that achieved by in vitro DNase I footprinting, and this was simply attained without the necessity of prediction by peak-calling programs. Moreover, DNase I digestion of the bacterial nucleoid resolved the closely positioned AbrB-binding sites, which had previously appeared as one peak  in ChAP-chip and ChAP-seq experiments. The high-resolution determination of AbrB-binding sites using GeF-seq enabled us to identify bipartite TGGNA motifs in 96% of the AbrB-binding sites. Interestingly, in a thousand binding sites with very low-binding intensities, single TGGNA motifs were also identified. Thus, GeF-seq is a powerful method to elucidate the molecular mechanism of target protein binding to its cognate DNA sequences. 
+ 23511241	 As the first, and usually rate-limiting, step of transcription initiation, bacterial RNA polymerase (RNAP) binds to double stranded DNA (dsDNA) and subsequently opens the two strands of DNA (the open complex formation). The rate  determining step in the open complex formation is opening of a short (6 bp) DNA called the -10 region, which interacts with RNAP in both dsDNA and single stranded (ssDNA) forms. Accordingly, formation of the open complex depends on (physically independent) domains of RNAP that interact with ssDNA and dsDNA, as well as on parameters of DNA melting and sequences of -10 regions. We here aim to understand how these different interactions are mutually related to ensure efficient open complex formation. To achieve this, we use a recently developed biophysical model of transcription initiation, which allows the calculation of the kinetic parameters of transcription initiation on the scale of whole genome.  We consequently investigate kinetic properties of sequences derived from all E. coli intergenic regions, and from more than 300 experimentally confirmed E. coli  σ(70) promoters. We find that interaction specificities of σ(70) DNA binding domains reduce the number of sequences where RNAP binds strongly, but forms the open complex too slowly to achieve functional transcription (so-called poised promoters). However, we find that, despite this reduction, there is still a significant number of such poised promoters in the intergenic regions, which may  provide a major source of false positives in genome-wide searches of transcription start sites. Furthermore, we surprisingly find that sequences of -10 regions of the functional promoters increase the extent of RNAP poising, which we interpret in terms of an extension of a recently proposed model of promoter recognition ('mix-and-match model') to kinetic parameters. 
Overall, our  results allow better understanding of the design of σ(70) DNA binding domains and promoter sequences, and place a fundamental limit on accuracy of methods for promoter detection that are based on strong RNAP binding (e.g. ChIP-chip). 
+ 23470992	 Salmonella Typhi and Typhimurium diverged only ∼50 000 years ago, yet have very different host ranges and pathogenicity. Despite the availability of multiple whole-genome sequences, the genetic differences that have driven these changes in phenotype are only beginning to be understood. In this study, we use transposon-directed insertion-site sequencing to probe differences in gene requirements for competitive growth in rich media between these two closely related serovars. We identify a conserved core of 281 genes that are required for growth in both serovars, 228 of which are essential in Escherichia coli. We are able to identify active prophage elements through the requirement for their repressors. We also find distinct differences in requirements for genes involved  in cell surface structure biogenesis and iron utilization. Finally, we demonstrate that transposon-directed insertion-site sequencing is not only applicable to the protein-coding content of the cell but also has sufficient resolution to generate hypotheses regarding the functions of non-coding RNAs (ncRNAs) as well. We are able to assign probable functions to a number of cis-regulatory ncRNA elements, as well as to infer likely differences in trans-acting ncRNA regulatory networks. 
+ 23275538	 Nanobodies® are single-domain antibody fragments derived from camelid heavy-chain antibodies. Because of their small size, straightforward production in Escherichia coli, easy tailoring, high affinity, specificity, stability and solubility, nanobodies® have been exploited in various biotechnological applications. A major challenge in the post-genomics and post-proteomics era is the identification of regulatory networks involving nucleic acid-protein and protein-protein interactions. Here, we apply a nanobody® in chromatin immunoprecipitation followed by DNA microarray hybridization (ChIP-chip) for genome-wide identification of DNA-protein interactions. The Lrp-like regulator Ss-LrpB, arguably one of the best-studied specific transcription factors of the hyperthermophilic archaeon Sulfolobus solfataricus, was chosen for this proof-of-principle nanobody®-assisted ChIP. Three distinct Ss-LrpB-specific nanobodies®, each interacting with a different epitope, were generated for ChIP.  Genome-wide ChIP-chip with one of these nanobodies® identified the well-established Ss-LrpB binding sites and revealed several unknown target sequences. Furthermore, these ChIP-chip profiles revealed auxiliary operator sites in the open reading frame of Ss-lrpB. Our work introduces nanobodies® as a  novel class of affinity reagents for ChIP. Taking into account the unique characteristics of nanobodies®, in particular, their short generation time, nanobody®-based ChIP is expected to further streamline ChIP-chip and ChIP-Seq experiments, especially in organisms with no (or limited) possibility of genetic  manipulation. 
+ 23232715	 Cyclic AMP receptor protein (Crp) is a transcription regulator controlling diverse cellular processes in many bacteria. In Streptomyces coelicolor, it is well established that Crp plays a critical role in spore germination and colony development. Here, we demonstrate that Crp is a key regulator of secondary metabolism and antibiotic production in S. coelicolor and show that it may additionally coordinate precursor flux from primary to secondary metabolism. We found that crp deletion adversely affected the synthesis of three well-characterized antibiotics in S. coelicolor: actinorhodin (Act), undecylprodigiosin (Red), and calcium-dependent antibiotic (CDA). Using chromatin immunoprecipitation-microarray (ChIP-chip) assays, we determined that eight (out  of 22) secondary metabolic clusters encoded by S. coelicolor contained Crp-associated sites. We followed the effect of Crp induction using transcription profiling analyses and found secondary metabolic genes to be significantly affected: included in this Crp-dependent group were genes from six of the clusters identified in the ChIP-chip experiments. Overexpressing Crp in a panel of Streptomyces species led to enhanced antibiotic synthesis and new metabolite production, suggesting that Crp control over secondary metabolism is broadly conserved in the streptomycetes and that Crp overexpression could serve as a powerful tool for unlocking the chemical potential of these organisms. IMPORTANCE Streptomyces produces a remarkably diverse array of secondary metabolites, including many antibiotics. In recent years, genome sequencing has revealed that  these products represent only a small proportion of the total secondary metabolite potential of Streptomyces. There is, therefore, considerable interest  in discovering ways to stimulate the production of new metabolites. 
Here, we show that Crp (the classical regulator of carbon catabolite repression in Escherichia  coli) is a master regulator of secondary metabolism in Streptomyces. It binds to  eight of 22 secondary metabolic gene clusters in the Streptomyces coelicolor genome and directly affects the expression of six of these. Deletion of crp in S. coelicolor leads to dramatic reductions in antibiotic levels, while Crp overexpression enhances antibiotic production. We find that the antibiotic-stimulatory capacity of Crp extends to other streptomycetes, where its overexpression activates the production of "cryptic" metabolites that are not otherwise seen in the corresponding wild-type strain. 
+ 22207717	 Gene expression is tightly regulated by transcription factors and cofactors that  function by directly or indirectly interacting with DNA of the genome. Understanding how and where these proteins bind provides essential information to uncover genetic regulatory mechanisms. We have developed a new method to study DNA-protein interaction in vivo called DNA adenine methyltransferase (Dam)IP, which is based on fusing a protein of interest to a mutant form of Dam from Escherichia coli. We showed previously that DamIP can efficiently identify in vivo binding sites of Dam-tethered human estrogen receptor (hER)α. In current study, we present the cistrome of hERα determined by DamIP and high throughput sequencing (DamIP-seq). The DamIP-seq-defined hERα cistrome identifies many new binding regions and overlaps with those determined by chromatin immunoprecipitation (ChIP)-chip or ChIP-seq. Elements uniquely identified by DamIP-seq include a unique class of elements that show low, but persistent, hERα  binding when reexamined by conventional ChIP. In contrast, DamIP-seq fails to detect some elements with very transient hERα binding. The methyl-adenine modifications introduced by Dam are stable and do not decrease over 12 d. In summary, the current study provides both an alternate view of the hERα cistrome to further understand the mechanism of hERα-mediated transcription and a new tool to explore other transcriptional factors and cofactors that is very different from conventional ChIP. 
+ 19843227	 StpA is a paralogue of the nucleoid-associated protein H-NS that is conserved in  a range of enteric bacteria and had no known function in Salmonella Typhimurium.  We show that 5% of the Salmonella genome is regulated by StpA, which contrasts with the situation in Escherichia coli where deletion of stpA only had minor effects on gene expression. The StpA-dependent genes of S. Typhimurium are a specific subset of the H-NS regulon that are predominantly under the positive control of sigma(38) (RpoS), CRP-cAMP and PhoP. Regulation by StpA varied with growth phase; StpA controlled sigma(38) levels at mid-exponential phase by preventing inappropriate activation of sigma(38) during rapid bacterial growth. In contrast, StpA only activated the CRP-cAMP regulon during late exponential phase. ChIP-chip analysis revealed that StpA binds to PhoP-dependent genes but not to most genes of the CRP-cAMP and sigma(38) regulons. In fact, StpA indirectly regulates sigma(38)-dependent genes by enhancing sigma(38) turnover by repressing the anti-adaptor protein rssC. We discovered that StpA is essential for the dynamic regulation of sigma(38) in response to increased glucose levels.  Our findings identify StpA as a novel growth phase-specific regulator that plays  an important physiological role by linking sigma(38) levels to nutrient availability. 
--- a/oneClass_trainUseless_out/useless.out 0 → 100644
View file @a5eecdb
+++ b/oneClass_trainUseless_out/useless.out 0 → 100644
View file @a5eecdb
+ 29339415	 Escherichia coli K1 strains are major causative agents of invasive disease of newborn infants. The age dependency of infection can be reproduced in neonatal rats. Colonization of the small intestine following oral administration of K1 bacteria leads rapidly to invasion of the blood circulation; bacteria that avoid  capture by the mesenteric lymphatic system and evade antibacterial mechanisms in  the blood may disseminate to cause organ-specific infections such as meningitis.  Some E. coli K1 surface constituents, in particular the polysialic acid capsule,  are known to contribute to invasive potential, but a comprehensive picture of the factors that determine the fully virulent phenotype has not emerged so far. We constructed a library and constituent sublibraries of ∼775,000 Tn5 transposon mutants of E. coli K1 strain A192PP and employed transposon-directed insertion site sequencing (TraDIS) to identify genes required for fitness for infection of  2-day-old rats. Transposon insertions were lacking in 357 genes following recovery on selective agar; these genes were considered essential for growth in nutrient-replete medium. Colonization of the midsection of the small intestine was facilitated by 167 E. coli K1 gene products. Restricted bacterial translocation across epithelial barriers precluded TraDIS analysis of gut-to-blood and blood-to-brain transits; 97 genes were required for survival in  human serum. This study revealed that a large number of bacterial genes, many of  which were not previously associated with systemic E. coli K1 infection, are required to realize full invasive potential. IMPORTANCE: Escherichia coli K1 strains cause life-threatening infections in newborn infants. They are acquired from the  mother at birth and colonize the small intestine, from where they invade the blood and central nervous system. 
It is difficult to obtain information from acutely ill patients that sheds light on physiological and bacterial factors determining invasive disease. Key aspects of naturally occurring age-dependent human infection can be reproduced in neonatal rats. Here, we employ transposon-directed insertion site sequencing to identify genes essential for the in vitro growth of E. coli K1 and genes that contribute to the colonization of susceptible rats. The presence of bottlenecks to invasion of the blood and cerebrospinal compartments precluded insertion site sequencing analysis, but we identified genes for survival in serum. 
+ 28791299	 Increasing evidence that microRNAs (miRNAs) play important roles in the immune response against infectious agents suggests that miRNA might be exploitable as signatures of exposure to specific infectious agents. In order to identify potential early miRNA biomarkers of bacterial infections, human peripheral blood  mononuclear cells (hPBMCs) were exposed to two select agents, Burkholderia pseudomallei K96243 and Francisella tularensis SHU S4, as well as to the nonpathogenic control Escherichia coli DH5α. RNA samples were harvested at three  early time points, 30, 60, and 120 minutes postexposure, then sequenced. RNAseq analyses identified 87 miRNAs to be differentially expressed (DE) in a linear fashion. Of these, 31 miRNAs were tested using the miScript miRNA qPCR assay. Through RNAseq identification and qPCR validation, we identified differentially expressed miRNA species that may be involved in the early response to bacterial infections. Based upon its upregulation at early time points postexposure in two  different individuals, hsa-mir-30c-5p is a miRNA species that could be studied further as a potential biomarker for exposure to these gram-negative intracellular pathogens. Gene ontology functional analyses demonstrated that programmed cell death is the first ranking biological process associated with miRNAs that are upregulated in F. tularensis-exposed hPBMCs. 
+ 28649444	 Inferring transcriptional gene regulatory networks from transcriptomic datasets is a key challenge of systems biology, with potential impacts ranging from medicine to agronomy. There are several techniques used presently to experimentally assay transcription factors to target relationships, defining important information about real gene regulatory networks connections. These techniques include classical ChIP-seq, yeast one-hybrid, or more recently, DAP-seq or target technologies. These techniques are usually used to validate algorithm predictions. Here, we developed a reverse engineering approach based on mathematical and computer simulation to evaluate the impact that this prior knowledge on gene regulatory networks may have on training machine learning algorithms. First, we developed a gene regulatory networks-simulating engine called FRANK (Fast Randomizing Algorithm for Network Knowledge) that is able to simulate large gene regulatory networks (containing 10^4 genes) with characteristics of gene regulatory networks observed in vivo. FRANK also generates stable or oscillatory gene expression directly produced by the simulated gene regulatory networks. The development of FRANK leads to important general conclusions concerning the design of large and stable gene regulatory networks harboring scale free properties (built ex nihilo). In combination with supervised (accepting prior knowledge) support vector machine algorithm we (i) address biologically oriented questions concerning our capacity to accurately reconstruct gene regulatory networks and in particular we demonstrate that prior-knowledge structure is crucial for accurate learning, and (ii) draw conclusions to inform experimental design to perform learning able to solve gene regulatory networks in the future. 
By demonstrating that our predictions concerning the influence of the prior-knowledge structure on support vector machine learning capacity holds true on real data (Escherichia coli K14 network reconstruction using network and transcriptomic data), we show that the formalism used to build FRANK can to some extent be a reasonable model for gene regulatory  networks in real cells. 
+ 28614372	 Infection with Shiga toxin (Stx) producing Escherichia coli O157:H7 can cause the potentially fatal complication hemolytic uremic syndrome, and currently only supportive therapy is available. Lack of suitable animal models has hindered study of this disease. Induced human intestinal organoids (iHIOs), generated by in vitro differentiation of pluripotent stem cells, represent differentiated human intestinal tissue. We show that iHIOs with addition of human neutrophils can model E. coli intestinal infection and innate cellular responses. Commensal and O157:H7 introduced into the iHIO lumen replicated rapidly achieving high numbers. Commensal E. coli did not cause damage, and were completely contained within the lumen, suggesting defenses, such as mucus production, can constrain non-pathogenic strains. Some O157:H7 initially co-localized with cellular actin.  Loss of actin and epithelial integrity was observed after 4 hours. O157:H7 grew as filaments, consistent with activation of the bacterial SOS stress response. SOS is induced by reactive oxygen species (ROS), and O157:H7 infection increased  ROS production. Transcriptional profiling (RNAseq) demonstrated that both commensal and O157:H7 upregulated genes associated with gastrointestinal maturation, while infection with O157:H7 upregulated inflammatory responses, including interleukin 8 (IL-8). IL-8 is associated with neutrophil recruitment, and infection with O157:H7 resulted in recruitment of human neutrophils into the  iHIO tissue. 
+ 28270101	 BACKGROUND: Avian pathogenic E. coli (APEC) can lead to a loss in millions of dollars in poultry annually because of mortality and produce contamination. Studies have verified that many immune-related genes undergo changes in alternative splicing (AS), along with nonsense mediated decay (NMD), to regulate  the immune system under different conditions. Therefore, the splicing profiles of primary lymphoid tissues with systemic APEC infection need to be comprehensively  examined. RESULTS: Gene expression in RNAseq data were obtained for three different immune  tissues (bone marrow, thymus, and bursa) from three phenotype birds (non-challenged, resistant, and susceptible birds) at two time points. Alternative 5' splice sites and exon skipping/inclusion were identified as the major alternative splicing events in avian primary immune organs under systemic APEC infection. In this study, we detected hundreds of differentially-expressed-transcript-containing genes (DETs) between different phenotype birds at 5 days post-infection (dpi). DETs, PSAP and STT3A, with NMD have important functions under systemic APEC infection. DETs, CDC45, CDK1, RAG2,  POLR1B, PSAP, and DNASE1L3, from the same transcription start sites (TSS) indicate that cell death, cell cycle, cellular function, and maintenance were predominant in host under systemic APEC. CONCLUSIONS: With the use of RNAseq technology and bioinformatics tools, this study provides a portrait of the AS event and NMD in primary lymphoid tissues, which play critical roles in host homeostasis under systemic APEC infection. According to this study, AS plays a pivotal regulatory role in the immune response in chicken under systemic APEC infection via either NMD or alternative TSSs. This study elucidates the regulatory role of AS for the immune complex under systemic APEC infection. 
+ 28060822	 Mosquitoes host communities of microbes in their digestive tract that consist primarily of bacteria. We previously reported that Aedes aegypti larvae colonized by a native community of bacteria and gnotobiotic larvae colonized by only Escherichia coli develop very similarly into adults, whereas axenic larvae never  molt and die as first instars. In this study, we extended these findings by first comparing the growth and abundance of bacteria in conventional, gnotobiotic, and  axenic larvae during the first instar. Results showed that conventional and gnotobiotic larvae exhibited no differences in growth, timing of molting, or number of bacteria in their digestive tract. Axenic larvae in contrast grew minimally and never achieved the critical size associated with molting by conventional and gnotobiotic larvae. In the second part of the study we compared  patterns of gene expression in conventional, gnotobiotic and axenic larvae by conducting an RNAseq analysis of gut and nongut tissues (carcass) at 22 h post-hatching. Approximately 12% of Ae. aegypti transcripts were differentially expressed in axenic versus conventional or gnotobiotic larvae. However, this profile consisted primarily of transcripts in seven categories that included the  down-regulation of select peptidases in the gut and up-regulation of several genes in the gut and carcass with roles in amino acid transport, hormonal signaling, and metabolism. Overall, our results indicate that axenic larvae exhibit alterations in gene expression consistent with defects in acquisition and assimilation of nutrients required for growth. 
+ 27872077	 Plasmids of incompatibility group A/C (IncA/C) are becoming increasingly prevalent within pathogenic Enterobacteriaceae. They are associated with the dissemination of multiple clinically relevant resistance genes, including blaCMY  and blaNDM. Current typing methods for IncA/C plasmids offer limited resolution. In this study, we present the complete sequence of a blaNDM-1-positive IncA/C plasmid, pMS6198A, isolated from a multidrug-resistant uropathogenic Escherichia  coli strain. Hypersaturated transposon mutagenesis, coupled with transposon-directed insertion site sequencing (TraDIS), was employed to identify  conserved genetic elements required for replication and maintenance of pMS6198A.  Our analysis of TraDIS data identified roles for the replicon, including repA, a  toxin-antitoxin system; two putative partitioning genes, parAB; and a putative gene, 053. Construction of mini-IncA/C plasmids and examination of their stability within E. coli confirmed that the region encompassing 053 contributes to the stable maintenance of IncA/C plasmids. Subsequently, the four major maintenance genes (repA, parAB, and 053) were used to construct a new plasmid multilocus sequence typing (PMLST) scheme for IncA/C plasmids. Application of this scheme to a database of 82 IncA/C plasmids identified 11 unique sequence types (STs), with  two dominant STs. The majority of blaNDM-positive plasmids examined (15/17; 88%)  fall into ST1, suggesting acquisition and subsequent expansion of this blaNDM-containing plasmid lineage. The IncA/C PMLST scheme represents a standardized tool to identify, track, and analyze the dissemination of important  IncA/C plasmid lineages, particularly in the context of epidemiological studies. 
+ 27836995	 RNA sequencing studies have identified hundreds of non-coding RNAs in bacteria, including regulatory small RNA (sRNA). However, our understanding of sRNA function has lagged behind their identification due to a lack of tools for the high-throughput analysis of RNA-RNA interactions in bacteria. Here we demonstrate that in vivo sRNA-mRNA duplexes can be recovered using UV-crosslinking, ligation  and sequencing of hybrids (CLASH). Many sRNAs recruit the endoribonuclease, RNase E, to facilitate processing of mRNAs. We were able to recover base-paired sRNA-mRNA duplexes in association with RNase E, allowing proximity-dependent ligation and sequencing of cognate sRNA-mRNA pairs as chimeric reads. We verified that this approach captures bona fide sRNA-mRNA interactions. Clustering analyses identified novel sRNA seed regions and sets of potentially co-regulated target mRNAs. We identified multiple mRNA targets for the pathotype-specific sRNA Esr41, which was shown to regulate colicin sensitivity and iron transport in E. coli. Numerous sRNA interactions were also identified with non-coding RNAs, including sRNAs and tRNAs, demonstrating the high complexity of the sRNA interactome. 
+ 27466434	 Avian pathogenic Escherichia coli (APEC) can cause significant morbidity in chickens. The thymus provides the essential environment for T cell development; however, the thymus transcriptome has not been examined for gene expression in response to APEC infection. An improved understanding of the host genomic response to APEC infection could inform future breeding programs for disease resistance and APEC control. We therefore analyzed the transcriptome of the thymus of birds challenged with APEC, contrasting susceptible and resistant phenotypes. Thousands of genes were differentially expressed in birds of the 5-day post infection (dpi) challenged-susceptible group vs. 5 dpi non-challenged, in 5 dpi challenged-susceptible vs. 5 dpi challenged-resistant birds, as well as  in 5 dpi vs. one dpi challenged-susceptible birds. The Toll-like receptor signaling pathway was the major innate immune response for birds to respond to APEC infection. Moreover, lysosome and cell adhesion molecules pathways were common mechanisms for chicken response to APEC infection. The T-cell receptor signaling pathway, cell cycle, and p53 signaling pathways were significantly activated in resistant birds to resist APEC infection. These results provide a comprehensive assessment of global gene networks and biological functionalities of differentially expressed genes in the thymus under APEC infection. These findings provide novel insights into key molecular genetic mechanisms that differentiate host resistance from susceptibility in this primary lymphoid tissue, the thymus. 
+ 27424527	 Thermobifida fusca is a thermophilic actinobacterium. T. fusca muC obtained by adaptive evolution preferred yeast extract to ammonium sulfate for accumulating malic acid and ammonium sulfate for cell growth. We did transcriptome analysis of T. fusca muC on Avicel and cellobiose with addition of ammonium sulfate or yeast  extract, respectively by RNAseq. The transcriptional results indicate that ammonium sulfate induced the transcriptions of the genes related to carbohydrate  metabolisms significantly more than yeast extract. Importantly, Tfu_2487, encoding histidine-containing protein (HPr), didn't transcribe on yeast extract at all, while it transcribed highly on ammonium sulfate. In order to understand the impact of HPr on malate production and cell growth of the muC strain, we deleted Tfu_2487 to get a mutant strain: muCΔ2487, which had 1.33 mole/mole-glucose equivalent malate yield, much higher than that on yeast extract. We then developed an E. coli-T. fusca shuttle plasmid for over-expressing HPr in muCΔ2487, a strain without HPr background, forming the muCΔ2487S strain. The muCΔ2487S strain had a much lower malate yield but faster cell growth than the muC strain. The results of both mutant strains confirmed that HPr was the key regulatory protein for T. fusca's metabolisms on nitrogen sources. 
+ 27336699	 Our objective was to identify the biological response and the cross-talk between  liver and mammary tissue after intramammary infection (IMI) with Escherichia coli (E. coli) using RNAseq technology. Sixteen cows were inoculated with live E. coli into one mammary quarter at ~4-6 weeks in lactation. For all cows, biopsies were  performed at -144, 12 and 24 h relative to IMI in liver and at 24 h post-IMI in infected and non-infected (control) mammary quarters. For a subset of cows (n = 6), RNA was extracted from both liver and mammary tissue and sequenced using a 100 bp paired-end approach. Ingenuity Pathway Analysis and the Dynamic Impact Approach analysis of differentially expressed genes (overall effect False Discovery Rate≤0.05) indicated that IMI induced an overall activation of inflammation at 12 h post-IMI and a strong inhibition of metabolism, especially related to lipid, glucose, and xenobiotics at 24 h post-IMI in liver. The data indicated in mammary tissue an overall induction of inflammatory response with little effect on metabolism at 24 h post-IMI. We identified a large number of up-stream regulators potentially involved in the response to IMI in both tissues  but a relatively small core network of transcription factors controlling the response to IMI for liver whereas a large network in mammary tissue. Transcriptomic results in liver and mammary tissue were supported by changes in inflammatory and metabolic mediators in blood and milk. The analysis of potential cross-talk between the two tissues during IMI uncovered a large communication from the mammary tissue to the liver to coordinate the inflammatory response but  a relatively small communication from the liver to the mammary tissue. Our results indicate a strong induction of the inflammatory response in mammary tissue and impairment of liver metabolism 24h post-IMI partly driven by the signaling from infected mammary tissue. 
+ 27298336	 R loops form when transcripts hybridize to homologous DNA on chromosomes, yielding a DNA:RNA hybrid and a displaced DNA single strand. R loops impact the genome of many organisms, regulating chromosome stability, gene expression, and DNA repair. Understanding the parameters dictating R-loop formation in vivo has been hampered by the limited quantitative and spatial resolution of current genomic strategies for mapping R loops. We report a novel whole-genome method, S1-DRIP-seq (S1 nuclease DNA:RNA immunoprecipitation with deep sequencing), for mapping hybrid-prone regions in budding yeast Saccharomyces cerevisiae. Using this methodology, we identified ∼800 hybrid-prone regions covering 8% of the genome. Given the pervasive transcription of the yeast genome, this result suggests that  R-loop formation is dictated by characteristics of the DNA, RNA, and/or chromatin. We successfully identified two features highly predictive of hybrid formation: high transcription and long homopolymeric dA:dT tracts. These accounted for >60% of the hybrid regions found in the genome. We demonstrated that these two factors play a causal role in hybrid formation by genetic manipulation. Thus, the hybrid map generated by S1-DRIP-seq led to the identification of the first global genomic features causal for R-loop formation in yeast. 
+ 27004424	 BACKGROUND: Biofilm formation is an important survival strategy of Salmonella in  all environments. By mutant screening, we showed a knock-out mutant of fabR, encoding a repressor of unsaturated fatty acid biosynthesis (UFA), to have impaired biofilm formation. In order to unravel how this regulator impinges on Salmonella biofilm formation, we aimed at elucidating the S. Typhimurium FabR regulon. Hereto, we applied a combinatorial high-throughput approach, combining ChIP-chip with transcriptomics. RESULTS: All the previously identified E. coli FabR transcriptional target genes  (fabA, fabB and yqfA) were shown to be direct S. Typhimurium FabR targets as well. As we found a fabB overexpressing strain to partly mimic the biofilm defect of the fabR mutant, the effect of FabR on biofilms can be attributed at least partly to FabB, which plays a key role in UFA biosynthesis. Additionally, ChIP-chip identified a number of novel direct FabR targets (the intergenic regions between hpaR/hpaG and ddg/ydfZ) and yet putative direct targets (i.a. genes involved in tRNA metabolism, ribosome synthesis and translation). Next to UFA biosynthesis, a number of these direct targets and other indirect targets identified by transcriptomics (e.g. ribosomal genes, ompA, ompC, ompX, osmB, osmC, sseI), could possibly contribute to the effect of FabR on biofilm formation. CONCLUSION: Overall, our results point at the importance of FabR and UFA biosynthesis in Salmonella biofilm formation and their role as potential targets  for biofilm inhibitory strategies. 
+ 26706151	 Proper division site selection is crucial for the survival of all organisms. What still eludes us is how bacteria position their division site with high precision, and in tight coordination with chromosome replication and segregation. Until recently, the general belief, at least in the model organisms Bacillus subtilis and Escherichia coli, was that spatial regulation of division comes about by the  combined negative regulatory mechanisms of the Min system and nucleoid occlusion. However, as we review here, these two systems cannot be solely responsible for division site selection and we highlight additional regulatory mechanisms that are at play. In this review, we put forward evidence of how chromosome replication and segregation may have direct links with cell division in these bacteria and the benefit of recent advances in chromosome conformation capture techniques in providing important information about how these three processes mechanistically work together to achieve accurate generation of progenitor cells. 
+ 26131613	 Escherichia coli ST131 is a recently emerged and globally disseminated multidrug  resistant clone associated with urinary tract and bloodstream infections in both  community and clinical settings. The most common group of ST131 strains are defined by resistance to fluoroquinolones and possession of the type 1 fimbriae fimH30 allele. Here we provide an update on our recent work describing the global epidemiology of ST131. We review the phylogeny of ST131 based on whole genome sequence data and highlight the important role of recombination in the evolution of this clonal lineage. We also summarize our findings on the virulence of the ST131 reference strain EC958, and highlight the use of transposon directed insertion-site sequencing to define genes associated with serum resistance and essential features of its large antibiotic resistance plasmid pEC958. 
+ 25875675	 Escherichia coli sequence type 131 (E. coli ST131) is a recently emerged and globally disseminated multidrug resistant clone associated with urinary tract and bloodstream infections. Plasmids represent a major vehicle for the carriage of antibiotic resistance genes in E. coli ST131. In this study, we determined the complete sequence and performed a comprehensive annotation of pEC958, an IncF plasmid from the E. coli ST131 reference strain EC958. Plasmid pEC958 is 135.6 kb in size, harbours two replicons (RepFIA and RepFII) and contains 12 antibiotic resistance genes (including the blaCTX-M-15 gene). We also carried out hyper-saturated transposon mutagenesis and multiplexed transposon directed insertion-site sequencing (TraDIS) to investigate the biology of pEC958. TraDIS data showed that while only the RepFII replicon was required for pEC958 replication, the RepFIA replicon contains genes essential for its partitioning. Thus, our data provides direct evidence that the RepFIA and RepFII replicons in pEC958 cooperate to ensure their stable inheritance. The gene encoding the antitoxin component (ccdA) of the post-segregational killing system CcdAB was also protected from mutagenesis, demonstrating this system is active. Sequence comparison with a global collection of ST131 strains suggest that IncF represents the most common type of plasmid in this clone, and underscores the need to understand its evolution and contribution to the spread of antibiotic resistance  genes in E. coli ST131. 
+ 25873626	 The cMonkey integrated biclustering algorithm identifies conditionally co-regulated modules of genes (biclusters). cMonkey integrates various orthogonal pieces of information which support evidence of gene co-regulation, and optimizes biclusters to be supported simultaneously by one or more of these prior constraints. The algorithm served as the cornerstone for constructing the first global, predictive Environmental Gene Regulatory Influence Network (EGRIN) model  for a free-living cell, and has now been applied to many more organisms. However, due to its computational inefficiencies, long run-time and complexity of various  input data types, cMonkey was not readily usable by the wider community. To address these primary concerns, we have significantly updated the cMonkey algorithm and refactored its implementation, improving its usability and extendibility. These improvements provide a fully functioning and user-friendly platform for building co-regulated gene modules and the tools necessary for their exploration and interpretation. We show, via three separate analyses of data for  E. coli, M. tuberculosis and H. sapiens, that the updated algorithm and inclusion of novel scoring functions for new data types (e.g. ChIP-seq and transcription factor over-expression [TFOE]) improve discovery of biologically informative co-regulated modules. The complete cMonkey2 software package, including source code, is available at https://github.com/baliga-lab/cmonkey2. 
+ 25757765	 Plants consist of many functionally specialized cell types, each with its own unique epigenome, transcriptome, and proteome. Characterization of these cell type-specific properties is essential to understanding cell fate specification and the responses of individual cell types to the environment. In this chapter we describe an approach to map chromatin features in specific cell types of Arabidopsis thaliana using nuclei purification from individual cell types with the INTACT method (isolation of nuclei tagged in specific cell types) followed by chromatin immunoprecipitation and high-throughput sequencing (ChIP-seq). The INTACT system employs two transgenes to generate affinity-labeled nuclei in the cell type of interest, and these tagged nuclei can then be selectively purified from tissue homogenates. The primary transgene encodes the nuclear tagging fusion protein (NTF), which consists of a nuclear envelope-targeting domain, the green fluorescent protein, and a biotin ligase recognition peptide, while the second transgene encodes the E. coli biotin ligase (BirA), which selectively biotinylates NTF. Expression of NTF and BirA in a specific cell type thus yields  nuclei that are coated with biotin and can be purified by virtue of their affinity for streptavidin-coated magnetic beads. Compared with the original INTACT nuclei purification protocol, the procedure presented here is greatly simplified and shortened. After nuclei purification, we provide detailed instructions for chromatin isolation, shearing, and immunoprecipitation. Finally, we present a low input ChIP-seq library preparation protocol based on the nano-ChIP-seq method of Adli and Bernstein, and we describe multiplex Illumina sequencing of these libraries to produce high quality, cell type-specific epigenome profiles at a relatively low cost. The procedures given here are optimized for Arabidopsis but should be easily adaptable to other plant species. 
+ 25085508	 BACKGROUND: Burkholderia pseudomallei is a facultative intracellular pathogen and the causative agent of melioidosis. A conserved type III secretion system (T3SS3) and type VI secretion system (T6SS1) are critical for intracellular survival and  growth. The T3SS3 and T6SS1 genes are coordinately and hierarchically regulated by a TetR-type regulator, BspR. A central transcriptional regulator of the BspR regulatory cascade, BsaN, activates a subset of T3SS3 and T6SS1 loci. RESULTS: To elucidate the scope of the BsaN regulon, we used RNAseq analysis to compare the transcriptomes of wild-type B. pseudomallei KHW and a bsaN deletion mutant. The 60 genes positively-regulated by BsaN include those that we had previously identified in addition to a polyketide biosynthesis locus and genes involved in amino acid biosynthesis. BsaN was also found to repress the transcription of 51 genes including flagellar motility loci and those encoding components of the T3SS3 apparatus. Using a promoter-lacZ fusion assay in E. coli, we show that BsaN together with the chaperone BicA directly control the expression of the T3SS3 translocon, effector and associated regulatory genes that are organized into at least five operons (BPSS1516-BPSS1552). Using a mutagenesis approach, a consensus regulatory motif in the promoter regions of BsaN-regulated  genes was shown to be essential for transcriptional activation. CONCLUSIONS: BsaN/BicA functions as a central regulator of key virulence clusters in B. pseudomallei within a more extensive network of genetic regulation. We propose that BsaN/BicA controls a gene expression program that facilitates the adaption and intracellular survival of the pathogen within eukaryotic hosts. 
+ 24743342	 DNA:RNA hybrid formation is emerging as a significant cause of genome instability in biological systems ranging from bacteria to mammals. Here we describe the genome-wide distribution of DNA:RNA hybrid prone loci in Saccharomyces cerevisiae by DNA:RNA immunoprecipitation (DRIP) followed by hybridization on tiling microarray. These profiles show that DNA:RNA hybrids preferentially accumulated at rDNA, Ty1 and Ty2 transposons, telomeric repeat regions and a subset of open reading frames (ORFs). The latter are generally highly transcribed and have high  GC content. Interestingly, significant DNA:RNA hybrid enrichment was also detected at genes associated with antisense transcripts. The expression of antisense-associated genes was also significantly altered upon overexpression of  RNase H, which degrades the RNA in hybrids. Finally, we uncover mutant-specific differences in the DRIP profiles of a Sen1 helicase mutant, RNase H deletion mutant and Hpr1 THO complex mutant compared to wild type, suggesting different roles for these proteins in DNA:RNA hybrid biology. Our profiles of DNA:RNA hybrid prone loci provide a resource for understanding the properties of hybrid-forming regions in vivo, extend our knowledge of hybrid-mitigating enzymes, and contribute to models of antisense-mediated gene regulation. A summary of this paper was presented at the 26th International Conference on Yeast Genetics and Molecular Biology, August 2013. 
+ 24098145	 Escherichia coli ST131 is a globally disseminated, multidrug resistant clone responsible for a high proportion of urinary tract and bloodstream infections. The rapid emergence and successful spread of E. coli ST131 is strongly associated with antibiotic resistance; however, this phenotype alone is unlikely to explain  its dominance amongst multidrug resistant uropathogens circulating worldwide in hospitals and the community. Thus, a greater understanding of the molecular mechanisms that underpin the fitness of E. coli ST131 is required. In this study, we employed hyper-saturated transposon mutagenesis in combination with multiplexed transposon directed insertion-site sequencing to define the essential genes required for in vitro growth and the serum resistome (i.e. genes required for resistance to human serum) of E. coli EC958, a representative of the predominant E. coli ST131 clonal lineage. We identified 315 essential genes in E. coli EC958, 231 (73%) of which were also essential in E. coli K-12. The serum resistome comprised 56 genes, the majority of which encode membrane proteins or factors involved in lipopolysaccharide (LPS) biosynthesis. Targeted mutagenesis confirmed a role in serum resistance for 46 (82%) of these genes. The murein lipoprotein Lpp, along with two lipid A-core biosynthesis enzymes WaaP and WaaG,  were most strongly associated with serum resistance. While LPS was the main resistance mechanism defined for E. coli EC958 in serum, the enterobacterial common antigen and colanic acid also impacted on this phenotype. Our analysis also identified a novel function for two genes, hyxA and hyxR, as minor regulators of O-antigen chain length. This study offers novel insight into the genetic make-up of E. coli ST131, and provides a framework for future research on E. 
coli and other Gram-negative pathogens to define their essential gene repertoire and to dissect the molecular mechanisms that enable them to survive in the bloodstream and cause disease. 
+ 23865838	 BACKGROUND: Identification of transcription factor binding sites (also called 'motif discovery') in DNA sequences is a basic step in understanding genetic regulation. Although many successful programs have been developed, the problem is far from being solved on account of diversity in gene expression/regulation and the low specificity of binding sites. State-of-the-art algorithms have their own  constraints (e.g., high time or space complexity for finding long motifs, low precision in identification of weak motifs, or the OOPS constraint: one occurrence of the motif instance per sequence) which limit their scope of application. RESULTS: In this paper, we present a novel and fast algorithm we call TFBSGroup.  It is based on community detection from a graph and is used to discover long and  weak (l,d) motifs under the ZOMOPS constraint (zero, one or multiple occurrence(s) of the motif instance(s) per sequence), where l is the length of a  motif and d is the maximum number of mutations between a motif instance and the motif itself. Firstly, TFBSGroup transforms the (l, d) motif search in sequences  to focus on the discovery of dense subgraphs within a graph. It identifies these  subgraphs using a fast community detection method for obtaining coarse-grained candidate motifs. Next, it greedily refines these candidate motifs towards the true motif within their own communities. Empirical studies on synthetic (l, d) samples have shown that TFBSGroup is very efficient (e.g., it can find true (18,  6), (24, 8) motifs within 30 seconds). More importantly, the algorithm has succeeded in rapidly identifying motifs in a large data set of prokaryotic promoters generated from the Escherichia coli database RegulonDB. The algorithm has also accurately identified motifs in ChIP-seq data sets for 12 mouse transcription factors involved in ES cell pluripotency and self-renewal. 
CONCLUSIONS: Our novel heuristic algorithm, TFBSGroup, is able to quickly identify nearly exact matches for long and weak (l, d) motifs in DNA sequences under the ZOMOPS constraint. It is also capable of finding motifs in real applications. The source code for TFBSGroup can be obtained from http://bioinformatics.bioengr.uic.edu/TFBSGroup/. 
+ 23190111	 OmpR is a multifunctional DNA binding regulator with orthologues in many enteric  bacteria that exhibits classical regulator activity as well as nucleoid-associated protein-like characteristics. In the enteric pathogen Salmonella enterica, using chromatin immunoprecipitation of OmpR:FLAG and nucleotide sequencing, 43 putative OmpR binding sites were identified in S. enterica serovar Typhi, 22 of which were associated with OmpR-regulated genes. Mutation of a sequence motif (TGTWACAW) that was associated with the putative OmpR binding sites abrogated binding of OmpR:6×His to the tviA upstream region. A core set of 31 orthologous genes were found to exhibit OmpR-dependent expression  in both S. Typhi and S. Typhimurium. S. Typhimurium-encoded orthologues of two divergently transcribed OmpR-regulated operons (SL1068-71 and SL1066-67) had a putative OmpR binding site in the inter-operon region in S. Typhi, and were characterized using in vitro and in vivo assays. These operons are widely distributed within S. enterica but absent from the closely related Escherichia coli. SL1066 and SL1067 were required for growth on N-acetylmuramic acid as a sole carbon source. SL1068-71 exhibited sequence similarity to sialic acid uptake systems and contributed to colonization of the ileum and caecum in the streptomycin-pretreated mouse model of colitis. 
+ 22923524	 Typical approaches for predicting transcription factor binding sites (TFBSs) involve use of a position-specific weight matrix (PWM) to statistically characterize the sequences of the known sites. Recently, an alternative physicochemical approach, called SiteSleuth, was proposed. In this approach, a linear support vector machine (SVM) classifier is trained to distinguish TFBSs from background sequences based on local chemical and structural features of DNA. SiteSleuth appears to generally perform better than PWM-based methods. Here, we improve the SiteSleuth approach by considering both new physicochemical features  and algorithmic modifications. New features are derived from Gibbs energies of amino acid-DNA interactions and hydroxyl radical cleavage profiles of DNA. Algorithmic modifications consist of inclusion of a feature selection step, use of a nonlinear kernel in the SVM classifier, and use of a consensus-based post-processing step for predictions. We also considered SVM classification based on letter features alone to distinguish performance gains from use of SVM-based models versus use of physicochemical features. The accuracy of each of the variant methods considered was assessed by cross validation using data available  in the RegulonDB database for 54 Escherichia coli TFs, as well as by experimental validation using published ChIP-chip data available for Fis and Lrp. 
+ 22890136	 Two transcription termination mechanisms - intrinsic and Rho-dependent - have evolved in bacteria. The Rho factor occurs in most bacterial lineages, and has been hypothesized to play a global regulatory role. Genome-wide studies using microarray, 2D-gel electrophoresis and ChIP-chip provided evidence that Rho serves to silence transcription from horizontally acquired genes and prophages in Escherichia coli K-12, implicating the factor to be a part of the "cellular immune mechanism" protecting against deleterious phages and aberrant gene expression from acquired xenogenic DNA. We have investigated this model by adopting an alternate in silico approach and have extended the study to other species. Our analysis shows that several genomic islands across diverse phyla have under-representation of intrinsic terminators, similar to that experimentally observed in E. coli K-12. This implies that Rho-dependent termination is the predominant process operational in these islands and that silencing of foreign DNA is a conserved function of Rho. From the present analysis, it is evident that horizontally acquired islands have lost intrinsic terminators to facilitate Rho-dependent termination. These results underscore the importance of Rho as a conserved, genome-wide sentinel that regulates potentially toxic xenogenic DNA. 
+ 22555467	 Signature tagged mutagenesis is a genetic approach that was developed to identify novel bacterial virulence factors. It is a negative selection method in which unique identification tags allow analysis of pools of mutants in mixed populations. The approach is particularly well suited to functional genetic analysis of the gastrointestinal phase of infection in foodborne pathogens and has the capacity to guide the development of novel vaccines and therapeutics. In  this review we outline the technical principles underpinning signature-tagged mutagenesis as well as novel sequencing-based approaches for transposon mutant identification such as TraDIS (transposon directed insertion-site sequencing). We also provide an analysis of screens that have been performed in gastrointestinal  pathogens which are a global health concern (Escherichia coli, Listeria monocytogenes, Helicobacter pylori, Vibrio cholerae and Salmonella enterica). The identification of key virulence loci through the use of signature tagged mutagenesis in mice and relevant larger animal models is discussed. 
+ 21515770	 Bacterial Gre factors associate with RNA polymerase (RNAP) and stimulate intrinsic cleavage of the nascent transcript at the active site of RNAP. Biochemical and genetic studies to date have shown that Escherichia coli Gre factors prevent transcriptional arrest during elongation and enhance transcription fidelity. Furthermore, Gre factors participate in the stimulation of promoter escape and the suppression of promoter-proximal pausing during the beginning of RNA synthesis in E. coli. Although Gre factors are conserved in general bacteria, limited functional studies have been performed in bacteria other than E. coli. In this investigation, ChAP-chip analysis (chromatin affinity precipitation coupled with DNA microarray) was conducted to visualize the distribution of Bacillus subtilis GreA on the chromosome and to determine the effects of GreA inactivation on core RNAP trafficking. Our data show that GreA is uniformly distributed in the transcribed region from the promoter to coding region with core RNAP, and its inactivation induces RNAP accumulation at many promoter or promoter-proximal regions. Based on these findings, we propose that GreA would constantly associate with core RNAP during transcriptional initiation  and elongation and resolves its stalling at promoter or promoter-proximal regions, thus contributing to the even distribution of RNAP along the promoter and coding regions in B. subtilis cells. 
+ 21278291	 Massively parallel sequencing of transposon-flanking regions assigned the genotype and fitness score to 91% of Escherichia coli O157:H7 mutants previously  screened in cattle by signature-tagged mutagenesis (STM). The method obviates the limitations of STM and markedly extended the functional annotation of the prototype E. coli O157:H7 genome without further animal use. 
+ 21124945	 An important step in understanding gene regulation is to identify the DNA binding sites recognized by each transcription factor (TF). Conventional approaches to prediction of TF binding sites involve the definition of consensus sequences or position-specific weight matrices and rely on statistical analysis of DNA sequences of known binding sites. Here, we present a method called SiteSleuth in  which DNA structure prediction, computational chemistry, and machine learning are applied to develop models for TF binding sites. In this approach, binary classifiers are trained to discriminate between true and false binding sites based on the sequence-specific chemical and structural features of DNA. These features are determined via molecular dynamics calculations in which we consider  each base in different local neighborhoods. For each of 54 TFs in Escherichia coli, for which at least five DNA binding sites are documented in RegulonDB, the  TF binding sites and portions of the non-coding genome sequence are mapped to feature vectors and used in training. According to cross-validation analysis and  a comparison of computational predictions against ChIP-chip data available for the TF Fis, SiteSleuth outperforms three conventional approaches: Match, MATRIX SEARCH, and the method of Berg and von Hippel. SiteSleuth also outperforms QPMEME, a method similar to SiteSleuth in that it involves a learning algorithm.  The main advantage of SiteSleuth is a lower false positive rate. 
+ 21051353	 Immuno-precipitation of protein-DNA complexes followed by microarray hybridization is a powerful and cost-effective technology for discovering protein-DNA binding events at the genome scale. It is still an unresolved challenge to comprehensively, accurately and sensitively extract binding event information from the produced data. We have developed a novel strategy composed of an information-preserving signal-smoothing procedure, higher order derivative  analysis and application of the principle of maximum entropy to address this challenge. Importantly, our method does not require any input parameters to be specified by the user. Using genome-scale binding data of two Escherichia coli global transcription regulators for which a relatively large number of experimentally supported sites are known, we show that ∼90% of known sites were resolved to within four probes, or ∼88 bp. Over half of the sites were resolved to within two probes, or ∼38 bp. Furthermore, we demonstrate that our strategy delivers significant quantitative and qualitative performance gains over available methods. Such accurate and sensitive binding site resolution has important consequences for accurately reconstructing transcriptional regulatory networks, for motif discovery, for furthering our understanding of local and non-local factors in protein-DNA interactions and for extending the usefulness horizon of the ChIP-chip platform. 
+ 20817769	 To obtain insight into the in vivo dynamics of RNA polymerase (RNAP) on the Bacillus subtilis genome, we analyzed the distribution of the σ(A) and β' subunits of RNAP and the NusA elongation factor on the genome in exponentially growing cells using chromatin affinity precipitation coupled with gene chip mapping (ChAP-chip). In contrast to Escherichia coli RNAP, which often accumulates at the promoter-proximal region, B. subtilis RNAP is evenly distributed from the promoter to the coding sequences. This finding suggests that, in general, B. subtilis RNAP recruited to the promoter promptly translocates away from the promoter to form the elongation complex and proceeds without intragenic transcription attenuation. We detected RNAP accumulation in the promoter-proximal regions of some genes, most of which can be identified as transcription attenuation systems in the leader region. Our findings suggest that the differences in RNAP behavior between E. coli and B. subtilis during initiation and elongation steps might result in distinct strategies for postinitiation control of transcription. The E. coli mechanism involves trapping  at the promoter and promoter-proximal pausing of RNAP in addition to transcription attenuation, whereas transcription attenuation in leader sequences  is mainly employed in B. subtilis. 
+ 20639326	 Histone-like protein H1 (H-NS) family proteins are nucleoid-associated proteins (NAPs) conserved among many bacterial species. The IncP-7 plasmid pCAR1 is transmissible among various Pseudomonas strains and carries a gene encoding the H-NS family protein, Pmr. Pseudomonas putida KT2440 is a host of pCAR1, which harbors five genes encoding the H-NS family proteins PP_1366 (TurA), PP_3765 (TurB), PP_0017 (TurC), PP_3693 (TurD), and PP_2947 (TurE). Quantitative reverse  transcription-PCR (qRT-PCR) demonstrated that the presence of pCAR1 does not affect the transcription of these five genes and that only pmr, turA, and turB were primarily transcribed in KT2440(pCAR1). In vitro pull-down assays revealed that Pmr strongly interacted with itself and with TurA, TurB, and TurE. Transcriptome comparisons of the pmr disruptant, KT2440, and KT2440(pCAR1) strains indicated that pmr disruption had greater effects on the host transcriptome than did pCAR1 carriage. The transcriptional levels of some genes that increased with pCAR1 carriage, such as the mexEF-oprN efflux pump genes and  parI, reverted with pmr disruption to levels in pCAR1-free KT2440. Transcriptional levels of putative horizontally acquired host genes were not altered by pCAR1 carriage but were altered by pmr disruption. Identification of genome-wide Pmr binding sites by ChAP-chip (chromatin affinity purification coupled with high-density tiling chip) analysis demonstrated that Pmr preferentially binds to horizontally acquired DNA regions. The Pmr binding sites  overlapped well with the location of the genes differentially transcribed following pmr disruption on both the plasmid and the chromosome. Our findings indicate that Pmr is a key factor in optimizing gene transcription on pCAR1 and the host chromosome. 
+ 20460455	 Deregulation of the Wnt/β-catenin signaling pathway is a hallmark of colon cancer. Mutations in the adenomatous polyposis coli (APC) gene occur in the vast  majority of colorectal cancers and are an initiating event in cellular transformation. Cells harboring mutant APC contain elevated levels of the β-catenin transcription coactivator in the nucleus which leads to abnormal expression of genes controlled by β-catenin/T-cell factor 4 (TCF4) complexes. Here, we use chromatin immunoprecipitation coupled with massively parallel sequencing (ChIP-Seq) to identify β-catenin binding regions in HCT116 human colon cancer cells. We localized 2168 β-catenin enriched regions using a concordance approach for integrating the output from multiple peak alignment algorithms. Motif discovery algorithms found a core TCF4 motif (T/A-T/A-C-A-A-A-G), an extended TCF4 motif (A/T/G-C/G-T/A-T/A-C-A-A-A-G) and an AP-1 motif (T-G-A-C/T-T-C-A) to be significantly represented in β-catenin enriched regions.  Furthermore, 417 regions contained both TCF4 and AP-1 motifs. Genes associated with TCF4 and AP-1 motifs bound β-catenin, TCF4 and c-Jun in vivo and were activated by Wnt signaling and serum growth factors. Our work provides evidence that Wnt/β-catenin and mitogen signaling pathways intersect directly to regulate  a defined set of target genes. 
+ 18974181	 EcoCyc (http://EcoCyc.org) provides a comprehensive encyclopedia of Escherichia coli biology. EcoCyc integrates information about the genome, genes and gene products; the metabolic network; and the regulatory network of E. coli. Recent EcoCyc developments include a new initiative to represent and curate all types of E. coli regulatory processes such as attenuation and regulation by small RNAs. EcoCyc has started to curate Gene Ontology (GO) terms for E. coli and has made a  dataset of E. coli GO terms available through the GO Web site. The curation and visualization of electron transfer processes has been significantly improved. Other software and Web site enhancements include the addition of tracks to the EcoCyc genome browser, in particular a type of track designed for the display of  ChIP-chip datasets, and the development of a comparative genome browser. A new Genome Omics Viewer enables users to paint omics datasets onto the full E. coli genome for analysis. A new advanced query page guides users in interactively constructing complex database queries against EcoCyc. A Macintosh version of EcoCyc is now available. A series of Webinars is available to instruct users in the use of EcoCyc. 
+ 18697768	 MOTIVATION: Locating transcription factor binding sites (motifs) is a key step in understanding gene regulation. Based on Tompa's benchmark study, the performance  of current de novo motif finders is far from satisfactory (with sensitivity ≤0.222 and precision ≤0.307). The same study also shows that no motif finder performs consistently well over all datasets. Hence, it is not clear which finder one should use for a given dataset. To address this issue, a class of algorithms called ensemble methods have been proposed. Though the existing ensemble methods overall perform better than stand-alone motif finders, the improvement gained is not substantial. Our study reveals that these methods do not fully exploit the information obtained from the results of individual finders, resulting in minor improvement in sensitivity and poor precision. RESULTS: In this article, we identify several key observations on how to utilize  the results from individual finders and design a novel ensemble method, MotifVoter, to predict the motifs and binding sites. Evaluations on 186 datasets  show that MotifVoter can locate more than 95% of the binding sites found by its component motif finders. In terms of sensitivity and precision, MotifVoter outperforms stand-alone motif finders and ensemble methods significantly on Tompa's benchmark, Escherichia coli, and ChIP-Chip datasets. MotifVoter is available online via a web server with several biologist-friendly features. 
+ 18460200	 BACKGROUND: Expression profiles obtained from multiple perturbation experiments are increasingly used to reconstruct transcriptional regulatory networks, from well studied, simple organisms up to higher eukaryotes. Admittedly, a key ingredient in developing a reconstruction method is its ability to integrate heterogeneous sources of information, as well as to comply with practical observability issues: measurements can be scarce or noisy. In this work, we show  how to combine a network of genetic regulations with a set of expression profiles, in order to infer the functional effect of the regulations, as inducer  or repressor. Our approach is based on a consistency rule between a network and the signs of variation given by expression arrays. RESULTS: We evaluate our approach in several settings of increasing complexity. First, we generate artificial expression data on a transcriptional network of E.  coli extracted from the literature (1529 nodes and 3802 edges), and we estimate that 30% of the regulations can be annotated with about 30 profiles. We additionally prove that at most 40.8% of the network can be inferred using our approach. Second, we use this network in order to validate the predictions obtained with a compendium of real expression profiles. We describe a filtering algorithm that generates particularly reliable predictions. Finally, we apply our inference approach to S. cerevisiae transcriptional network (2419 nodes and 4344  interactions), by combining ChIP-chip data and 15 expression profiles. We are able to detect and isolate inconsistencies between the expression profiles and a  significant portion of the model (15% of all the interactions). In addition, we report predictions for 14.5% of all interactions. CONCLUSION: Our approach does not require accurate expression levels nor times series. 
Nevertheless, we show on both data, real and artificial, that a relatively small number of perturbation experiments are enough to determine a significant portion of regulatory effects. This is a key practical asset compared to statistical methods for network reconstruction. We demonstrate that our approach is able to provide accurate predictions, even when the network is incomplete and the data is noisy. 
--- a/outRNAseq_binClass/useful.out 0 → 100644
View file @a5eecdb
+++ b/outRNAseq_binClass/useful.out 0 → 100644
View file @a5eecdb
+ 29484588	 Small regulatory RNAs (sRNAs) are ubiquitous regulatory molecules expressed in living cells. In prokaryotes, sRNAs usually bind to target mRNAs to either promote their degradation or interfere with translation initiation. Because a single sRNA can regulate a considerable number of target mRNAs, we seek to identify those targets rapidly and reliably. Here, we present a robust method based on the co-purification of target mRNAs bound to MS2-tagged sRNAs expressed  in vivo. After purification of the tagged-sRNA, we use RNAseq to determine the identity of all RNA interacting partners and their enrichment level. We describe  how to analyze the RNAseq data through the Galaxy Project Platform bioinformatics tools to identify new mRNA targets. This technique is applicable to most sRNAs of E. coli and Salmonella. 
+ 29433444	 BACKGROUND: Due to the DNA triplet code, it is possible that the sequences of two or more protein-coding genes overlap to a large degree. However, such non-trivial overlaps are usually excluded by genome annotation pipelines and, thus, only a few overlapping gene pairs have been described in bacteria. In contrast, transcriptome and translatome sequencing reveals many signals originated from the antisense strand of annotated genes, of which we analyzed an example gene pair in more detail. RESULTS: A small open reading frame of Escherichia coli O157:H7 strain Sakai (EHEC), designated laoB (L-arginine responsive overlapping gene), is embedded in  reading frame -2 in the antisense strand of ECs5115, encoding a CadC-like transcriptional regulator. This overlapping gene shows evidence of transcription  and translation in Luria-Bertani (LB) and brain-heart infusion (BHI) medium based on RNA sequencing (RNAseq) and ribosomal-footprint sequencing (RIBOseq). The transcriptional start site is 289 base pairs (bp) upstream of the start codon and transcription termination is 155 bp downstream of the stop codon. Overexpression  of LaoB fused to an enhanced green fluorescent protein (EGFP) reporter was possible. The sequence upstream of the transcriptional start site displayed strong promoter activity under different conditions, whereas promoter activity was significantly decreased in the presence of L-arginine. A strand-specific translationally arrested mutant of laoB provided a significant growth advantage in competitive growth experiments in the presence of L-arginine compared to the wild type, which returned to wild type level after complementation of laoB in trans. A phylostratigraphic analysis indicated that the novel gene is restricted  to the Escherichia/Shigella clade and might have originated recently by overprinting leading to the expression of part of the antisense strand of ECs5115. 
CONCLUSIONS: Here, we present evidence of a novel small protein-coding gene laoB  encoded in the antisense frame -2 of the annotated gene ECs5115. Clearly, laoB is evolutionarily young and it originated in the Escherichia/Shigella clade by overprinting, a process which may cause the de novo evolution of bacterial genes  like laoB. 
+ 28902868	 In the past, short protein-coding genes were often disregarded by genome annotation pipelines. Transcriptome sequencing (RNAseq) signals outside of annotated genes have usually been interpreted to indicate either ncRNA or pervasive transcription. Therefore, in addition to the transcriptome, the translatome (RIBOseq) of the enteric pathogen Escherichia coli O157:H7 strain Sakai was determined at two optimal growth conditions and a severe stress condition combining low temperature and high osmotic pressure. All intergenic open reading frames potentially encoding a protein of ≥ 30 amino acids were investigated with regard to coverage by transcription and translation signals and their translatability expressed by the ribosomal coverage value. This led to discovery of 465 unique, putative novel genes not yet annotated in this E. coli strain, which are evenly distributed over both DNA strands of the genome. For 255 of the novel genes, annotated homologs in other bacteria were found, and a machine-learning algorithm, trained on small protein-coding E. coli genes, predicted that 89% of these translated open reading frames represent bona fide genes. The remaining 210 putative novel genes without annotated homologs were compared to the 255 novel genes with homologs and to 250 short annotated genes of this E. coli strain. All three groups turned out to be similar with respect to their translatability distribution, fractions of differentially regulated genes,  secondary structure composition, and the distribution of evolutionary constraint, suggesting that both novel groups represent legitimate genes. However, the machine-learning algorithm only recognized a small fraction of the 210 genes without annotated homologs. It is possible that these genes represent a novel group of genes, which have unusual features dissimilar to the genes of the machine-learning algorithm training set. 
+ 28245801	 BACKGROUND: While NGS allows rapid global detection of transcripts, it remains difficult to distinguish ncRNAs from short mRNAs. To detect potentially translated RNAs, we developed an improved protocol for bacterial ribosomal footprinting (RIBOseq). This allowed distinguishing ncRNA from mRNA in EHEC. A high ratio of ribosomal footprints per transcript (ribosomal coverage value, RCV) is expected to indicate a translated RNA, while a low RCV should point to a non-translated RNA. RESULTS: Based on their low RCV, 150 novel non-translated EHEC transcripts were identified as putative ncRNAs, representing both antisense and intergenic transcripts, 74 of which had expressed homologs in E. coli MG1655. Bioinformatics analysis predicted statistically significant target regulons for 15 of the intergenic transcripts; experimental analysis revealed 4-fold or higher differential expression of 46 novel ncRNA in different growth media. Out of 329 annotated EHEC ncRNAs, 52 showed an RCV similar to protein-coding genes, of those, 16 had RIBOseq patterns matching annotated genes in other enterobacteriaceae, and 11 seem to possess a Shine-Dalgarno sequence, suggesting  that such ncRNAs may encode small proteins instead of being solely non-coding. To support that the RIBOseq signals are reflecting translation, we tested the ribosomal-footprint covered ORF of ryhB and found a phenotype for the encoded peptide in iron-limiting condition. CONCLUSION: Determination of the RCV is a useful approach for a rapid first-step  differentiation between bacterial ncRNAs and small mRNAs. Further, many known ncRNAs may encode proteins as well. 
+ 28240544	 Facile and simple method is developed to synthesize silver-nanoparticle-decorated quercetin nanoparticles (QA NPs). Modification suggests that synergistic quercetin (Qe) improves the antibacterial effect of silver nanoparticles (Ag NPs). Characterization experiment indicates that QA NPs have a diameter of approximately 10 nm. QA NPs show highly effective antibacterial activities against drug-resistant Escherichia coli (E. coli) and Staphylococcus aureus (S. aureus). We explore antibacterial mechanisms using S. aureus and E. coli treated  with QA NPs. Through morphological changes in E. coli and S. aureus, mechanisms are examined for bacterial damage caused by particulate matter from local dissociation of silver ion and Qe from QA NPs trapped inside membranes. Moreover, we note that gene expression profiling methods, such as RNA sequencing, can be used to predict discover mechanisms of toxicity of QA NPs. Gene ontology (GO) assay analyses demonstrate the molecular mechanism of the antibacterial effect of QA NPs. Regarding cellular component ontology, "cell wall organization or biogenesis" (GO: 0071554) and "cell wall macromolecule metabolic process" (GO: 0044036) are the most represented categories. The present study reports that transcriptome analysis of the mechanism offers novel insights into the molecular  mechanism of antibacterial assays. 
+ 28174601	 BACKGROUND: Lignin is a potential biorefinery feedstock for the production of value-added chemicals including vanillin. A huge amount of lignin is produced as  a by-product of the paper industry, while cellulosic components of plant biomass  are utilized for the production of paper pulp. In spite of vast potential, lignin remains the least exploited component of plant biomass due to its extremely complex and heterogenous structure. Several enzymes have been reported to have lignin-degrading properties and could be potentially used in lignin biorefining if their catalytic properties could be improved by enzyme engineering. The much needed improvement of lignin-degrading enzymes by high-throughput selection techniques such as directed evolution is currently limited, as robust methods for detecting the conversion of lignin to desired small molecules are not available. RESULTS: We identified a vanillin-inducible promoter by RNAseq analysis of Escherichia coli cells treated with a sublethal dose of vanillin and developed a  genetically programmed vanillin-sensing cell by placing the 'very green fluorescent protein' gene under the control of this promoter. Fluorescence of the biosensing cell is enhanced significantly when grown in the presence of vanillin  and is readily visualized by fluorescence microscopy. The use of fluorescence-activated cell sorting analysis further enhances the sensitivity, enabling dose-dependent detection of as low as 200 µM vanillin. The biosensor is  highly specific to vanillin and no major response is elicited by the presence of  lignin, lignin model compound, DMSO, vanillin analogues or non-specific toxic chemicals. CONCLUSIONS: We developed an engineered E. coli cell that can detect vanillin at  a concentration as low as 200 µM. The vanillin-sensing cell did not show cross-reactivity towards lignin or major lignin degradation products including vanillin analogues. This engineered E. 
coli cell could potentially be used as a host cell for screening lignin-degrading enzymes that can convert lignin to vanillin. 
+ 27876680	 Recent advances in high-throughput sequencing have led to an explosion in the rate of small regulatory RNAs (sRNAs) discovery among bacteria. However, only a handful of them are functionally characterized. Most of the time, little to no targets are known. In Lalaouna et al. (2015), we proposed a new technology to uncover sRNAs targetome, which is based on the MS2-affinity purification (MAPS).  We were able to prove its efficiency by applying it on well-characterized sRNAs of Escherichia coli. Thereafter, we adapted the procedure to other kind of RNA (mRNAs and tRNA-derived RNA fragments) and bacteria (pathogenic or Gram-positive  strains). Here, we clearly report all improvements and adjustments made to MAPS technology since it was originally reported. 
+ 27856567	 The enteric pathogen Escherichia coli O157:H7 Sakai (EHEC) is able to grow at lower temperatures compared to commensal E. coli. Growth at environmental conditions displays complex challenges different to those in a host. EHEC was grown at 37°C and at 14°C with 4% NaCl, a combination of cold and osmotic stress  as present in the food chain. Comparison of RNAseq and RIBOseq data provided a snap shot of ongoing transcription and translation, differentiating transcriptional and post-transcriptional gene regulation, respectively. Indeed, cold and osmotic stress related genes are simultaneously regulated at both levels, but translational regulation clearly dominates. Special emphasis was given to genes regulated by RNA secondary structures in their 5'UTRs, such as RNA thermometers and riboswitches, or genes controlled by small RNAs encoded in trans. The results reveal large differences in gene expression between short-time shock  compared to adaptation in combined cold and osmotic stress. Whereas the majority  of cold shock proteins, such as CspA, are translationally downregulated after adaptation, many osmotic stress genes are still significantly upregulated mainly  translationally, but several also transcriptionally. 
+ 26911138	 BACKGROUND: Genomes of E. coli, including that of the human pathogen Escherichia  coli O157:H7 (EHEC) EDL933, still harbor undetected protein-coding genes which, apparently, have escaped annotation due to their small size and non-essential function. To find such genes, global gene expression of EHEC EDL933 was examined, using strand-specific RNAseq (transcriptome), ribosomal footprinting (translatome) and mass spectrometry (proteome). RESULTS: Using the above methods, 72 short, non-annotated protein-coding genes were detected. All of these showed signals in the ribosomal footprinting assay indicating mRNA translation. Seven were verified by mass spectrometry. Fifty-seven genes are annotated in other enterobacteriaceae, mainly as hypothetical genes; the remaining 15 genes constitute novel discoveries. In addition, protein structure and function were predicted computationally and compared between EHEC-encoded proteins and 100-times randomly shuffled proteins.  Based on this comparison, 61 of the 72 novel proteins exhibit predicted structural and functional features similar to those of annotated proteins. Many of the novel genes show differential transcription when grown under eleven diverse growth conditions suggesting environmental regulation. Three genes were found to confer a phenotype in previous studies, e.g., decreased cattle colonization. CONCLUSIONS: These findings demonstrate that ribosomal footprinting can be used to detect novel protein coding genes, contributing to the growing body of evidence that hypothetical genes are not annotation artifacts and opening an additional way to study their functionality. All 72 genes are taxonomically restricted and, therefore, appear to have evolved relatively recently de novo. 
+ 26818886	 Volatile organic compounds (VOCs) are commonly used as solvents in various industrial settings. Many of them present a challenge to receiving environments,  due to their toxicity and low bioavailability for degradation. Microorganisms are capable of sensing and responding to their surroundings and this makes them ideal detectors for toxic compounds. This study investigates the global transcriptomic  responses of Escherichia coli K-12 to selected VOCs at sub-toxic levels. Cells grown in the presence of VOCs were harvested during exponential growth, followed  by whole transcriptome shotgun sequencing (RNAseq). The analysis of the data revealed both shared and unique genetic responses compared to cells without exposure to VOCs. Results suggest that various functional gene categories, for example, those relating to Fe/S cluster biogenesis, oxidative stress responses and transport proteins, are responsive to selected VOCs in E. coli. The differential expression (DE) of genes was validated using GFP-promoter fusion assays. A variety of genes were differentially expressed even at non-inhibitory concentrations and when the cells are at their balanced-growth. Some of these genes belong to generic stress response and others could be specific to VOCs. Such candidate genes and their regulatory elements could be used as the basis for designing biosensors for selected VOCs. 
+ 26307168	 Repeated extragenic palindromes (REPs) in the enterobacterial genomes are usually composed of individual palindromic units separated by linker sequences. A total of 355 annotated REPs are distributed along the Escherichia coli genome. RNA sequence (RNAseq) analysis showed that almost 80% of the REPs in E. coli are transcribed. The DNA sequence of REP325 showed that it is a cluster of six repeats, each with two palindromic units capable of forming cruciform structures  in supercoiled DNA. Here, we report that components of the REP325 element and at  least one of its RNA products play a role in bacterial nucleoid DNA condensation. These RNA not only are present in the purified nucleoid but bind to the bacterial nucleoid-associated HU protein as revealed by RNA IP followed by microarray analysis (RIP-Chip) assays. Deletion of REP325 resulted in a dramatic increase of the nucleoid size as observed using transmission electron microscopy (TEM), and expression of one of the REP325 RNAs, nucleoid-associated noncoding RNA 4 (naRNA4), from a plasmid restored the wild-type condensed structure. Independently, chromosome conformation capture (3C) analysis demonstrated physical connections among various REP elements around the chromosome. These connections are dependent in some way upon the presence of HU and the REP325 element; deletion of HU genes and/or the REP325 element removed the connections.  Finally, naRNA4 together with HU condensed DNA in vitro by connecting REP325 or other DNA sequences that contain cruciform structures in a pairwise manner as observed by atomic force microscopy (AFM). On the basis of our results, we propose molecular models to explain connections of remote cruciform structures mediated by HU and naRNA4. IMPORTANCE: Nucleoid organization in bacteria is being  studied extensively, and several models have been proposed. However, the molecular nature of the structural organization is not well understood. Here we characterized the role of a novel nucleoid-associated noncoding RNA, naRNA4, in nucleoid structures both in vivo and in vitro. We propose models to explain how naRNA4 together with nucleoid-associated protein HU connects remote DNA elements  for nucleoid condensation. We present the first evidence of a noncoding RNA together with a nucleoid-associated protein directly condensing nucleoid DNA. 
+ 26125937	 Adherent-invasive Escherichia coli (AIEC) strains are detected more frequently within mucosal lesions of patients with Crohn's disease (CD). The AIEC phenotype  consists of adherence and invasion of intestinal epithelial cells and survival within macrophages of these bacteria in vitro. Our aim was to identify candidate  transcripts that distinguish AIEC from non-invasive E. coli (NIEC) strains and might be useful for rapid and accurate identification of AIEC by culture-independent technology. We performed comparative RNA-Sequence (RNASeq) analysis using AIEC strain LF82 and NIEC strain HS during exponential and stationary growth. Differential expression analysis of coding sequences (CDS) homologous to both strains demonstrated 224 and 241 genes with increased and decreased expression, respectively, in LF82 relative to HS. Transition metal transport and siderophore metabolism related pathway genes were up-regulated, while glycogen metabolic and oxidation-reduction related pathway genes were down-regulated, in LF82. Chemotaxis related transcripts were up-regulated in LF82 during the exponential phase, but flagellum-dependent motility pathway genes were down-regulated in LF82 during the stationary phase. CDS that mapped only to the LF82 genome accounted for 747 genes. We applied an in silico subtractive genomics approach to identify CDS specific to AIEC by incorporating the genomes of 10 other previously phenotyped NIEC. From this analysis, 166 CDS mapped to the LF82  genome and lacked homology to any of the 11 human NIEC strains. We compared these CDS across 13 AIEC, but none were homologous in each. Four LF82 gene loci belonging to clustered regularly interspaced short palindromic repeats region (CRISPR)--CRISPR-associated (Cas) genes were identified in 4 to 6 AIEC and absent from all non-pathogenic bacteria. As previously reported, AIEC strains were enriched for pdu operon genes. One CDS, encoding an excisionase, was shared by 9  AIEC strains. Reverse transcription quantitative polymerase chain reaction assays for 6 genes were conducted on fecal and ileal RNA samples from 22 inflammatory bowel disease (IBD), and 32 patients without IBD (non-IBD). The expression of Cas loci was detected in a higher proportion of CD than non-IBD fecal and ileal RNA samples (p <0.05). These results support a comparative genomic/transcriptomic approach towards identifying candidate AIEC signature transcripts. 
+ 25177315	 Efficient microbial conversion of lignocellulosic hydrolysates to biofuels is a key barrier to the economically viable deployment of lignocellulosic biofuels. A  chief contributor to this barrier is the impact on microbial processes and energy metabolism of lignocellulose-derived inhibitors, including phenolic carboxylates, phenolic amides (for ammonia-pretreated biomass), phenolic aldehydes, and furfurals. To understand the bacterial pathways induced by inhibitors present in  ammonia-pretreated biomass hydrolysates, which are less well studied than acid-pretreated biomass hydrolysates, we developed and exploited synthetic mimics of ammonia-pretreated corn stover hydrolysate (ACSH). To determine regulatory responses to the inhibitors normally present in ACSH, we measured transcript and  protein levels in an Escherichia coli ethanologen using RNA-seq and quantitative  proteomics during fermentation to ethanol of synthetic hydrolysates containing or lacking the inhibitors. Our study identified four major regulators mediating these responses, the MarA/SoxS/Rob network, AaeR, FrmR, and YqhC. Induction of these regulons was correlated with a reduced rate of ethanol production, buildup  of pyruvate, depletion of ATP and NAD(P)H, and an inhibition of xylose conversion. The aromatic aldehyde inhibitor 5-hydroxymethylfurfural appeared to be reduced to its alcohol form by the ethanologen during fermentation, whereas phenolic acid and amide inhibitors were not metabolized. Together, our findings establish that the major regulatory responses to lignocellulose-derived inhibitors are mediated by transcriptional rather than translational regulators,  suggest that energy consumed for inhibitor efflux and detoxification may limit biofuel production, and identify a network of regulators for future synthetic biology efforts. 
+ 24927582	 The molecular mechanisms of ethanol toxicity and tolerance in bacteria, although  important for biotechnology and bioenergy applications, remain incompletely understood. Genetic studies have identified potential cellular targets for ethanol and have revealed multiple mechanisms of tolerance, but it remains difficult to separate the direct and indirect effects of ethanol. We used adaptive evolution to generate spontaneous ethanol-tolerant strains of Escherichia coli, and then characterized mechanisms of toxicity and resistance using genome-scale DNAseq, RNAseq, and ribosome profiling coupled with specific assays of ribosome and RNA polymerase function. Evolved alleles of metJ, rho, and rpsQ recapitulated most of the observed ethanol tolerance, implicating translation and transcription as key processes affected by ethanol. Ethanol induced miscoding errors during protein synthesis, from which the evolved rpsQ allele protected cells by increasing ribosome accuracy. Ribosome profiling and RNAseq analyses established that ethanol negatively affects transcriptional and translational processivity. Ethanol-stressed cells exhibited ribosomal stalling at internal AUG codons, which may be ameliorated by the adaptive inactivation of  the MetJ repressor of methionine biosynthesis genes. Ethanol also caused aberrant intragenic transcription termination for mRNAs with low ribosome density, which was reduced in a strain with the adaptive rho mutation. Furthermore, ethanol inhibited transcript elongation by RNA polymerase in vitro. We propose that ethanol-induced inhibition and uncoupling of mRNA and protein synthesis through direct effects on ribosomes and RNA polymerase conformations are major contributors to ethanol toxicity in E. coli, and that adaptive mutations in metJ, rho, and rpsQ help protect these central dogma processes in the presence of ethanol. 
+ 23203983	 The 20th annual Database Issue of Nucleic Acids Research includes 176 articles, half of which describe new online molecular biology databases and the other half  provide updates on the databases previously featured in NAR and other journals. This year's highlights include two databases of DNA repeat elements; several databases of transcriptional factors and transcriptional factor-binding sites; databases on various aspects of protein structure and protein-protein interactions; databases for metagenomic and rRNA sequence analysis; and four databases specifically dedicated to Escherichia coli. The increased emphasis on using the genome data to improve human health is reflected in the development of  the databases of genomic structural variation (NCBI's dbVar and EBI's DGVa), the  NIH Genetic Testing Registry and several other databases centered on the genetic  basis of human disease, potential drugs, their targets and the mechanisms of protein-ligand binding. Two new databases present genomic and RNAseq data for monkeys, providing wealth of data on our closest relatives for comparative genomics purposes. The NAR online Molecular Biology Database Collection, available at http://www.oxfordjournals.org/nar/database/a/, has been updated and  currently lists 1512 online databases. The full content of the Database Issue is  freely available online on the Nucleic Acids Research website (http://nar.oxfordjournals.org/). 
+ 22821568	 RNAsnap™ is a simple and novel method that recovers all intracellular RNA quantitatively (>99%), faster (<15 min) and less expensively (∼3 cents/sample) than any of the currently available RNA isolation methods. In fact, none of the bacterial RNA isolation methods, including the commercial kits, are effective in  recovering all species of intracellular RNAs (76-5700 nt) with equal efficiency,  which can lead to biased results in genome-wide studies involving microarray or RNAseq analysis. The RNAsnap™ procedure yields ∼60 µg of RNA from 10(8) Escherichia coli cells that can be used directly for northern analysis without any further purification. Based on a comparative analysis of specific transcripts ranging in size from 76 to 5700 nt, the RNAsnap™ method provided the most accurate measure of the relative amounts of the various intracellular RNAs. Furthermore, the RNAsnap™ RNA was successfully used in enzymatic reactions such as RNA ligation, reverse transcription, primer extension and reverse transcriptase-polymerase chain reaction, following sodium acetate/ethanol precipitation. The RNAsnap™ method can be used to isolate RNA from a wide range of Gram-negative and Gram-positive bacteria as well as yeast. 
+ 22689638	 Translational efficiency is controlled by tRNAs and other genome-encoded mechanisms. In organelles, translational processes are dramatically altered because of genome shrinkage and horizontal acquisition of gene products. The influence of genome reduction on translation in endosymbionts is largely unknown. Here, we investigate whether divergent lineages of Buchnera aphidicola, the reduced-genome bacterial endosymbiont of aphids, possess altered translational features compared with their free-living relative, Escherichia coli. Our RNAseq data support the hypothesis that translation is less optimal in Buchnera than in  E. coli. We observed a specific, convergent, pattern of tRNA loss in Buchnera and other endosymbionts that have undergone genome shrinkage. Furthermore, many modified nucleoside pathways that are important for E. coli translation are lost  in Buchnera. Additionally, Buchnera's A + T compositional bias has resulted in reduced tRNA thermostability, and may have altered aminoacyl-tRNA synthetase recognition sites. Buchnera tRNA genes are shorter than those of E. coli, as the  majority no longer has a genome-encoded 3' CCA; however, all the expressed, shortened tRNAs undergo 3' CCA maturation. Moreover, expression of tRNA isoacceptors was not correlated with the usage of corresponding codons. Overall,  our data suggest that endosymbiont genome evolution alters tRNA characteristics that are known to influence translational efficiency in their free-living relative. 
--- a/outRNAseq_binClass/useless.out 0 → 100644
View file @a5eecdb
+++ b/outRNAseq_binClass/useless.out 0 → 100644
View file @a5eecdb
+ 28791299	 Increasing evidence that microRNAs (miRNAs) play important roles in the immune response against infectious agents suggests that miRNA might be exploitable as signatures of exposure to specific infectious agents. In order to identify potential early miRNA biomarkers of bacterial infections, human peripheral blood  mononuclear cells (hPBMCs) were exposed to two select agents, Burkholderia pseudomallei K96243 and Francisella tularensis SHU S4, as well as to the nonpathogenic control Escherichia coli DH5α. RNA samples were harvested at three  early time points, 30, 60, and 120 minutes postexposure, then sequenced. RNAseq analyses identified 87 miRNAs to be differentially expressed (DE) in a linear fashion. Of these, 31 miRNAs were tested using the miScript miRNA qPCR assay. Through RNAseq identification and qPCR validation, we identified differentially expressed miRNA species that may be involved in the early response to bacterial infections. Based upon its upregulation at early time points postexposure in two  different individuals, hsa-mir-30c-5p is a miRNA species that could be studied further as a potential biomarker for exposure to these gram-negative intracellular pathogens. Gene ontology functional analyses demonstrated that programmed cell death is the first ranking biological process associated with miRNAs that are upregulated in F. tularensis-exposed hPBMCs. 
+ 28614372	 Infection with Shiga toxin (Stx) producing Escherichia coli O157:H7 can cause the potentially fatal complication hemolytic uremic syndrome, and currently only supportive therapy is available. Lack of suitable animal models has hindered study of this disease. Induced human intestinal organoids (iHIOs), generated by in vitro differentiation of pluripotent stem cells, represent differentiated human intestinal tissue. We show that iHIOs with addition of human neutrophils can model E. coli intestinal infection and innate cellular responses. Commensal and O157:H7 introduced into the iHIO lumen replicated rapidly achieving high numbers. Commensal E. coli did not cause damage, and were completely contained within the lumen, suggesting defenses, such as mucus production, can constrain non-pathogenic strains. Some O157:H7 initially co-localized with cellular actin.  Loss of actin and epithelial integrity was observed after 4 hours. O157:H7 grew as filaments, consistent with activation of the bacterial SOS stress response. SOS is induced by reactive oxygen species (ROS), and O157:H7 infection increased  ROS production. Transcriptional profiling (RNAseq) demonstrated that both commensal and O157:H7 upregulated genes associated with gastrointestinal maturation, while infection with O157:H7 upregulated inflammatory responses, including interleukin 8 (IL-8). IL-8 is associated with neutrophil recruitment, and infection with O157:H7 resulted in recruitment of human neutrophils into the  iHIO tissue. 
+ 28270101	 BACKGROUND: Avian pathogenic E. coli (APEC) can lead to a loss in millions of dollars in poultry annually because of mortality and produce contamination. Studies have verified that many immune-related genes undergo changes in alternative splicing (AS), along with nonsense mediated decay (NMD), to regulate  the immune system under different conditions. Therefore, the splicing profiles of primary lymphoid tissues with systemic APEC infection need to be comprehensively  examined. RESULTS: Gene expression in RNAseq data were obtained for three different immune  tissues (bone marrow, thymus, and bursa) from three phenotype birds (non-challenged, resistant, and susceptible birds) at two time points. Alternative 5' splice sites and exon skipping/inclusion were identified as the major alternative splicing events in avian primary immune organs under systemic APEC infection. In this study, we detected hundreds of differentially-expressed-transcript-containing genes (DETs) between different phenotype birds at 5 days post-infection (dpi). DETs, PSAP and STT3A, with NMD have important functions under systemic APEC infection. DETs, CDC45, CDK1, RAG2,  POLR1B, PSAP, and DNASE1L3, from the same transcription start sites (TSS) indicate that cell death, cell cycle, cellular function, and maintenance were predominant in host under systemic APEC. CONCLUSIONS: With the use of RNAseq technology and bioinformatics tools, this study provides a portrait of the AS event and NMD in primary lymphoid tissues, which play critical roles in host homeostasis under systemic APEC infection. According to this study, AS plays a pivotal regulatory role in the immune response in chicken under systemic APEC infection via either NMD or alternative TSSs. This study elucidates the regulatory role of AS for the immune complex under systemic APEC infection. 
+ 28060822	 Mosquitoes host communities of microbes in their digestive tract that consist primarily of bacteria. We previously reported that Aedes aegypti larvae colonized by a native community of bacteria and gnotobiotic larvae colonized by only Escherichia coli develop very similarly into adults, whereas axenic larvae never  molt and die as first instars. In this study, we extended these findings by first comparing the growth and abundance of bacteria in conventional, gnotobiotic, and  axenic larvae during the first instar. Results showed that conventional and gnotobiotic larvae exhibited no differences in growth, timing of molting, or number of bacteria in their digestive tract. Axenic larvae in contrast grew minimally and never achieved the critical size associated with molting by conventional and gnotobiotic larvae. In the second part of the study we compared  patterns of gene expression in conventional, gnotobiotic and axenic larvae by conducting an RNAseq analysis of gut and nongut tissues (carcass) at 22 h post-hatching. Approximately 12% of Ae. aegypti transcripts were differentially expressed in axenic versus conventional or gnotobiotic larvae. However, this profile consisted primarily of transcripts in seven categories that included the  down-regulation of select peptidases in the gut and up-regulation of several genes in the gut and carcass with roles in amino acid transport, hormonal signaling, and metabolism. Overall, our results indicate that axenic larvae exhibit alterations in gene expression consistent with defects in acquisition and assimilation of nutrients required for growth. 
+ 27466434	 Avian pathogenic Escherichia coli (APEC) can cause significant morbidity in chickens. The thymus provides the essential environment for T cell development; however, the thymus transcriptome has not been examined for gene expression in response to APEC infection. An improved understanding of the host genomic response to APEC infection could inform future breeding programs for disease resistance and APEC control. We therefore analyzed the transcriptome of the thymus of birds challenged with APEC, contrasting susceptible and resistant phenotypes. Thousands of genes were differentially expressed in birds of the 5-day post infection (dpi) challenged-susceptible group vs. 5 dpi non-challenged, in 5 dpi challenged-susceptible vs. 5 dpi challenged-resistant birds, as well as  in 5 dpi vs. one dpi challenged-susceptible birds. The Toll-like receptor signaling pathway was the major innate immune response for birds to respond to APEC infection. Moreover, lysosome and cell adhesion molecules pathways were common mechanisms for chicken response to APEC infection. The T-cell receptor signaling pathway, cell cycle, and p53 signaling pathways were significantly activated in resistant birds to resist APEC infection. These results provide a comprehensive assessment of global gene networks and biological functionalities of differentially expressed genes in the thymus under APEC infection. These findings provide novel insights into key molecular genetic mechanisms that differentiate host resistance from susceptibility in this primary lymphoid tissue, the thymus. 
+ 27424527	 Thermobifida fusca is a thermophilic actinobacterium. T. fusca muC obtained by adaptive evolution preferred yeast extract to ammonium sulfate for accumulating malic acid and ammonium sulfate for cell growth. We did transcriptome analysis of T. fusca muC on Avicel and cellobiose with addition of ammonium sulfate or yeast  extract, respectively by RNAseq. The transcriptional results indicate that ammonium sulfate induced the transcriptions of the genes related to carbohydrate  metabolisms significantly more than yeast extract. Importantly, Tfu_2487, encoding histidine-containing protein (HPr), didn't transcribe on yeast extract at all, while it transcribed highly on ammonium sulfate. In order to understand the impact of HPr on malate production and cell growth of the muC strain, we deleted Tfu_2487 to get a mutant strain: muCΔ2487, which had 1.33 mole/mole-glucose equivalent malate yield, much higher than that on yeast extract. We then developed an E. coli-T. fusca shuttle plasmid for over-expressing HPr in muCΔ2487, a strain without HPr background, forming the muCΔ2487S strain. The muCΔ2487S strain had a much lower malate yield but faster cell growth than the muC strain. The results of both mutant strains confirmed that HPr was the key regulatory protein for T. fusca's metabolisms on nitrogen sources. 
+ 27336699	 Our objective was to identify the biological response and the cross-talk between  liver and mammary tissue after intramammary infection (IMI) with Escherichia coli (E. coli) using RNAseq technology. Sixteen cows were inoculated with live E. coli into one mammary quarter at ~4-6 weeks in lactation. For all cows, biopsies were  performed at -144, 12 and 24 h relative to IMI in liver and at 24 h post-IMI in infected and non-infected (control) mammary quarters. For a subset of cows (n = 6), RNA was extracted from both liver and mammary tissue and sequenced using a 100 bp paired-end approach. Ingenuity Pathway Analysis and the Dynamic Impact Approach analysis of differentially expressed genes (overall effect False Discovery Rate≤0.05) indicated that IMI induced an overall activation of inflammation at 12 h post-IMI and a strong inhibition of metabolism, especially related to lipid, glucose, and xenobiotics at 24 h post-IMI in liver. The data indicated in mammary tissue an overall induction of inflammatory response with little effect on metabolism at 24 h post-IMI. We identified a large number of up-stream regulators potentially involved in the response to IMI in both tissues  but a relatively small core network of transcription factors controlling the response to IMI for liver whereas a large network in mammary tissue. Transcriptomic results in liver and mammary tissue were supported by changes in inflammatory and metabolic mediators in blood and milk. The analysis of potential cross-talk between the two tissues during IMI uncovered a large communication from the mammary tissue to the liver to coordinate the inflammatory response but  a relatively small communication from the liver to the mammary tissue. Our results indicate a strong induction of the inflammatory response in mammary tissue and impairment of liver metabolism 24h post-IMI partly driven by the signaling from infected mammary tissue. 
+ 25085508	 BACKGROUND: Burkholderia pseudomallei is a facultative intracellular pathogen and the causative agent of melioidosis. A conserved type III secretion system (T3SS3) and type VI secretion system (T6SS1) are critical for intracellular survival and  growth. The T3SS3 and T6SS1 genes are coordinately and hierarchically regulated by a TetR-type regulator, BspR. A central transcriptional regulator of the BspR regulatory cascade, BsaN, activates a subset of T3SS3 and T6SS1 loci. RESULTS: To elucidate the scope of the BsaN regulon, we used RNAseq analysis to compare the transcriptomes of wild-type B. pseudomallei KHW and a bsaN deletion mutant. The 60 genes positively-regulated by BsaN include those that we had previously identified in addition to a polyketide biosynthesis locus and genes involved in amino acid biosynthesis. BsaN was also found to repress the transcription of 51 genes including flagellar motility loci and those encoding components of the T3SS3 apparatus. Using a promoter-lacZ fusion assay in E. coli, we show that BsaN together with the chaperone BicA directly control the expression of the T3SS3 translocon, effector and associated regulatory genes that are organized into at least five operons (BPSS1516-BPSS1552). Using a mutagenesis approach, a consensus regulatory motif in the promoter regions of BsaN-regulated  genes was shown to be essential for transcriptional activation. CONCLUSIONS: BsaN/BicA functions as a central regulator of key virulence clusters in B. pseudomallei within a more extensive network of genetic regulation. We propose that BsaN/BicA controls a gene expression program that facilitates the adaption and intracellular survival of the pathogen within eukaryotic hosts. 
--- a/outRNAseq_oneClass/useful.out 0 → 100644
View file @a5eecdb
+++ b/outRNAseq_oneClass/useful.out 0 → 100644
View file @a5eecdb
+ 29484588	 Small regulatory RNAs (sRNAs) are ubiquitous regulatory molecules expressed in living cells. In prokaryotes, sRNAs usually bind to target mRNAs to either promote their degradation or interfere with translation initiation. Because a single sRNA can regulate a considerable number of target mRNAs, we seek to identify those targets rapidly and reliably. Here, we present a robust method based on the co-purification of target mRNAs bound to MS2-tagged sRNAs expressed  in vivo. After purification of the tagged-sRNA, we use RNAseq to determine the identity of all RNA interacting partners and their enrichment level. We describe  how to analyze the RNAseq data through the Galaxy Project Platform bioinformatics tools to identify new mRNA targets. This technique is applicable to most sRNAs of E. coli and Salmonella. 
+ 28791299	 Increasing evidence that microRNAs (miRNAs) play important roles in the immune response against infectious agents suggests that miRNA might be exploitable as signatures of exposure to specific infectious agents. In order to identify potential early miRNA biomarkers of bacterial infections, human peripheral blood  mononuclear cells (hPBMCs) were exposed to two select agents, Burkholderia pseudomallei K96243 and Francisella tularensis SHU S4, as well as to the nonpathogenic control Escherichia coli DH5α. RNA samples were harvested at three  early time points, 30, 60, and 120 minutes postexposure, then sequenced. RNAseq analyses identified 87 miRNAs to be differentially expressed (DE) in a linear fashion. Of these, 31 miRNAs were tested using the miScript miRNA qPCR assay. Through RNAseq identification and qPCR validation, we identified differentially expressed miRNA species that may be involved in the early response to bacterial infections. Based upon its upregulation at early time points postexposure in two  different individuals, hsa-mir-30c-5p is a miRNA species that could be studied further as a potential biomarker for exposure to these gram-negative intracellular pathogens. Gene ontology functional analyses demonstrated that programmed cell death is the first ranking biological process associated with miRNAs that are upregulated in F. tularensis-exposed hPBMCs. 
+ 28614372	 Infection with Shiga toxin (Stx) producing Escherichia coli O157:H7 can cause the potentially fatal complication hemolytic uremic syndrome, and currently only supportive therapy is available. Lack of suitable animal models has hindered study of this disease. Induced human intestinal organoids (iHIOs), generated by in vitro differentiation of pluripotent stem cells, represent differentiated human intestinal tissue. We show that iHIOs with addition of human neutrophils can model E. coli intestinal infection and innate cellular responses. Commensal and O157:H7 introduced into the iHIO lumen replicated rapidly achieving high numbers. Commensal E. coli did not cause damage, and were completely contained within the lumen, suggesting defenses, such as mucus production, can constrain non-pathogenic strains. Some O157:H7 initially co-localized with cellular actin.  Loss of actin and epithelial integrity was observed after 4 hours. O157:H7 grew as filaments, consistent with activation of the bacterial SOS stress response. SOS is induced by reactive oxygen species (ROS), and O157:H7 infection increased  ROS production. Transcriptional profiling (RNAseq) demonstrated that both commensal and O157:H7 upregulated genes associated with gastrointestinal maturation, while infection with O157:H7 upregulated inflammatory responses, including interleukin 8 (IL-8). IL-8 is associated with neutrophil recruitment, and infection with O157:H7 resulted in recruitment of human neutrophils into the  iHIO tissue. 
+ 28270101	 BACKGROUND: Avian pathogenic E. coli (APEC) can lead to a loss in millions of dollars in poultry annually because of mortality and produce contamination. Studies have verified that many immune-related genes undergo changes in alternative splicing (AS), along with nonsense mediated decay (NMD), to regulate  the immune system under different conditions. Therefore, the splicing profiles of primary lymphoid tissues with systemic APEC infection need to be comprehensively  examined. RESULTS: Gene expression in RNAseq data were obtained for three different immune  tissues (bone marrow, thymus, and bursa) from three phenotype birds (non-challenged, resistant, and susceptible birds) at two time points. Alternative 5' splice sites and exon skipping/inclusion were identified as the major alternative splicing events in avian primary immune organs under systemic APEC infection. In this study, we detected hundreds of differentially-expressed-transcript-containing genes (DETs) between different phenotype birds at 5 days post-infection (dpi). DETs, PSAP and STT3A, with NMD have important functions under systemic APEC infection. DETs, CDC45, CDK1, RAG2,  POLR1B, PSAP, and DNASE1L3, from the same transcription start sites (TSS) indicate that cell death, cell cycle, cellular function, and maintenance were predominant in host under systemic APEC. CONCLUSIONS: With the use of RNAseq technology and bioinformatics tools, this study provides a portrait of the AS event and NMD in primary lymphoid tissues, which play critical roles in host homeostasis under systemic APEC infection. According to this study, AS plays a pivotal regulatory role in the immune response in chicken under systemic APEC infection via either NMD or alternative TSSs. This study elucidates the regulatory role of AS for the immune complex under systemic APEC infection. 
+ 28240544	 Facile and simple method is developed to synthesize silver-nanoparticle-decorated quercetin nanoparticles (QA NPs). Modification suggests that synergistic quercetin (Qe) improves the antibacterial effect of silver nanoparticles (Ag NPs). Characterization experiment indicates that QA NPs have a diameter of approximately 10 nm. QA NPs show highly effective antibacterial activities against drug-resistant Escherichia coli (E. coli) and Staphylococcus aureus (S. aureus). We explore antibacterial mechanisms using S. aureus and E. coli treated  with QA NPs. Through morphological changes in E. coli and S. aureus, mechanisms are examined for bacterial damage caused by particulate matter from local dissociation of silver ion and Qe from QA NPs trapped inside membranes. Moreover, we note that gene expression profiling methods, such as RNA sequencing, can be used to predict discover mechanisms of toxicity of QA NPs. Gene ontology (GO) assay analyses demonstrate the molecular mechanism of the antibacterial effect of QA NPs. Regarding cellular component ontology, "cell wall organization or biogenesis" (GO: 0071554) and "cell wall macromolecule metabolic process" (GO: 0044036) are the most represented categories. The present study reports that transcriptome analysis of the mechanism offers novel insights into the molecular  mechanism of antibacterial assays. 
+ 28174601	 BACKGROUND: Lignin is a potential biorefinery feedstock for the production of value-added chemicals including vanillin. A huge amount of lignin is produced as  a by-product of the paper industry, while cellulosic components of plant biomass  are utilized for the production of paper pulp. In spite of vast potential, lignin remains the least exploited component of plant biomass due to its extremely complex and heterogenous structure. Several enzymes have been reported to have lignin-degrading properties and could be potentially used in lignin biorefining if their catalytic properties could be improved by enzyme engineering. The much needed improvement of lignin-degrading enzymes by high-throughput selection techniques such as directed evolution is currently limited, as robust methods for detecting the conversion of lignin to desired small molecules are not available. RESULTS: We identified a vanillin-inducible promoter by RNAseq analysis of Escherichia coli cells treated with a sublethal dose of vanillin and developed a  genetically programmed vanillin-sensing cell by placing the 'very green fluorescent protein' gene under the control of this promoter. Fluorescence of the biosensing cell is enhanced significantly when grown in the presence of vanillin  and is readily visualized by fluorescence microscopy. The use of fluorescence-activated cell sorting analysis further enhances the sensitivity, enabling dose-dependent detection of as low as 200 µM vanillin. The biosensor is  highly specific to vanillin and no major response is elicited by the presence of  lignin, lignin model compound, DMSO, vanillin analogues or non-specific toxic chemicals. CONCLUSIONS: We developed an engineered E. coli cell that can detect vanillin at  a concentration as low as 200 µM. The vanillin-sensing cell did not show cross-reactivity towards lignin or major lignin degradation products including vanillin analogues. This engineered E. coli cell could potentially be used as a host cell for screening lignin-degrading enzymes that can convert lignin to vanillin. 
+ 28060822	 Mosquitoes host communities of microbes in their digestive tract that consist primarily of bacteria. We previously reported that Aedes aegypti larvae colonized by a native community of bacteria and gnotobiotic larvae colonized by only Escherichia coli develop very similarly into adults, whereas axenic larvae never  molt and die as first instars. In this study, we extended these findings by first comparing the growth and abundance of bacteria in conventional, gnotobiotic, and  axenic larvae during the first instar. Results showed that conventional and gnotobiotic larvae exhibited no differences in growth, timing of molting, or number of bacteria in their digestive tract. Axenic larvae in contrast grew minimally and never achieved the critical size associated with molting by conventional and gnotobiotic larvae. In the second part of the study we compared  patterns of gene expression in conventional, gnotobiotic and axenic larvae by conducting an RNAseq analysis of gut and nongut tissues (carcass) at 22 h post-hatching. Approximately 12% of Ae. aegypti transcripts were differentially expressed in axenic versus conventional or gnotobiotic larvae. However, this profile consisted primarily of transcripts in seven categories that included the  down-regulation of select peptidases in the gut and up-regulation of several genes in the gut and carcass with roles in amino acid transport, hormonal signaling, and metabolism. Overall, our results indicate that axenic larvae exhibit alterations in gene expression consistent with defects in acquisition and assimilation of nutrients required for growth. 
+ 27876680	 Recent advances in high-throughput sequencing have led to an explosion in the rate of small regulatory RNAs (sRNAs) discovery among bacteria. However, only a handful of them are functionally characterized. Most of the time, little to no targets are known. In Lalaouna et al. (2015), we proposed a new technology to uncover sRNAs targetome, which is based on the MS2-affinity purification (MAPS).  We were able to prove its efficiency by applying it on well-characterized sRNAs of Escherichia coli. Thereafter, we adapted the procedure to other kind of RNA (mRNAs and tRNA-derived RNA fragments) and bacteria (pathogenic or Gram-positive  strains). Here, we clearly report all improvements and adjustments made to MAPS technology since it was originally reported. 
+ 27856567	 The enteric pathogen Escherichia coli O157:H7 Sakai (EHEC) is able to grow at lower temperatures compared to commensal E. coli. Growth at environmental conditions displays complex challenges different to those in a host. EHEC was grown at 37°C and at 14°C with 4% NaCl, a combination of cold and osmotic stress  as present in the food chain. Comparison of RNAseq and RIBOseq data provided a snap shot of ongoing transcription and translation, differentiating transcriptional and post-transcriptional gene regulation, respectively. Indeed, cold and osmotic stress related genes are simultaneously regulated at both levels, but translational regulation clearly dominates. Special emphasis was given to genes regulated by RNA secondary structures in their 5'UTRs, such as RNA thermometers and riboswitches, or genes controlled by small RNAs encoded in trans. The results reveal large differences in gene expression between short-time shock  compared to adaptation in combined cold and osmotic stress. Whereas the majority  of cold shock proteins, such as CspA, are translationally downregulated after adaptation, many osmotic stress genes are still significantly upregulated mainly  translationally, but several also transcriptionally. 
+ 27466434	 Avian pathogenic Escherichia coli (APEC) can cause significant morbidity in chickens. The thymus provides the essential environment for T cell development; however, the thymus transcriptome has not been examined for gene expression in response to APEC infection. An improved understanding of the host genomic response to APEC infection could inform future breeding programs for disease resistance and APEC control. We therefore analyzed the transcriptome of the thymus of birds challenged with APEC, contrasting susceptible and resistant phenotypes. Thousands of genes were differentially expressed in birds of the 5-day post infection (dpi) challenged-susceptible group vs. 5 dpi non-challenged, in 5 dpi challenged-susceptible vs. 5 dpi challenged-resistant birds, as well as  in 5 dpi vs. one dpi challenged-susceptible birds. The Toll-like receptor signaling pathway was the major innate immune response for birds to respond to APEC infection. Moreover, lysosome and cell adhesion molecules pathways were common mechanisms for chicken response to APEC infection. The T-cell receptor signaling pathway, cell cycle, and p53 signaling pathways were significantly activated in resistant birds to resist APEC infection. These results provide a comprehensive assessment of global gene networks and biological functionalities of differentially expressed genes in the thymus under APEC infection. These findings provide novel insights into key molecular genetic mechanisms that differentiate host resistance from susceptibility in this primary lymphoid tissue, the thymus. 
+ 27424527	 Thermobifida fusca is a thermophilic actinobacterium. T. fusca muC obtained by adaptive evolution preferred yeast extract to ammonium sulfate for accumulating malic acid and ammonium sulfate for cell growth. We did transcriptome analysis of T. fusca muC on Avicel and cellobiose with addition of ammonium sulfate or yeast  extract, respectively by RNAseq. The transcriptional results indicate that ammonium sulfate induced the transcriptions of the genes related to carbohydrate  metabolisms significantly more than yeast extract. Importantly, Tfu_2487, encoding histidine-containing protein (HPr), didn't transcribe on yeast extract at all, while it transcribed highly on ammonium sulfate. In order to understand the impact of HPr on malate production and cell growth of the muC strain, we deleted Tfu_2487 to get a mutant strain: muCΔ2487, which had 1.33 mole/mole-glucose equivalent malate yield, much higher than that on yeast extract. We then developed an E. coli-T. fusca shuttle plasmid for over-expressing HPr in muCΔ2487, a strain without HPr background, forming the muCΔ2487S strain. The muCΔ2487S strain had a much lower malate yield but faster cell growth than the muC strain. The results of both mutant strains confirmed that HPr was the key regulatory protein for T. fusca's metabolisms on nitrogen sources. 
+ 27336699	 Our objective was to identify the biological response and the cross-talk between  liver and mammary tissue after intramammary infection (IMI) with Escherichia coli (E. coli) using RNAseq technology. Sixteen cows were inoculated with live E. coli into one mammary quarter at ~4-6 weeks in lactation. For all cows, biopsies were  performed at -144, 12 and 24 h relative to IMI in liver and at 24 h post-IMI in infected and non-infected (control) mammary quarters. For a subset of cows (n = 6), RNA was extracted from both liver and mammary tissue and sequenced using a 100 bp paired-end approach. Ingenuity Pathway Analysis and the Dynamic Impact Approach analysis of differentially expressed genes (overall effect False Discovery Rate≤0.05) indicated that IMI induced an overall activation of inflammation at 12 h post-IMI and a strong inhibition of metabolism, especially related to lipid, glucose, and xenobiotics at 24 h post-IMI in liver. The data indicated in mammary tissue an overall induction of inflammatory response with little effect on metabolism at 24 h post-IMI. We identified a large number of up-stream regulators potentially involved in the response to IMI in both tissues  but a relatively small core network of transcription factors controlling the response to IMI for liver whereas a large network in mammary tissue. Transcriptomic results in liver and mammary tissue were supported by changes in inflammatory and metabolic mediators in blood and milk. The analysis of potential cross-talk between the two tissues during IMI uncovered a large communication from the mammary tissue to the liver to coordinate the inflammatory response but  a relatively small communication from the liver to the mammary tissue. Our results indicate a strong induction of the inflammatory response in mammary tissue and impairment of liver metabolism 24h post-IMI partly driven by the signaling from infected mammary tissue. 
+ 26818886	 Volatile organic compounds (VOCs) are commonly used as solvents in various industrial settings. Many of them present a challenge to receiving environments,  due to their toxicity and low bioavailability for degradation. Microorganisms are capable of sensing and responding to their surroundings and this makes them ideal detectors for toxic compounds. This study investigates the global transcriptomic  responses of Escherichia coli K-12 to selected VOCs at sub-toxic levels. Cells grown in the presence of VOCs were harvested during exponential growth, followed  by whole transcriptome shotgun sequencing (RNAseq). The analysis of the data revealed both shared and unique genetic responses compared to cells without exposure to VOCs. Results suggest that various functional gene categories, for example, those relating to Fe/S cluster biogenesis, oxidative stress responses and transport proteins, are responsive to selected VOCs in E. coli. The differential expression (DE) of genes was validated using GFP-promoter fusion assays. A variety of genes were differentially expressed even at non-inhibitory concentrations and when the cells are at their balanced-growth. Some of these genes belong to generic stress response and others could be specific to VOCs. Such candidate genes and their regulatory elements could be used as the basis for designing biosensors for selected VOCs. 
+ 25177315	 Efficient microbial conversion of lignocellulosic hydrolysates to biofuels is a key barrier to the economically viable deployment of lignocellulosic biofuels. A  chief contributor to this barrier is the impact on microbial processes and energy metabolism of lignocellulose-derived inhibitors, including phenolic carboxylates, phenolic amides (for ammonia-pretreated biomass), phenolic aldehydes, and furfurals. To understand the bacterial pathways induced by inhibitors present in  ammonia-pretreated biomass hydrolysates, which are less well studied than acid-pretreated biomass hydrolysates, we developed and exploited synthetic mimics of ammonia-pretreated corn stover hydrolysate (ACSH). To determine regulatory responses to the inhibitors normally present in ACSH, we measured transcript and  protein levels in an Escherichia coli ethanologen using RNA-seq and quantitative  proteomics during fermentation to ethanol of synthetic hydrolysates containing or lacking the inhibitors. Our study identified four major regulators mediating these responses, the MarA/SoxS/Rob network, AaeR, FrmR, and YqhC. Induction of these regulons was correlated with a reduced rate of ethanol production, buildup  of pyruvate, depletion of ATP and NAD(P)H, and an inhibition of xylose conversion. The aromatic aldehyde inhibitor 5-hydroxymethylfurfural appeared to be reduced to its alcohol form by the ethanologen during fermentation, whereas phenolic acid and amide inhibitors were not metabolized. Together, our findings establish that the major regulatory responses to lignocellulose-derived inhibitors are mediated by transcriptional rather than translational regulators,  suggest that energy consumed for inhibitor efflux and detoxification may limit biofuel production, and identify a network of regulators for future synthetic biology efforts. 
+ 25085508	 BACKGROUND: Burkholderia pseudomallei is a facultative intracellular pathogen and the causative agent of melioidosis. A conserved type III secretion system (T3SS3) and type VI secretion system (T6SS1) are critical for intracellular survival and  growth. The T3SS3 and T6SS1 genes are coordinately and hierarchically regulated by a TetR-type regulator, BspR. A central transcriptional regulator of the BspR regulatory cascade, BsaN, activates a subset of T3SS3 and T6SS1 loci. RESULTS: To elucidate the scope of the BsaN regulon, we used RNAseq analysis to compare the transcriptomes of wild-type B. pseudomallei KHW and a bsaN deletion mutant. The 60 genes positively-regulated by BsaN include those that we had previously identified in addition to a polyketide biosynthesis locus and genes involved in amino acid biosynthesis. BsaN was also found to repress the transcription of 51 genes including flagellar motility loci and those encoding components of the T3SS3 apparatus. Using a promoter-lacZ fusion assay in E. coli, we show that BsaN together with the chaperone BicA directly control the expression of the T3SS3 translocon, effector and associated regulatory genes that are organized into at least five operons (BPSS1516-BPSS1552). Using a mutagenesis approach, a consensus regulatory motif in the promoter regions of BsaN-regulated  genes was shown to be essential for transcriptional activation. CONCLUSIONS: BsaN/BicA functions as a central regulator of key virulence clusters in B. pseudomallei within a more extensive network of genetic regulation. We propose that BsaN/BicA controls a gene expression program that facilitates the adaption and intracellular survival of the pathogen within eukaryotic hosts. 
+ 23203983	 The 20th annual Database Issue of Nucleic Acids Research includes 176 articles, half of which describe new online molecular biology databases and the other half  provide updates on the databases previously featured in NAR and other journals. This year's highlights include two databases of DNA repeat elements; several databases of transcriptional factors and transcriptional factor-binding sites; databases on various aspects of protein structure and protein-protein interactions; databases for metagenomic and rRNA sequence analysis; and four databases specifically dedicated to Escherichia coli. The increased emphasis on using the genome data to improve human health is reflected in the development of  the databases of genomic structural variation (NCBI's dbVar and EBI's DGVa), the  NIH Genetic Testing Registry and several other databases centered on the genetic  basis of human disease, potential drugs, their targets and the mechanisms of protein-ligand binding. Two new databases present genomic and RNAseq data for monkeys, providing wealth of data on our closest relatives for comparative genomics purposes. The NAR online Molecular Biology Database Collection, available at http://www.oxfordjournals.org/nar/database/a/, has been updated and  currently lists 1512 online databases. The full content of the Database Issue is  freely available online on the Nucleic Acids Research website (http://nar.oxfordjournals.org/). 
+ 22821568	 RNAsnap™ is a simple and novel method that recovers all intracellular RNA quantitatively (>99%), faster (<15 min) and less expensively (∼3 cents/sample) than any of the currently available RNA isolation methods. In fact, none of the bacterial RNA isolation methods, including the commercial kits, are effective in  recovering all species of intracellular RNAs (76-5700 nt) with equal efficiency,  which can lead to biased results in genome-wide studies involving microarray or RNAseq analysis. The RNAsnap™ procedure yields ∼60 µg of RNA from 10(8) Escherichia coli cells that can be used directly for northern analysis without any further purification. Based on a comparative analysis of specific transcripts ranging in size from 76 to 5700 nt, the RNAsnap™ method provided the most accurate measure of the relative amounts of the various intracellular RNAs. Furthermore, the RNAsnap™ RNA was successfully used in enzymatic reactions such as RNA ligation, reverse transcription, primer extension and reverse transcriptase-polymerase chain reaction, following sodium acetate/ethanol precipitation. The RNAsnap™ method can be used to isolate RNA from a wide range of Gram-negative and Gram-positive bacteria as well as yeast. 
+ 22689638	 Translational efficiency is controlled by tRNAs and other genome-encoded mechanisms. In organelles, translational processes are dramatically altered because of genome shrinkage and horizontal acquisition of gene products. The influence of genome reduction on translation in endosymbionts is largely unknown. Here, we investigate whether divergent lineages of Buchnera aphidicola, the reduced-genome bacterial endosymbiont of aphids, possess altered translational features compared with their free-living relative, Escherichia coli. Our RNAseq data support the hypothesis that translation is less optimal in Buchnera than in  E. coli. We observed a specific, convergent, pattern of tRNA loss in Buchnera and other endosymbionts that have undergone genome shrinkage. Furthermore, many modified nucleoside pathways that are important for E. coli translation are lost  in Buchnera. Additionally, Buchnera's A + T compositional bias has resulted in reduced tRNA thermostability, and may have altered aminoacyl-tRNA synthetase recognition sites. Buchnera tRNA genes are shorter than those of E. coli, as the  majority no longer has a genome-encoded 3' CCA; however, all the expressed, shortened tRNAs undergo 3' CCA maturation. Moreover, expression of tRNA isoacceptors was not correlated with the usage of corresponding codons. Overall,  our data suggest that endosymbiont genome evolution alters tRNA characteristics that are known to influence translational efficiency in their free-living relative. 
--- a/outRNAseq_oneClass/useless.out 0 → 100644
View file @a5eecdb
+++ b/outRNAseq_oneClass/useless.out 0 → 100644
View file @a5eecdb
+ 29433444	 BACKGROUND: Due to the DNA triplet code, it is possible that the sequences of two or more protein-coding genes overlap to a large degree. However, such non-trivial overlaps are usually excluded by genome annotation pipelines and, thus, only a few overlapping gene pairs have been described in bacteria. In contrast, transcriptome and translatome sequencing reveals many signals originated from the antisense strand of annotated genes, of which we analyzed an example gene pair in more detail. RESULTS: A small open reading frame of Escherichia coli O157:H7 strain Sakai (EHEC), designated laoB (L-arginine responsive overlapping gene), is embedded in  reading frame -2 in the antisense strand of ECs5115, encoding a CadC-like transcriptional regulator. This overlapping gene shows evidence of transcription  and translation in Luria-Bertani (LB) and brain-heart infusion (BHI) medium based on RNA sequencing (RNAseq) and ribosomal-footprint sequencing (RIBOseq). The transcriptional start site is 289 base pairs (bp) upstream of the start codon and transcription termination is 155 bp downstream of the stop codon. Overexpression  of LaoB fused to an enhanced green fluorescent protein (EGFP) reporter was possible. The sequence upstream of the transcriptional start site displayed strong promoter activity under different conditions, whereas promoter activity was significantly decreased in the presence of L-arginine. A strand-specific translationally arrested mutant of laoB provided a significant growth advantage in competitive growth experiments in the presence of L-arginine compared to the wild type, which returned to wild type level after complementation of laoB in trans. A phylostratigraphic analysis indicated that the novel gene is restricted  to the Escherichia/Shigella clade and might have originated recently by overprinting leading to the expression of part of the antisense strand of ECs5115. 
CONCLUSIONS: Here, we present evidence of a novel small protein-coding gene laoB  encoded in the antisense frame -2 of the annotated gene ECs5115. Clearly, laoB is evolutionarily young and it originated in the Escherichia/Shigella clade by overprinting, a process which may cause the de novo evolution of bacterial genes  like laoB. 
+ 28902868	 In the past, short protein-coding genes were often disregarded by genome annotation pipelines. Transcriptome sequencing (RNAseq) signals outside of annotated genes have usually been interpreted to indicate either ncRNA or pervasive transcription. Therefore, in addition to the transcriptome, the translatome (RIBOseq) of the enteric pathogen Escherichia coli O157:H7 strain Sakai was determined at two optimal growth conditions and a severe stress condition combining low temperature and high osmotic pressure. All intergenic open reading frames potentially encoding a protein of ≥ 30 amino acids were investigated with regard to coverage by transcription and translation signals and their translatability expressed by the ribosomal coverage value. This led to discovery of 465 unique, putative novel genes not yet annotated in this E. coli strain, which are evenly distributed over both DNA strands of the genome. For 255 of the novel genes, annotated homologs in other bacteria were found, and a machine-learning algorithm, trained on small protein-coding E. coli genes, predicted that 89% of these translated open reading frames represent bona fide genes. The remaining 210 putative novel genes without annotated homologs were compared to the 255 novel genes with homologs and to 250 short annotated genes of this E. coli strain. All three groups turned out to be similar with respect to their translatability distribution, fractions of differentially regulated genes,  secondary structure composition, and the distribution of evolutionary constraint, suggesting that both novel groups represent legitimate genes. However, the machine-learning algorithm only recognized a small fraction of the 210 genes without annotated homologs. It is possible that these genes represent a novel group of genes, which have unusual features dissimilar to the genes of the machine-learning algorithm training set. 
+ 28245801	 BACKGROUND: While NGS allows rapid global detection of transcripts, it remains difficult to distinguish ncRNAs from short mRNAs. To detect potentially translated RNAs, we developed an improved protocol for bacterial ribosomal footprinting (RIBOseq). This allowed distinguishing ncRNA from mRNA in EHEC. A high ratio of ribosomal footprints per transcript (ribosomal coverage value, RCV) is expected to indicate a translated RNA, while a low RCV should point to a non-translated RNA. RESULTS: Based on their low RCV, 150 novel non-translated EHEC transcripts were identified as putative ncRNAs, representing both antisense and intergenic transcripts, 74 of which had expressed homologs in E. coli MG1655. Bioinformatics analysis predicted statistically significant target regulons for 15 of the intergenic transcripts; experimental analysis revealed 4-fold or higher differential expression of 46 novel ncRNA in different growth media. Out of 329 annotated EHEC ncRNAs, 52 showed an RCV similar to protein-coding genes, of those, 16 had RIBOseq patterns matching annotated genes in other enterobacteriaceae, and 11 seem to possess a Shine-Dalgarno sequence, suggesting  that such ncRNAs may encode small proteins instead of being solely non-coding. To support that the RIBOseq signals are reflecting translation, we tested the ribosomal-footprint covered ORF of ryhB and found a phenotype for the encoded peptide in iron-limiting condition. CONCLUSION: Determination of the RCV is a useful approach for a rapid first-step  differentiation between bacterial ncRNAs and small mRNAs. Further, many known ncRNAs may encode proteins as well. 
+ 26911138	 BACKGROUND: Genomes of E. coli, including that of the human pathogen Escherichia  coli O157:H7 (EHEC) EDL933, still harbor undetected protein-coding genes which, apparently, have escaped annotation due to their small size and non-essential function. To find such genes, global gene expression of EHEC EDL933 was examined, using strand-specific RNAseq (transcriptome), ribosomal footprinting (translatome) and mass spectrometry (proteome). RESULTS: Using the above methods, 72 short, non-annotated protein-coding genes were detected. All of these showed signals in the ribosomal footprinting assay indicating mRNA translation. Seven were verified by mass spectrometry. Fifty-seven genes are annotated in other enterobacteriaceae, mainly as hypothetical genes; the remaining 15 genes constitute novel discoveries. In addition, protein structure and function were predicted computationally and compared between EHEC-encoded proteins and 100-times randomly shuffled proteins.  Based on this comparison, 61 of the 72 novel proteins exhibit predicted structural and functional features similar to those of annotated proteins. Many of the novel genes show differential transcription when grown under eleven diverse growth conditions suggesting environmental regulation. Three genes were found to confer a phenotype in previous studies, e.g., decreased cattle colonization. CONCLUSIONS: These findings demonstrate that ribosomal footprinting can be used to detect novel protein coding genes, contributing to the growing body of evidence that hypothetical genes are not annotation artifacts and opening an additional way to study their functionality. All 72 genes are taxonomically restricted and, therefore, appear to have evolved relatively recently de novo. 
+ 26307168	 Repeated extragenic palindromes (REPs) in the enterobacterial genomes are usually composed of individual palindromic units separated by linker sequences. A total of 355 annotated REPs are distributed along the Escherichia coli genome. RNA sequence (RNAseq) analysis showed that almost 80% of the REPs in E. coli are transcribed. The DNA sequence of REP325 showed that it is a cluster of six repeats, each with two palindromic units capable of forming cruciform structures  in supercoiled DNA. Here, we report that components of the REP325 element and at  least one of its RNA products play a role in bacterial nucleoid DNA condensation. These RNA not only are present in the purified nucleoid but bind to the bacterial nucleoid-associated HU protein as revealed by RNA IP followed by microarray analysis (RIP-Chip) assays. Deletion of REP325 resulted in a dramatic increase of the nucleoid size as observed using transmission electron microscopy (TEM), and expression of one of the REP325 RNAs, nucleoid-associated noncoding RNA 4 (naRNA4), from a plasmid restored the wild-type condensed structure. Independently, chromosome conformation capture (3C) analysis demonstrated physical connections among various REP elements around the chromosome. These connections are dependent in some way upon the presence of HU and the REP325 element; deletion of HU genes and/or the REP325 element removed the connections.  Finally, naRNA4 together with HU condensed DNA in vitro by connecting REP325 or other DNA sequences that contain cruciform structures in a pairwise manner as observed by atomic force microscopy (AFM). On the basis of our results, we propose molecular models to explain connections of remote cruciform structures mediated by HU and naRNA4. IMPORTANCE: Nucleoid organization in bacteria is being  studied extensively, and several models have been proposed. However, the molecular nature of the structural organization is not well understood. 
Here we characterized the role of a novel nucleoid-associated noncoding RNA, naRNA4, in nucleoid structures both in vivo and in vitro. We propose models to explain how naRNA4 together with nucleoid-associated protein HU connects remote DNA elements  for nucleoid condensation. We present the first evidence of a noncoding RNA together with a nucleoid-associated protein directly condensing nucleoid DNA. 
+ 26125937	 Adherent-invasive Escherichia coli (AIEC) strains are detected more frequently within mucosal lesions of patients with Crohn's disease (CD). The AIEC phenotype  consists of adherence and invasion of intestinal epithelial cells and survival within macrophages of these bacteria in vitro. Our aim was to identify candidate  transcripts that distinguish AIEC from non-invasive E. coli (NIEC) strains and might be useful for rapid and accurate identification of AIEC by culture-independent technology. We performed comparative RNA-Sequence (RNASeq) analysis using AIEC strain LF82 and NIEC strain HS during exponential and stationary growth. Differential expression analysis of coding sequences (CDS) homologous to both strains demonstrated 224 and 241 genes with increased and decreased expression, respectively, in LF82 relative to HS. Transition metal transport and siderophore metabolism related pathway genes were up-regulated, while glycogen metabolic and oxidation-reduction related pathway genes were down-regulated, in LF82. Chemotaxis related transcripts were up-regulated in LF82 during the exponential phase, but flagellum-dependent motility pathway genes were down-regulated in LF82 during the stationary phase. CDS that mapped only to the LF82 genome accounted for 747 genes. We applied an in silico subtractive genomics approach to identify CDS specific to AIEC by incorporating the genomes of 10 other previously phenotyped NIEC. From this analysis, 166 CDS mapped to the LF82  genome and lacked homology to any of the 11 human NIEC strains. We compared these CDS across 13 AIEC, but none were homologous in each. Four LF82 gene loci belonging to clustered regularly interspaced short palindromic repeats region (CRISPR)--CRISPR-associated (Cas) genes were identified in 4 to 6 AIEC and absent from all non-pathogenic bacteria. As previously reported, AIEC strains were enriched for pdu operon genes. One CDS, encoding an excisionase, was shared by 9  AIEC strains. 
Reverse transcription quantitative polymerase chain reaction assays for 6 genes were conducted on fecal and ileal RNA samples from 22 inflammatory bowel disease (IBD), and 32 patients without IBD (non-IBD). The expression of Cas loci was detected in a higher proportion of CD than non-IBD fecal and ileal RNA samples (p <0.05). These results support a comparative genomic/transcriptomic approach towards identifying candidate AIEC signature transcripts. 
+ 24927582	 The molecular mechanisms of ethanol toxicity and tolerance in bacteria, although  important for biotechnology and bioenergy applications, remain incompletely understood. Genetic studies have identified potential cellular targets for ethanol and have revealed multiple mechanisms of tolerance, but it remains difficult to separate the direct and indirect effects of ethanol. We used adaptive evolution to generate spontaneous ethanol-tolerant strains of Escherichia coli, and then characterized mechanisms of toxicity and resistance using genome-scale DNAseq, RNAseq, and ribosome profiling coupled with specific assays of ribosome and RNA polymerase function. Evolved alleles of metJ, rho, and rpsQ recapitulated most of the observed ethanol tolerance, implicating translation and transcription as key processes affected by ethanol. Ethanol induced miscoding errors during protein synthesis, from which the evolved rpsQ allele protected cells by increasing ribosome accuracy. Ribosome profiling and RNAseq analyses established that ethanol negatively affects transcriptional and translational processivity. Ethanol-stressed cells exhibited ribosomal stalling at internal AUG codons, which may be ameliorated by the adaptive inactivation of  the MetJ repressor of methionine biosynthesis genes. Ethanol also caused aberrant intragenic transcription termination for mRNAs with low ribosome density, which was reduced in a strain with the adaptive rho mutation. Furthermore, ethanol inhibited transcript elongation by RNA polymerase in vitro. We propose that ethanol-induced inhibition and uncoupling of mRNA and protein synthesis through direct effects on ribosomes and RNA polymerase conformations are major contributors to ethanol toxicity in E. coli, and that adaptive mutations in metJ, rho, and rpsQ help protect these central dogma processes in the presence of ethanol.