latest version

iarroyof
Commit 1e051ed3083ff3b1bc250e4e670e2fdd26773f7e 1e051ed3 1 parent 06d7dddb
Showing 11 changed files with 529 additions and 14 deletions
filter_abstracts.py.save
filter_abstracts_binClass.py
filter_abstracts_oneClass.py
filter_output/useful.out
filter_papers.py
model/svd_model.pkl
model/svm_model.paper.pkl
model/svm_model.pkl
model/tfidf_model.paper.pkl
outRNAseq/useful.out
outRNAseq/useless.out
--- a/filter_abstracts.py.save 0 → 100644
View file @1e051ed
+++ b/filter_abstracts.py.save 0 → 100644
View file @1e051ed
+#from pdb import set_trace as st
+from sklearn.cross_validation import train_test_split as splitt
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV
+from sklearn import metrics
+from sklearn.svm import SVC
+import numpy as np
+import argparse
+import csv
+import os
+from sklearn.externals import joblib
+from time import time
+from scipy.stats import randint as sp_randint
+from scipy.stats import expon
+from sklearn.preprocessing import label_binarize
+
+
+def get_abstracts(file_name, label):
+    """Parse a text file of abstracts into a list of per-document dicts.
+
+    The whole file is read into memory and scanned line by line. A line
+    containing 'DOI: ' closes the current record; the positions of the blank
+    lines seen since the previous record are used to locate the abstract
+    body. (Presumably the input is a PubMed plain-text export -- TODO
+    confirm.)
+
+    Args:
+        file_name: Path of the text file to read.
+        label: Value stored under the 'topic' key of every returned dict.
+
+    Returns:
+        A list of dicts with the keys 'body', 'title' and 'topic'. 'pmid'
+        (int) is added from the line after 'DOI: ' when that line contains
+        'PMID: ', or from the line after that one when the first contains
+        'PMCID: '; otherwise the key is absent.
+    """
+    # NOTE(review): this handle is never closed inside the function.
+    f = open(file_name)
+    extract = {}
+    docs = []
+    # Indices of the blank lines seen since the last completed record.
+    empties = []
+    lines = f.readlines()
+    # True once a copyright/publisher line was seen in the current record.
+    copyright = False
+
+    for i, ln in enumerate(lines):
+        if not ln.strip():
+            empties.append(i)
+            continue
+        elif ' doi: ' in ln:
+            # Look ahead (at most 10 lines, starting at this one) for the
+            # next blank line; the title is taken to start right after it.
+            # NOTE(review): lines[j] is not bounds-checked, so this can raise
+            # IndexError near the end of the file.
+            for j in range(i, i + 10):
+                if not lines[j].strip():
+                    title_idx = j + 1
+                    break
+            continue
+
+        elif 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
+            copyright = True
+
+        elif 'DOI: ' in ln:
+            # The id is the second whitespace-separated token of its line.
+            # NOTE(review): lines[i + 1] / lines[i + 2] are not bounds-checked.
+            if 'PMCID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 2].strip().split()[1])
+            elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 1].strip().split()[1])
+
+            # The body is the span between two consecutive recorded blank
+            # lines; when a copyright line was seen, the span one step
+            # earlier is used instead.
+            # NOTE(review): raises IndexError if fewer blank lines than
+            # needed were recorded since the previous record.
+            if copyright:
+                get = slice(empties[-3], empties[-2])
+                copyright = False
+            else:
+                get = slice(empties[-2], empties[-1])
+
+            # Newlines become spaces; double spaces are collapsed in one pass.
+            extract['body'] = " ".join(lines[get]).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            title = []
+            # Collect up to 5 consecutive non-blank lines as the title.
+            # NOTE(review): title_idx is only assigned in the ' doi: ' branch;
+            # a 'DOI: ' line reached before any ' doi: ' line raises
+            # UnboundLocalError here.
+            for j in range(title_idx, title_idx + 5):
+                if lines[j].strip():
+                    title.append(lines[j])
+                else:
+                    break
+            extract['title'] = " ".join(title).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            extract['topic'] = label
+            docs.append(extract)
+            # Start a fresh record.
+            empties = []
+            extract = {}
+
+    return docs
+
+
+# Command-line interface. Two modes are selected further below: training
+# (--classA and --classB given, --input absent) and prediction (every other
+# combination of arguments).
+# NOTE(review): several help strings are built from adjacent literals with no
+# separating space (e.g. "that" + "report", "to" + "be"), so the rendered
+# help text runs those words together.
+parser = argparse.ArgumentParser(
+    description="This script separates abstracts of biomedical papers that"
+            "report data from biomedical experiments from those that do not.")
+parser.add_argument("--input", help="Input file containing the abstracts to"
+                                "be predited.")
+parser.add_argument("--classA", help="Input file containing the abstracts of"
+                                "class A to be learned.")
+parser.add_argument("--classB", help="Input file containing the abstracts of"
+                                "class B to be learned.")
+parser.add_argument("--out", help="Path to the output directory "
+                     "(default='./filter_output')", default="filter_output")
+parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
+        "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
+
+args = parser.parse_args()
+
+# Maps the integer class ids to the names used for labelling training
+# records and for naming the prediction output files.
+labels = {0: 'useless', 1: 'useful'}
+
+# --- Training mode ---
+if args.classA and args.classB and not args.input:
+    vectorizer = TfidfVectorizer(binary=True)
+    print(vectorizer)
+    # Hyper-parameter candidates are read as CSV from the current working
+    # directory; each column header is used as a parameter name.
+    f0 = open("model_params.conf")
+    # In live code this is only used by the timing message printed below.
+    n_iter_search = 10
+    params = [p for p in csv.DictReader(f0)]
+    f0.close()
+    names = list(params[0].keys())
+    model_params = {n: [] for n in names}
+
+    # Collect, per parameter name, every value listed in the file: values
+    # that parse as float are stored as floats, anything else as a string.
+    for n in names:
+        for d in params:
+            for k in d:
+                if k == n:
+                    try:
+                        model_params[n].append(float(d[k]))
+                    except ValueError:
+                        model_params[n].append(d[k])
+
+    # Drop duplicate candidate values (set() does not keep the file order).
+    model_params = {k: list(set(model_params[k])) for k in model_params}
+    # classA documents are labelled 'useless', classB documents 'useful'.
+    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
+    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
+
+    # Fit TF-IDF on the abstract bodies, then reduce the resulting features
+    # to 200 components with truncated SVD.
+    tfidf_model = vectorizer.fit([x['body'] for x in abstracs])
+    X = tfidf_model.transform([x['body'] for x in abstracs])
+    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
+    svd_model = svd.fit(X)
+    X = svd_model.transform(X)
+    #y = [x['topic'] for x in abstracs]
+    # Binary targets: 0 for 'useless', 1 for anything else.
+    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]    
+
+    #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
+
+    # Grid search over the SVC parameters read above (cv=3, scoring='f1').
+    clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
+    clf = GridSearchCV(clf, cv=3,
+        param_grid=model_params,
+    # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+                                 n_jobs=-1, scoring='f1')
+    start = time()
+    clf.fit(X, y)
+
+    #clf.fit(X_train, y_train)
+    # NOTE(review): the count printed here is the constant n_iter_search, not
+    # the number of grid candidates that were actually evaluated.
+    print("GridSearch took %.2f seconds for %d candidates"
+      " parameter settings." % ((time() - start), n_iter_search))
+
+    print(clf.best_estimator_)
+    print()
+    print(clf.best_score_)
+    #print(metrics.f1_score(clf.predict(X_test), y_test))
+
+    # Persist the best SVC together with the fitted TF-IDF and SVD models.
+    # NOTE(review): the output paths are hard-coded; --svcmodel is not used
+    # when saving.
+    #joblib.dump(clf, 'model/svm_model.pkl')
+    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
+    joblib.dump(svd_model, 'model/svd_model.pkl')
+
+# --- Prediction mode ---
+# NOTE(review): also reached when --input is missing and the two class files
+# are not both given; args.input is then None.
+else:
+
+    # Only the SVC path is configurable; the TF-IDF and SVD models are always
+    # loaded from the fixed paths below.
+    clf = joblib.load(args.svcmodel)
+    vectorizer = joblib.load('model/tfidf_model.pkl')
+    svd = joblib.load('model/svd_model.pkl')
+    abstracs = get_abstracts(file_name=args.input, label='unknown')
+    X = vectorizer.transform([x['body'] for x in abstracs])
+    X = svd.transform(X)
+    classes = clf.predict(X)
+
+    if not os.path.exists(args.out):
+        os.makedirs(args.out)
+    # Writing predictions to output files
+    # One "pmid<TAB>body" line per abstract, routed to useless.out or
+    # useful.out by predicted class.
+    # NOTE(review): a['pmid'] raises KeyError for records in which
+    # get_abstracts found no PMID line.
+    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
+                    open(args.out + "/" + labels[1] + ".out", 'w') as f1:
+        for c, a in zip(classes, abstracs):
+            if c == 0:
+                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
+            elif c == 1:
+                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
--- a/filter_abstracts_binClass.py 0 → 100644
View file @1e051ed
+++ b/filter_abstracts_binClass.py 0 → 100644
View file @1e051ed
+#from pdb import set_trace as st
+from sklearn.cross_validation import train_test_split as splitt
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV
+from sklearn import metrics
+from sklearn.svm import SVC
+import numpy as np
+import argparse
+import csv
+import os
+from sklearn.externals import joblib
+from time import time
+from scipy.stats import randint as sp_randint
+from scipy.stats import expon
+from sklearn.preprocessing import label_binarize
+
+
+def get_abstracts(file_name, label):
+    """Parse a text file of abstracts into a list of per-document dicts.
+
+    The whole file is read into memory and scanned line by line. A line
+    containing 'DOI: ' closes the current record; the positions of the blank
+    lines seen since the previous record are used to locate the abstract
+    body. (Presumably the input is a PubMed plain-text export -- TODO
+    confirm.)
+
+    Args:
+        file_name: Path of the text file to read.
+        label: Value stored under the 'topic' key of every returned dict.
+
+    Returns:
+        A list of dicts with the keys 'body', 'title' and 'topic'. 'pmid'
+        (int) is added from the line after 'DOI: ' when that line contains
+        'PMID: ', or from the line after that one when the first contains
+        'PMCID: '; otherwise the key is absent.
+    """
+    # NOTE(review): this handle is never closed inside the function.
+    f = open(file_name)
+    extract = {}
+    docs = []
+    # Indices of the blank lines seen since the last completed record.
+    empties = []
+    lines = f.readlines()
+    # True once a copyright/publisher line was seen in the current record.
+    copyright = False
+
+    for i, ln in enumerate(lines):
+        if not ln.strip():
+            empties.append(i)
+            continue
+        elif ' doi: ' in ln:
+            # Look ahead (at most 10 lines, starting at this one) for the
+            # next blank line; the title is taken to start right after it.
+            # NOTE(review): lines[j] is not bounds-checked, so this can raise
+            # IndexError near the end of the file.
+            for j in range(i, i + 10):
+                if not lines[j].strip():
+                    title_idx = j + 1
+                    break
+            continue
+
+        elif 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
+            copyright = True
+
+        elif 'DOI: ' in ln:
+            # The id is the second whitespace-separated token of its line.
+            # NOTE(review): lines[i + 1] / lines[i + 2] are not bounds-checked.
+            if 'PMCID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 2].strip().split()[1])
+            elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 1].strip().split()[1])
+
+            # The body is the span between two consecutive recorded blank
+            # lines; when a copyright line was seen, the span one step
+            # earlier is used instead.
+            # NOTE(review): raises IndexError if fewer blank lines than
+            # needed were recorded since the previous record.
+            if copyright:
+                get = slice(empties[-3], empties[-2])
+                copyright = False
+            else:
+                get = slice(empties[-2], empties[-1])
+
+            # Newlines become spaces; double spaces are collapsed in one pass.
+            extract['body'] = " ".join(lines[get]).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            title = []
+            # Collect up to 5 consecutive non-blank lines as the title.
+            # NOTE(review): title_idx is only assigned in the ' doi: ' branch;
+            # a 'DOI: ' line reached before any ' doi: ' line raises
+            # UnboundLocalError here.
+            for j in range(title_idx, title_idx + 5):
+                if lines[j].strip():
+                    title.append(lines[j])
+                else:
+                    break
+            extract['title'] = " ".join(title).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            extract['topic'] = label
+            docs.append(extract)
+            # Start a fresh record.
+            empties = []
+            extract = {}
+
+    return docs
+
+
+# Command-line interface. Two modes are selected further below: training
+# (--classA and --classB given, --input absent) and prediction (every other
+# combination of arguments).
+# NOTE(review): several help strings are built from adjacent literals with no
+# separating space (e.g. "that" + "report", "to" + "be"), so the rendered
+# help text runs those words together.
+parser = argparse.ArgumentParser(
+    description="This script separates abstracts of biomedical papers that"
+            "report data from biomedical experiments from those that do not.")
+parser.add_argument("--input", help="Input file containing the abstracts to"
+                                "be predited.")
+parser.add_argument("--classA", help="Input file containing the abstracts of"
+                                "class A to be learned.")
+parser.add_argument("--classB", help="Input file containing the abstracts of"
+                                "class B to be learned.")
+parser.add_argument("--out", help="Path to the output directory "
+                     "(default='./filter_output')", default="filter_output")
+parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
+        "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
+
+args = parser.parse_args()
+
+# Maps the integer class ids to the names used for labelling training
+# records and for naming the prediction output files.
+labels = {0: 'useless', 1: 'useful'}
+# Built (and printed) in both modes; the prediction branch replaces it with
+# the vectorizer loaded from disk.
+vectorizer = TfidfVectorizer(binary=True)
+print(vectorizer)
+
+# --- Training mode ---
+if args.classA and args.classB and not args.input:
+    # Hyper-parameter candidates are read as CSV from the current working
+    # directory; each column header is used as a parameter name.
+    f0 = open("model_params.conf")
+    # In live code this is only used by the timing message printed below.
+    n_iter_search = 10
+    params = [p for p in csv.DictReader(f0)]
+    f0.close()
+    names = list(params[0].keys())
+    model_params = {n: [] for n in names}
+
+    # Collect, per parameter name, every value listed in the file: values
+    # that parse as float are stored as floats, anything else as a string.
+    for n in names:
+        for d in params:
+            for k in d:
+                if k == n:
+                    try:
+                        model_params[n].append(float(d[k]))
+                    except ValueError:
+                        model_params[n].append(d[k])
+
+    # Drop duplicate candidate values (set() does not keep the file order).
+    model_params = {k: list(set(model_params[k])) for k in model_params}
+    # classA documents are labelled 'useless', classB documents 'useful'.
+    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
+    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
+
+    # Fit TF-IDF on the abstract bodies, then reduce the resulting features
+    # to 200 components with truncated SVD.
+    tfidf_model = vectorizer.fit([x['body'] for x in abstracs])
+    X = tfidf_model.transform([x['body'] for x in abstracs])
+    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
+    svd_model = svd.fit(X)
+    X = svd_model.transform(X)
+    #y = [x['topic'] for x in abstracs]
+    # Binary targets: 0 for 'useless', 1 for anything else.
+    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]    
+
+    #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
+
+    # Grid search over the SVC parameters read above (cv=3, scoring='f1').
+    clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
+    clf = GridSearchCV(clf, cv=3,
+        param_grid=model_params,
+    # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+                                 n_jobs=-1, scoring='f1')
+    start = time()
+    clf.fit(X, y)
+
+    #clf.fit(X_train, y_train)
+    # NOTE(review): the count printed here is the constant n_iter_search, not
+    # the number of grid candidates that were actually evaluated.
+    print("GridSearch took %.2f seconds for %d candidates"
+      " parameter settings." % ((time() - start), n_iter_search))
+
+    print(clf.best_estimator_)
+    print()
+    print(clf.best_score_)
+    #print(metrics.f1_score(clf.predict(X_test), y_test))
+
+    # Persist the best SVC together with the fitted TF-IDF and SVD models.
+    # NOTE(review): the output paths are hard-coded; --svcmodel is not used
+    # when saving.
+    #joblib.dump(clf, 'model/svm_model.pkl')
+    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
+    joblib.dump(svd_model, 'model/svd_model.pkl')
+
+# --- Prediction mode ---
+# NOTE(review): also reached when --input is missing and the two class files
+# are not both given; args.input is then None.
+else:
+
+    # Only the SVC path is configurable; the TF-IDF and SVD models are always
+    # loaded from the fixed paths below.
+    clf = joblib.load(args.svcmodel)
+    vectorizer = joblib.load('model/tfidf_model.pkl')
+    svd = joblib.load('model/svd_model.pkl')
+    abstracs = get_abstracts(file_name=args.input, label='unknown')
+    X = vectorizer.transform([x['body'] for x in abstracs])
+    X = svd.transform(X)
+    classes = clf.predict(X)
+
+    if not os.path.exists(args.out):
+        os.makedirs(args.out)
+    # Writing predictions to output files
+    # One "pmid<TAB>body" line per abstract, routed to useless.out or
+    # useful.out by predicted class.
+    # NOTE(review): a['pmid'] raises KeyError for records in which
+    # get_abstracts found no PMID line.
+    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
+                    open(args.out + "/" + labels[1] + ".out", 'w') as f1:
+        for c, a in zip(classes, abstracs):
+            if c == 0:
+                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
+            elif c == 1:
+                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
--- a/filter_abstracts_oneClass.py 0 → 100644
View file @1e051ed
+++ b/filter_abstracts_oneClass.py 0 → 100644
View file @1e051ed
+#from pdb import set_trace as st
+from sklearn.cross_validation import train_test_split as splitt
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.decomposition import TruncatedSVD
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.model_selection import GridSearchCV
+from sklearn import metrics
+from sklearn.svm import SVC
+import numpy as np
+import argparse
+import csv
+import os
+from sklearn.externals import joblib
+from time import time
+from scipy.stats import randint as sp_randint
+from scipy.stats import expon
+from sklearn.preprocessing import label_binarize
+
+
+def get_abstracts(file_name, label):
+    """Parse a text file of abstracts into a list of per-document dicts.
+
+    The whole file is read into memory and scanned line by line. A line
+    containing 'DOI: ' closes the current record; the positions of the blank
+    lines seen since the previous record are used to locate the abstract
+    body. (Presumably the input is a PubMed plain-text export -- TODO
+    confirm.)
+
+    Args:
+        file_name: Path of the text file to read.
+        label: Value stored under the 'topic' key of every returned dict.
+
+    Returns:
+        A list of dicts with the keys 'body', 'title' and 'topic'. 'pmid'
+        (int) is added from the line after 'DOI: ' when that line contains
+        'PMID: ', or from the line after that one when the first contains
+        'PMCID: '; otherwise the key is absent.
+    """
+    # NOTE(review): this handle is never closed inside the function.
+    f = open(file_name)
+    extract = {}
+    docs = []
+    # Indices of the blank lines seen since the last completed record.
+    empties = []
+    lines = f.readlines()
+    # True once a copyright/publisher line was seen in the current record.
+    copyright = False
+
+    for i, ln in enumerate(lines):
+        if not ln.strip():
+            empties.append(i)
+            continue
+        elif ' doi: ' in ln:
+            # Look ahead (at most 10 lines, starting at this one) for the
+            # next blank line; the title is taken to start right after it.
+            # NOTE(review): lines[j] is not bounds-checked, so this can raise
+            # IndexError near the end of the file.
+            for j in range(i, i + 10):
+                if not lines[j].strip():
+                    title_idx = j + 1
+                    break
+            continue
+
+        elif 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
+            copyright = True
+
+        elif 'DOI: ' in ln:
+            # The id is the second whitespace-separated token of its line.
+            # NOTE(review): lines[i + 1] / lines[i + 2] are not bounds-checked.
+            if 'PMCID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 2].strip().split()[1])
+            elif not 'PMCID: ' in lines[i + 1] and 'PMID: ' in lines[i + 1]:
+                extract['pmid'] = int(lines[i + 1].strip().split()[1])
+
+            # The body is the span between two consecutive recorded blank
+            # lines; when a copyright line was seen, the span one step
+            # earlier is used instead.
+            # NOTE(review): raises IndexError if fewer blank lines than
+            # needed were recorded since the previous record.
+            if copyright:
+                get = slice(empties[-3], empties[-2])
+                copyright = False
+            else:
+                get = slice(empties[-2], empties[-1])
+
+            # Newlines become spaces; double spaces are collapsed in one pass.
+            extract['body'] = " ".join(lines[get]).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            title = []
+            # Collect up to 5 consecutive non-blank lines as the title.
+            # NOTE(review): title_idx is only assigned in the ' doi: ' branch;
+            # a 'DOI: ' line reached before any ' doi: ' line raises
+            # UnboundLocalError here.
+            for j in range(title_idx, title_idx + 5):
+                if lines[j].strip():
+                    title.append(lines[j])
+                else:
+                    break
+            extract['title'] = " ".join(title).replace("\n", ' '
+                                                        ).replace("  ", ' ')
+            extract['topic'] = label
+            docs.append(extract)
+            # Start a fresh record.
+            empties = []
+            extract = {}
+
+    return docs
+
+
+# Command-line interface. Two modes are selected further below: training
+# (--classA and --classB given, --input absent) and prediction (every other
+# combination of arguments).
+# NOTE(review): several help strings are built from adjacent literals with no
+# separating space (e.g. "that" + "report", "to" + "be"), so the rendered
+# help text runs those words together.
+parser = argparse.ArgumentParser(
+    description="This script separates abstracts of biomedical papers that"
+            "report data from biomedical experiments from those that do not.")
+parser.add_argument("--input", help="Input file containing the abstracts to"
+                                "be predited.")
+parser.add_argument("--classA", help="Input file containing the abstracts of"
+                                "class A to be learned.")
+parser.add_argument("--classB", help="Input file containing the abstracts of"
+                                "class B to be learned.")
+parser.add_argument("--out", help="Path to the output directory "
+                     "(default='./filter_output')", default="filter_output")
+parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
+        "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")
+
+args = parser.parse_args()
+
+# Maps the integer class ids to the names used for labelling training
+# records and for naming the prediction output files.
+labels = {0: 'useless', 1: 'useful'}
+# Built (and printed) in both modes; the prediction branch replaces it with
+# the vectorizer loaded from disk.
+vectorizer = TfidfVectorizer(binary=True)
+print(vectorizer)
+
+# --- Training mode ---
+if args.classA and args.classB and not args.input:
+    # Hyper-parameter candidates are read as CSV from the current working
+    # directory; each column header is used as a parameter name.
+    f0 = open("model_params.conf")
+    # In live code this is only used by the timing message printed below.
+    n_iter_search = 10
+    params = [p for p in csv.DictReader(f0)]
+    f0.close()
+    names = list(params[0].keys())
+    model_params = {n: [] for n in names}
+
+    # Collect, per parameter name, every value listed in the file: values
+    # that parse as float are stored as floats, anything else as a string.
+    for n in names:
+        for d in params:
+            for k in d:
+                if k == n:
+                    try:
+                        model_params[n].append(float(d[k]))
+                    except ValueError:
+                        model_params[n].append(d[k])
+
+    # Drop duplicate candidate values (set() does not keep the file order).
+    model_params = {k: list(set(model_params[k])) for k in model_params}
+    # classA documents are labelled 'useless', classB documents 'useful'.
+    abstracs = get_abstracts(file_name=args.classA, label=labels[0])
+    abstracs += get_abstracts(file_name=args.classB, label=labels[1])
+
+    # Fit TF-IDF on the abstract bodies, then reduce the resulting features
+    # to 200 components with truncated SVD.
+    tfidf_model = vectorizer.fit([x['body'] for x in abstracs])
+    X = tfidf_model.transform([x['body'] for x in abstracs])
+    svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
+    svd_model = svd.fit(X)
+    X = svd_model.transform(X)
+    #y = [x['topic'] for x in abstracs]
+    # Binary targets: 0 for 'useless', 1 for anything else.
+    y = [0 if x['topic'] == 'useless' else 1 for x in abstracs]    
+
+    #X_train, X_test, y_train, y_test = splitt(X, y, test_size=0.3, random_state=42)
+
+    # Grid search over the SVC parameters read above (cv=3, scoring='f1').
+    clf = SVC()#kernel='linear', C=100.0, gamma=0.0001)# degree=11, coef0=0.9)
+    clf = GridSearchCV(clf, cv=3,
+        param_grid=model_params,
+    # clf = RandomizedSearchCV(clf, param_distributions=model_params, cv=5, n_iter=n_iter_search,
+                                 n_jobs=-1, scoring='f1')
+    start = time()
+    clf.fit(X, y)
+
+    #clf.fit(X_train, y_train)
+    # NOTE(review): the count printed here is the constant n_iter_search, not
+    # the number of grid candidates that were actually evaluated.
+    print("GridSearch took %.2f seconds for %d candidates"
+      " parameter settings." % ((time() - start), n_iter_search))
+
+    print(clf.best_estimator_)
+    print()
+    print(clf.best_score_)
+    #print(metrics.f1_score(clf.predict(X_test), y_test))
+
+    # Persist the best SVC together with the fitted TF-IDF and SVD models.
+    # NOTE(review): the output paths are hard-coded; --svcmodel is not used
+    # when saving.
+    #joblib.dump(clf, 'model/svm_model.pkl')
+    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
+    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')
+    joblib.dump(svd_model, 'model/svd_model.pkl')
+
+# --- Prediction mode ---
+# NOTE(review): also reached when --input is missing and the two class files
+# are not both given; args.input is then None.
+else:
+
+    # Only the SVC path is configurable; the TF-IDF and SVD models are always
+    # loaded from the fixed paths below.
+    clf = joblib.load(args.svcmodel)
+    vectorizer = joblib.load('model/tfidf_model.pkl')
+    svd = joblib.load('model/svd_model.pkl')
+    abstracs = get_abstracts(file_name=args.input, label='unknown')
+    X = vectorizer.transform([x['body'] for x in abstracs])
+    X = svd.transform(X)
+    classes = clf.predict(X)
+
+    if not os.path.exists(args.out):
+        os.makedirs(args.out)
+    # Writing predictions to output files
+    # One "pmid<TAB>body" line per abstract, routed to useless.out or
+    # useful.out by predicted class.
+    # NOTE(review): a['pmid'] raises KeyError for records in which
+    # get_abstracts found no PMID line.
+    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
+                    open(args.out + "/" + labels[1] + ".out", 'w') as f1:
+        for c, a in zip(classes, abstracs):
+            if c == 0:
+                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
+            elif c == 1:
+                f1.write("%d\t%s\n" % (a['pmid'], a['body']))
--- a/filter_output/useful.out
View file @1e051ed
+++ b/filter_output/useful.out
View file @1e051ed
--- a/filter_papers.py
View file @1e051ed
+++ b/filter_papers.py
View file @1e051ed
@@ -30,15 +30,14 @@ parser.add_argument("--svcmodel", help="Path to custom pretrained svc model"
         "(default='./model/svm_model.paper.pkl')", default="model/svm_model.paper.pkl")
 args = parser.parse_args()
+labels = {0: 'useless', 1: 'useful'}
-data=load_files(container_path=args.traind, encoding=None, 
+if args.traind and not args.input:
+    data=load_files(container_path=args.traind, encoding=None, 
                                                     decode_error='replace')
-labels = data.target_names
+    labels = data.target_names
-
+    vectorizer = TfidfVectorizer(binary=True)
-vectorizer = TfidfVectorizer(binary=True)
+    print(vectorizer)
-print(vectorizer)
-
-if args.train and not args.input:
     f0 = open("model_params.conf")
     n_iter_search = 10
     params = [p for p in csv.DictReader(f0)]
@@ -56,10 +55,9 @@ if args.train and not args.input:
                         model_params[n].append(d[k])
     model_params = {k: list(set(model_params[k])) for k in model_params}
-    papers = data.data
-    tfidf_model = vectorizer.fit(papers)
+    tfidf_model = vectorizer.fit(data.data)
-    X = vectorizer.transform(papers)
+    X = vectorizer.transform(data.data)
     #y = [x['topic'] for x in abstracs]
     y = data.target    
@@ -87,15 +85,15 @@ if args.train and not args.input:
     joblib.dump(tfidf_model, 'model/tfidf_model.paper.pkl')
 else:
-
+    from pdb import set_trace as st
     data=load_files(container_path=args.input, encoding=None,
                                                     decode_error='replace')
     clf = joblib.load(args.svcmodel)
     vectorizer = joblib.load('model/tfidf_model.paper.pkl')
-    papers = data.data
+    X = vectorizer.transform(data.data)
-    X = vectorizer.transform(papers)
-    classes = clf.predict(X)
+    classes = clf.predict(X)
+    st()
     if not os.path.exists(args.out):
         os.makedirs(args.out)
     # Writing predictions to output files
--- a/model/svd_model.pkl 0 → 100644
View file @1e051ed
+++ b/model/svd_model.pkl 0 → 100644
View file @1e051ed
--- a/model/svm_model.paper.pkl 0 → 100644
View file @1e051ed
+++ b/model/svm_model.paper.pkl 0 → 100644
View file @1e051ed
--- a/model/svm_model.pkl
View file @1e051ed
+++ b/model/svm_model.pkl
View file @1e051ed
--- a/model/tfidf_model.paper.pkl 0 → 100644
View file @1e051ed
+++ b/model/tfidf_model.paper.pkl 0 → 100644
View file @1e051ed
--- a/outRNAseq/useful.out 0 → 100644
View file @1e051ed
+++ b/outRNAseq/useful.out 0 → 100644
View file @1e051ed
+29484588	 Small regulatory RNAs (sRNAs) are ubiquitous regulatory molecules expressed in living cells. In prokaryotes, sRNAs usually bind to target mRNAs to either promote their degradation or interfere with translation initiation. Because a single sRNA can regulate a considerable number of target mRNAs, we seek to identify those targets rapidly and reliably. Here, we present a robust method based on the co-purification of target mRNAs bound to MS2-tagged sRNAs expressed  in vivo. After purification of the tagged-sRNA, we use RNAseq to determine the identity of all RNA interacting partners and their enrichment level. We describe  how to analyze the RNAseq data through the Galaxy Project Platform bioinformatics tools to identify new mRNA targets. This technique is applicable to most sRNAs of E. coli and Salmonella. 
+29433444	 BACKGROUND: Due to the DNA triplet code, it is possible that the sequences of two or more protein-coding genes overlap to a large degree. However, such non-trivial overlaps are usually excluded by genome annotation pipelines and, thus, only a few overlapping gene pairs have been described in bacteria. In contrast, transcriptome and translatome sequencing reveals many signals originated from the antisense strand of annotated genes, of which we analyzed an example gene pair in more detail. RESULTS: A small open reading frame of Escherichia coli O157:H7 strain Sakai (EHEC), designated laoB (L-arginine responsive overlapping gene), is embedded in  reading frame -2 in the antisense strand of ECs5115, encoding a CadC-like transcriptional regulator. This overlapping gene shows evidence of transcription  and translation in Luria-Bertani (LB) and brain-heart infusion (BHI) medium based on RNA sequencing (RNAseq) and ribosomal-footprint sequencing (RIBOseq). The transcriptional start site is 289 base pairs (bp) upstream of the start codon and transcription termination is 155 bp downstream of the stop codon. Overexpression  of LaoB fused to an enhanced green fluorescent protein (EGFP) reporter was possible. The sequence upstream of the transcriptional start site displayed strong promoter activity under different conditions, whereas promoter activity was significantly decreased in the presence of L-arginine. A strand-specific translationally arrested mutant of laoB provided a significant growth advantage in competitive growth experiments in the presence of L-arginine compared to the wild type, which returned to wild type level after complementation of laoB in trans. A phylostratigraphic analysis indicated that the novel gene is restricted  to the Escherichia/Shigella clade and might have originated recently by overprinting leading to the expression of part of the antisense strand of ECs5115. 
CONCLUSIONS: Here, we present evidence of a novel small protein-coding gene laoB  encoded in the antisense frame -2 of the annotated gene ECs5115. Clearly, laoB is evolutionarily young and it originated in the Escherichia/Shigella clade by overprinting, a process which may cause the de novo evolution of bacterial genes  like laoB. 
+28902868	 In the past, short protein-coding genes were often disregarded by genome annotation pipelines. Transcriptome sequencing (RNAseq) signals outside of annotated genes have usually been interpreted to indicate either ncRNA or pervasive transcription. Therefore, in addition to the transcriptome, the translatome (RIBOseq) of the enteric pathogen Escherichia coli O157:H7 strain Sakai was determined at two optimal growth conditions and a severe stress condition combining low temperature and high osmotic pressure. All intergenic open reading frames potentially encoding a protein of ≥ 30 amino acids were investigated with regard to coverage by transcription and translation signals and their translatability expressed by the ribosomal coverage value. This led to discovery of 465 unique, putative novel genes not yet annotated in this E. coli strain, which are evenly distributed over both DNA strands of the genome. For 255 of the novel genes, annotated homologs in other bacteria were found, and a machine-learning algorithm, trained on small protein-coding E. coli genes, predicted that 89% of these translated open reading frames represent bona fide genes. The remaining 210 putative novel genes without annotated homologs were compared to the 255 novel genes with homologs and to 250 short annotated genes of this E. coli strain. All three groups turned out to be similar with respect to their translatability distribution, fractions of differentially regulated genes,  secondary structure composition, and the distribution of evolutionary constraint, suggesting that both novel groups represent legitimate genes. However, the machine-learning algorithm only recognized a small fraction of the 210 genes without annotated homologs. It is possible that these genes represent a novel group of genes, which have unusual features dissimilar to the genes of the machine-learning algorithm training set. 
+28791299	 Increasing evidence that microRNAs (miRNAs) play important roles in the immune response against infectious agents suggests that miRNA might be exploitable as signatures of exposure to specific infectious agents. In order to identify potential early miRNA biomarkers of bacterial infections, human peripheral blood  mononuclear cells (hPBMCs) were exposed to two select agents, Burkholderia pseudomallei K96243 and Francisella tularensis SHU S4, as well as to the nonpathogenic control Escherichia coli DH5α. RNA samples were harvested at three  early time points, 30, 60, and 120 minutes postexposure, then sequenced. RNAseq analyses identified 87 miRNAs to be differentially expressed (DE) in a linear fashion. Of these, 31 miRNAs were tested using the miScript miRNA qPCR assay. Through RNAseq identification and qPCR validation, we identified differentially expressed miRNA species that may be involved in the early response to bacterial infections. Based upon its upregulation at early time points postexposure in two  different individuals, hsa-mir-30c-5p is a miRNA species that could be studied further as a potential biomarker for exposure to these gram-negative intracellular pathogens. Gene ontology functional analyses demonstrated that programmed cell death is the first ranking biological process associated with miRNAs that are upregulated in F. tularensis-exposed hPBMCs. 
+28614372	 Infection with Shiga toxin (Stx) producing Escherichia coli O157:H7 can cause the potentially fatal complication hemolytic uremic syndrome, and currently only supportive therapy is available. Lack of suitable animal models has hindered study of this disease. Induced human intestinal organoids (iHIOs), generated by in vitro differentiation of pluripotent stem cells, represent differentiated human intestinal tissue. We show that iHIOs with addition of human neutrophils can model E. coli intestinal infection and innate cellular responses. Commensal and O157:H7 introduced into the iHIO lumen replicated rapidly achieving high numbers. Commensal E. coli did not cause damage, and were completely contained within the lumen, suggesting defenses, such as mucus production, can constrain non-pathogenic strains. Some O157:H7 initially co-localized with cellular actin.  Loss of actin and epithelial integrity was observed after 4 hours. O157:H7 grew as filaments, consistent with activation of the bacterial SOS stress response. SOS is induced by reactive oxygen species (ROS), and O157:H7 infection increased  ROS production. Transcriptional profiling (RNAseq) demonstrated that both commensal and O157:H7 upregulated genes associated with gastrointestinal maturation, while infection with O157:H7 upregulated inflammatory responses, including interleukin 8 (IL-8). IL-8 is associated with neutrophil recruitment, and infection with O157:H7 resulted in recruitment of human neutrophils into the  iHIO tissue. 
+28270101	 BACKGROUND: Avian pathogenic E. coli (APEC) can lead to a loss in millions of dollars in poultry annually because of mortality and produce contamination. Studies have verified that many immune-related genes undergo changes in alternative splicing (AS), along with nonsense mediated decay (NMD), to regulate  the immune system under different conditions. Therefore, the splicing profiles of primary lymphoid tissues with systemic APEC infection need to be comprehensively  examined. RESULTS: Gene expression in RNAseq data were obtained for three different immune  tissues (bone marrow, thymus, and bursa) from three phenotype birds (non-challenged, resistant, and susceptible birds) at two time points. Alternative 5' splice sites and exon skipping/inclusion were identified as the major alternative splicing events in avian primary immune organs under systemic APEC infection. In this study, we detected hundreds of differentially-expressed-transcript-containing genes (DETs) between different phenotype birds at 5 days post-infection (dpi). DETs, PSAP and STT3A, with NMD have important functions under systemic APEC infection. DETs, CDC45, CDK1, RAG2,  POLR1B, PSAP, and DNASE1L3, from the same transcription start sites (TSS) indicate that cell death, cell cycle, cellular function, and maintenance were predominant in host under systemic APEC. CONCLUSIONS: With the use of RNAseq technology and bioinformatics tools, this study provides a portrait of the AS event and NMD in primary lymphoid tissues, which play critical roles in host homeostasis under systemic APEC infection. According to this study, AS plays a pivotal regulatory role in the immune response in chicken under systemic APEC infection via either NMD or alternative TSSs. This study elucidates the regulatory role of AS for the immune complex under systemic APEC infection. 
+28245801	 BACKGROUND: While NGS allows rapid global detection of transcripts, it remains difficult to distinguish ncRNAs from short mRNAs. To detect potentially translated RNAs, we developed an improved protocol for bacterial ribosomal footprinting (RIBOseq). This allowed distinguishing ncRNA from mRNA in EHEC. A high ratio of ribosomal footprints per transcript (ribosomal coverage value, RCV) is expected to indicate a translated RNA, while a low RCV should point to a non-translated RNA. RESULTS: Based on their low RCV, 150 novel non-translated EHEC transcripts were identified as putative ncRNAs, representing both antisense and intergenic transcripts, 74 of which had expressed homologs in E. coli MG1655. Bioinformatics analysis predicted statistically significant target regulons for 15 of the intergenic transcripts; experimental analysis revealed 4-fold or higher differential expression of 46 novel ncRNA in different growth media. Out of 329 annotated EHEC ncRNAs, 52 showed an RCV similar to protein-coding genes, of those, 16 had RIBOseq patterns matching annotated genes in other enterobacteriaceae, and 11 seem to possess a Shine-Dalgarno sequence, suggesting  that such ncRNAs may encode small proteins instead of being solely non-coding. To support that the RIBOseq signals are reflecting translation, we tested the ribosomal-footprint covered ORF of ryhB and found a phenotype for the encoded peptide in iron-limiting condition. CONCLUSION: Determination of the RCV is a useful approach for a rapid first-step  differentiation between bacterial ncRNAs and small mRNAs. Further, many known ncRNAs may encode proteins as well. 
+28240544	 Facile and simple method is developed to synthesize silver-nanoparticle-decorated quercetin nanoparticles (QA NPs). Modification suggests that synergistic quercetin (Qe) improves the antibacterial effect of silver nanoparticles (Ag NPs). Characterization experiment indicates that QA NPs have a diameter of approximately 10 nm. QA NPs show highly effective antibacterial activities against drug-resistant Escherichia coli (E. coli) and Staphylococcus aureus (S. aureus). We explore antibacterial mechanisms using S. aureus and E. coli treated  with QA NPs. Through morphological changes in E. coli and S. aureus, mechanisms are examined for bacterial damage caused by particulate matter from local dissociation of silver ion and Qe from QA NPs trapped inside membranes. Moreover, we note that gene expression profiling methods, such as RNA sequencing, can be used to predict discover mechanisms of toxicity of QA NPs. Gene ontology (GO) assay analyses demonstrate the molecular mechanism of the antibacterial effect of QA NPs. Regarding cellular component ontology, "cell wall organization or biogenesis" (GO: 0071554) and "cell wall macromolecule metabolic process" (GO: 0044036) are the most represented categories. The present study reports that transcriptome analysis of the mechanism offers novel insights into the molecular  mechanism of antibacterial assays. 
+28174601	 BACKGROUND: Lignin is a potential biorefinery feedstock for the production of value-added chemicals including vanillin. A huge amount of lignin is produced as  a by-product of the paper industry, while cellulosic components of plant biomass  are utilized for the production of paper pulp. In spite of vast potential, lignin remains the least exploited component of plant biomass due to its extremely complex and heterogenous structure. Several enzymes have been reported to have lignin-degrading properties and could be potentially used in lignin biorefining if their catalytic properties could be improved by enzyme engineering. The much needed improvement of lignin-degrading enzymes by high-throughput selection techniques such as directed evolution is currently limited, as robust methods for detecting the conversion of lignin to desired small molecules are not available. RESULTS: We identified a vanillin-inducible promoter by RNAseq analysis of Escherichia coli cells treated with a sublethal dose of vanillin and developed a  genetically programmed vanillin-sensing cell by placing the 'very green fluorescent protein' gene under the control of this promoter. Fluorescence of the biosensing cell is enhanced significantly when grown in the presence of vanillin  and is readily visualized by fluorescence microscopy. The use of fluorescence-activated cell sorting analysis further enhances the sensitivity, enabling dose-dependent detection of as low as 200 µM vanillin. The biosensor is  highly specific to vanillin and no major response is elicited by the presence of  lignin, lignin model compound, DMSO, vanillin analogues or non-specific toxic chemicals. CONCLUSIONS: We developed an engineered E. coli cell that can detect vanillin at  a concentration as low as 200 µM. The vanillin-sensing cell did not show cross-reactivity towards lignin or major lignin degradation products including vanillin analogues. This engineered E. 
coli cell could potentially be used as a host cell for screening lignin-degrading enzymes that can convert lignin to vanillin. 
+28060822	 Mosquitoes host communities of microbes in their digestive tract that consist primarily of bacteria. We previously reported that Aedes aegypti larvae colonized by a native community of bacteria and gnotobiotic larvae colonized by only Escherichia coli develop very similarly into adults, whereas axenic larvae never  molt and die as first instars. In this study, we extended these findings by first comparing the growth and abundance of bacteria in conventional, gnotobiotic, and  axenic larvae during the first instar. Results showed that conventional and gnotobiotic larvae exhibited no differences in growth, timing of molting, or number of bacteria in their digestive tract. Axenic larvae in contrast grew minimally and never achieved the critical size associated with molting by conventional and gnotobiotic larvae. In the second part of the study we compared  patterns of gene expression in conventional, gnotobiotic and axenic larvae by conducting an RNAseq analysis of gut and nongut tissues (carcass) at 22 h post-hatching. Approximately 12% of Ae. aegypti transcripts were differentially expressed in axenic versus conventional or gnotobiotic larvae. However, this profile consisted primarily of transcripts in seven categories that included the  down-regulation of select peptidases in the gut and up-regulation of several genes in the gut and carcass with roles in amino acid transport, hormonal signaling, and metabolism. Overall, our results indicate that axenic larvae exhibit alterations in gene expression consistent with defects in acquisition and assimilation of nutrients required for growth. 
+27876680	 Recent advances in high-throughput sequencing have led to an explosion in the rate of small regulatory RNAs (sRNAs) discovery among bacteria. However, only a handful of them are functionally characterized. Most of the time, little to no targets are known. In Lalaouna et al. (2015), we proposed a new technology to uncover sRNAs targetome, which is based on the MS2-affinity purification (MAPS).  We were able to prove its efficiency by applying it on well-characterized sRNAs of Escherichia coli. Thereafter, we adapted the procedure to other kind of RNA (mRNAs and tRNA-derived RNA fragments) and bacteria (pathogenic or Gram-positive  strains). Here, we clearly report all improvements and adjustments made to MAPS technology since it was originally reported. 
+27856567	 The enteric pathogen Escherichia coli O157:H7 Sakai (EHEC) is able to grow at lower temperatures compared to commensal E. coli Growth at environmental conditions displays complex challenges different to those in a host. EHEC was grown at 37°C and at 14°C with 4% NaCl, a combination of cold and osmotic stress  as present in the food chain. Comparison of RNAseq and RIBOseq data provided a snap shot of ongoing transcription and translation, differentiating transcriptional and post-transcriptional gene regulation, respectively. Indeed, cold and osmotic stress related genes are simultaneously regulated at both levels, but translational regulation clearly dominates. Special emphasis was given to genes regulated by RNA secondary structures in their 5'UTRs, such as RNA thermometers and riboswitches, or genes controlled by small RNAs encoded in trans The results reveal large differences in gene expression between short-time shock  compared to adaptation in combined cold and osmotic stress. Whereas the majority  of cold shock proteins, such as CspA, are translationally downregulated after adaptation, many osmotic stress genes are still significantly upregulated mainly  translationally, but several also transcriptionally. 
+27466434	 Avian pathogenic Escherichia coli (APEC) can cause significant morbidity in chickens. The thymus provides the essential environment for T cell development; however, the thymus transcriptome has not been examined for gene expression in response to APEC infection. An improved understanding of the host genomic response to APEC infection could inform future breeding programs for disease resistance and APEC control. We therefore analyzed the transcriptome of the thymus of birds challenged with APEC, contrasting susceptible and resistant phenotypes. Thousands of genes were differentially expressed in birds of the 5-day post infection (dpi) challenged-susceptible group vs. 5 dpi non-challenged, in 5 dpi challenged-susceptible vs. 5 dpi challenged-resistant birds, as well as  in 5 dpi vs. one dpi challenged-susceptible birds. The Toll-like receptor signaling pathway was the major innate immune response for birds to respond to APEC infection. Moreover, lysosome and cell adhesion molecules pathways were common mechanisms for chicken response to APEC infection. The T-cell receptor signaling pathway, cell cycle, and p53 signaling pathways were significantly activated in resistant birds to resist APEC infection. These results provide a comprehensive assessment of global gene networks and biological functionalities of differentially expressed genes in the thymus under APEC infection. These findings provide novel insights into key molecular genetic mechanisms that differentiate host resistance from susceptibility in this primary lymphoid tissue, the thymus. 
+27424527	 Thermobifida fusca is a thermophilic actinobacterium. T. fusca muC obtained by adaptive evolution preferred yeast extract to ammonium sulfate for accumulating malic acid and ammonium sulfate for cell growth. We did transcriptome analysis of T. fusca muC on Avicel and cellobiose with addition of ammonium sulfate or yeast  extract, respectively by RNAseq. The transcriptional results indicate that ammonium sulfate induced the transcriptions of the genes related to carbohydrate  metabolisms significantly more than yeast extract. Importantly, Tfu_2487, encoding histidine-containing protein (HPr), didn't transcribe on yeast extract at all, while it transcribed highly on ammonium sulfate. In order to understand the impact of HPr on malate production and cell growth of the muC strain, we deleted Tfu_2487 to get a mutant strain: muCΔ2487, which had 1.33 mole/mole-glucose equivalent malate yield, much higher than that on yeast extract. We then developed an E. coli-T. fusca shuttle plasmid for over-expressing HPr in muCΔ2487, a strain without HPr background, forming the muCΔ2487S strain. The muCΔ2487S strain had a much lower malate yield but faster cell growth than the muC strain. The results of both mutant strains confirmed that HPr was the key regulatory protein for T. fusca's metabolisms on nitrogen sources. 
+27336699	 Our objective was to identify the biological response and the cross-talk between  liver and mammary tissue after intramammary infection (IMI) with Escherichia coli (E. coli) using RNAseq technology. Sixteen cows were inoculated with live E. coli into one mammary quarter at ~4-6 weeks in lactation. For all cows, biopsies were  performed at -144, 12 and 24 h relative to IMI in liver and at 24 h post-IMI in infected and non-infected (control) mammary quarters. For a subset of cows (n = 6), RNA was extracted from both liver and mammary tissue and sequenced using a 100 bp paired-end approach. Ingenuity Pathway Analysis and the Dynamic Impact Approach analysis of differentially expressed genes (overall effect False Discovery Rate≤0.05) indicated that IMI induced an overall activation of inflammation at 12 h post-IMI and a strong inhibition of metabolism, especially related to lipid, glucose, and xenobiotics at 24 h post-IMI in liver. The data indicated in mammary tissue an overall induction of inflammatory response with little effect on metabolism at 24 h post-IMI. We identified a large number of up-stream regulators potentially involved in the response to IMI in both tissues  but a relatively small core network of transcription factors controlling the response to IMI for liver whereas a large network in mammary tissue. Transcriptomic results in liver and mammary tissue were supported by changes in inflammatory and metabolic mediators in blood and milk. The analysis of potential cross-talk between the two tissues during IMI uncovered a large communication from the mammary tissue to the liver to coordinate the inflammatory response but  a relatively small communication from the liver to the mammary tissue. Our results indicate a strong induction of the inflammatory response in mammary tissue and impairment of liver metabolism 24h post-IMI partly driven by the signaling from infected mammary tissue. 
+26911138	 BACKGROUND: Genomes of E. coli, including that of the human pathogen Escherichia  coli O157:H7 (EHEC) EDL933, still harbor undetected protein-coding genes which, apparently, have escaped annotation due to their small size and non-essential function. To find such genes, global gene expression of EHEC EDL933 was examined, using strand-specific RNAseq (transcriptome), ribosomal footprinting (translatome) and mass spectrometry (proteome). RESULTS: Using the above methods, 72 short, non-annotated protein-coding genes were detected. All of these showed signals in the ribosomal footprinting assay indicating mRNA translation. Seven were verified by mass spectrometry. Fifty-seven genes are annotated in other enterobacteriaceae, mainly as hypothetical genes; the remaining 15 genes constitute novel discoveries. In addition, protein structure and function were predicted computationally and compared between EHEC-encoded proteins and 100-times randomly shuffled proteins.  Based on this comparison, 61 of the 72 novel proteins exhibit predicted structural and functional features similar to those of annotated proteins. Many of the novel genes show differential transcription when grown under eleven diverse growth conditions suggesting environmental regulation. Three genes were found to confer a phenotype in previous studies, e.g., decreased cattle colonization. CONCLUSIONS: These findings demonstrate that ribosomal footprinting can be used to detect novel protein coding genes, contributing to the growing body of evidence that hypothetical genes are not annotation artifacts and opening an additional way to study their functionality. All 72 genes are taxonomically restricted and, therefore, appear to have evolved relatively recently de novo. 
+26818886	 Volatile organic compounds (VOCs) are commonly used as solvents in various industrial settings. Many of them present a challenge to receiving environments,  due to their toxicity and low bioavailability for degradation. Microorganisms are capable of sensing and responding to their surroundings and this makes them ideal detectors for toxic compounds. This study investigates the global transcriptomic  responses of Escherichia coli K-12 to selected VOCs at sub-toxic levels. Cells grown in the presence of VOCs were harvested during exponential growth, followed  by whole transcriptome shotgun sequencing (RNAseq). The analysis of the data revealed both shared and unique genetic responses compared to cells without exposure to VOCs. Results suggest that various functional gene categories, for example, those relating to Fe/S cluster biogenesis, oxidative stress responses and transport proteins, are responsive to selected VOCs in E. coli. The differential expression (DE) of genes was validated using GFP-promoter fusion assays. A variety of genes were differentially expressed even at non-inhibitory concentrations and when the cells are at their balanced-growth. Some of these genes belong to generic stress response and others could be specific to VOCs. Such candidate genes and their regulatory elements could be used as the basis for designing biosensors for selected VOCs. 
+26307168	 Repeated extragenic palindromes (REPs) in the enterobacterial genomes are usually composed of individual palindromic units separated by linker sequences. A total of 355 annotated REPs are distributed along the Escherichia coli genome. RNA sequence (RNAseq) analysis showed that almost 80% of the REPs in E. coli are transcribed. The DNA sequence of REP325 showed that it is a cluster of six repeats, each with two palindromic units capable of forming cruciform structures  in supercoiled DNA. Here, we report that components of the REP325 element and at  least one of its RNA products play a role in bacterial nucleoid DNA condensation. These RNA not only are present in the purified nucleoid but bind to the bacterial nucleoid-associated HU protein as revealed by RNA IP followed by microarray analysis (RIP-Chip) assays. Deletion of REP325 resulted in a dramatic increase of the nucleoid size as observed using transmission electron microscopy (TEM), and expression of one of the REP325 RNAs, nucleoid-associated noncoding RNA 4 (naRNA4), from a plasmid restored the wild-type condensed structure. Independently, chromosome conformation capture (3C) analysis demonstrated physical connections among various REP elements around the chromosome. These connections are dependent in some way upon the presence of HU and the REP325 element; deletion of HU genes and/or the REP325 element removed the connections.  Finally, naRNA4 together with HU condensed DNA in vitro by connecting REP325 or other DNA sequences that contain cruciform structures in a pairwise manner as observed by atomic force microscopy (AFM). On the basis of our results, we propose molecular models to explain connections of remote cruciform structures mediated by HU and naRNA4.IMPORTANCE: Nucleoid organization in bacteria is being  studied extensively, and several models have been proposed. However, the molecular nature of the structural organization is not well understood. 
Here we characterized the role of a novel nucleoid-associated noncoding RNA, naRNA4, in nucleoid structures both in vivo and in vitro. We propose models to explain how naRNA4 together with nucleoid-associated protein HU connects remote DNA elements  for nucleoid condensation. We present the first evidence of a noncoding RNA together with a nucleoid-associated protein directly condensing nucleoid DNA. 
+26125937	 Adherent-invasive Escherichia coli (AIEC) strains are detected more frequently within mucosal lesions of patients with Crohn's disease (CD). The AIEC phenotype  consists of adherence and invasion of intestinal epithelial cells and survival within macrophages of these bacteria in vitro. Our aim was to identify candidate  transcripts that distinguish AIEC from non-invasive E. coli (NIEC) strains and might be useful for rapid and accurate identification of AIEC by culture-independent technology. We performed comparative RNA-Sequence (RNASeq) analysis using AIEC strain LF82 and NIEC strain HS during exponential and stationary growth. Differential expression analysis of coding sequences (CDS) homologous to both strains demonstrated 224 and 241 genes with increased and decreased expression, respectively, in LF82 relative to HS. Transition metal transport and siderophore metabolism related pathway genes were up-regulated, while glycogen metabolic and oxidation-reduction related pathway genes were down-regulated, in LF82. Chemotaxis related transcripts were up-regulated in LF82 during the exponential phase, but flagellum-dependent motility pathway genes were down-regulated in LF82 during the stationary phase. CDS that mapped only to the LF82 genome accounted for 747 genes. We applied an in silico subtractive genomics approach to identify CDS specific to AIEC by incorporating the genomes of 10 other previously phenotyped NIEC. From this analysis, 166 CDS mapped to the LF82  genome and lacked homology to any of the 11 human NIEC strains. We compared these CDS across 13 AIEC, but none were homologous in each. Four LF82 gene loci belonging to clustered regularly interspaced short palindromic repeats region (CRISPR)--CRISPR-associated (Cas) genes were identified in 4 to 6 AIEC and absent from all non-pathogenic bacteria. As previously reported, AIEC strains were enriched for pdu operon genes. One CDS, encoding an excisionase, was shared by 9  AIEC strains. 
Reverse transcription quantitative polymerase chain reaction assays for 6 genes were conducted on fecal and ileal RNA samples from 22 inflammatory bowel disease (IBD), and 32 patients without IBD (non-IBD). The expression of Cas loci was detected in a higher proportion of CD than non-IBD fecal and ileal RNA samples (p <0.05). These results support a comparative genomic/transcriptomic approach towards identifying candidate AIEC signature transcripts. 
+25177315	 Efficient microbial conversion of lignocellulosic hydrolysates to biofuels is a key barrier to the economically viable deployment of lignocellulosic biofuels. A  chief contributor to this barrier is the impact on microbial processes and energy metabolism of lignocellulose-derived inhibitors, including phenolic carboxylates, phenolic amides (for ammonia-pretreated biomass), phenolic aldehydes, and furfurals. To understand the bacterial pathways induced by inhibitors present in  ammonia-pretreated biomass hydrolysates, which are less well studied than acid-pretreated biomass hydrolysates, we developed and exploited synthetic mimics of ammonia-pretreated corn stover hydrolysate (ACSH). To determine regulatory responses to the inhibitors normally present in ACSH, we measured transcript and  protein levels in an Escherichia coli ethanologen using RNA-seq and quantitative  proteomics during fermentation to ethanol of synthetic hydrolysates containing or lacking the inhibitors. Our study identified four major regulators mediating these responses, the MarA/SoxS/Rob network, AaeR, FrmR, and YqhC. Induction of these regulons was correlated with a reduced rate of ethanol production, buildup  of pyruvate, depletion of ATP and NAD(P)H, and an inhibition of xylose conversion. The aromatic aldehyde inhibitor 5-hydroxymethylfurfural appeared to be reduced to its alcohol form by the ethanologen during fermentation, whereas phenolic acid and amide inhibitors were not metabolized. Together, our findings establish that the major regulatory responses to lignocellulose-derived inhibitors are mediated by transcriptional rather than translational regulators,  suggest that energy consumed for inhibitor efflux and detoxification may limit biofuel production, and identify a network of regulators for future synthetic biology efforts. 
+25085508	 BACKGROUND: Burkholderia pseudomallei is a facultative intracellular pathogen and the causative agent of melioidosis. A conserved type III secretion system (T3SS3) and type VI secretion system (T6SS1) are critical for intracellular survival and  growth. The T3SS3 and T6SS1 genes are coordinately and hierarchically regulated by a TetR-type regulator, BspR. A central transcriptional regulator of the BspR regulatory cascade, BsaN, activates a subset of T3SS3 and T6SS1 loci. RESULTS: To elucidate the scope of the BsaN regulon, we used RNAseq analysis to compare the transcriptomes of wild-type B. pseudomallei KHW and a bsaN deletion mutant. The 60 genes positively-regulated by BsaN include those that we had previously identified in addition to a polyketide biosynthesis locus and genes involved in amino acid biosynthesis. BsaN was also found to repress the transcription of 51 genes including flagellar motility loci and those encoding components of the T3SS3 apparatus. Using a promoter-lacZ fusion assay in E. coli, we show that BsaN together with the chaperone BicA directly control the expression of the T3SS3 translocon, effector and associated regulatory genes that are organized into at least five operons (BPSS1516-BPSS1552). Using a mutagenesis approach, a consensus regulatory motif in the promoter regions of BsaN-regulated  genes was shown to be essential for transcriptional activation. CONCLUSIONS: BsaN/BicA functions as a central regulator of key virulence clusters in B. pseudomallei within a more extensive network of genetic regulation. We propose that BsaN/BicA controls a gene expression program that facilitates the adaption and intracellular survival of the pathogen within eukaryotic hosts. 
+24927582	 The molecular mechanisms of ethanol toxicity and tolerance in bacteria, although  important for biotechnology and bioenergy applications, remain incompletely understood. Genetic studies have identified potential cellular targets for ethanol and have revealed multiple mechanisms of tolerance, but it remains difficult to separate the direct and indirect effects of ethanol. We used adaptive evolution to generate spontaneous ethanol-tolerant strains of Escherichia coli, and then characterized mechanisms of toxicity and resistance using genome-scale DNAseq, RNAseq, and ribosome profiling coupled with specific assays of ribosome and RNA polymerase function. Evolved alleles of metJ, rho, and rpsQ recapitulated most of the observed ethanol tolerance, implicating translation and transcription as key processes affected by ethanol. Ethanol induced miscoding errors during protein synthesis, from which the evolved rpsQ allele protected cells by increasing ribosome accuracy. Ribosome profiling and RNAseq analyses established that ethanol negatively affects transcriptional and translational processivity. Ethanol-stressed cells exhibited ribosomal stalling at internal AUG codons, which may be ameliorated by the adaptive inactivation of  the MetJ repressor of methionine biosynthesis genes. Ethanol also caused aberrant intragenic transcription termination for mRNAs with low ribosome density, which was reduced in a strain with the adaptive rho mutation. Furthermore, ethanol inhibited transcript elongation by RNA polymerase in vitro. We propose that ethanol-induced inhibition and uncoupling of mRNA and protein synthesis through direct effects on ribosomes and RNA polymerase conformations are major contributors to ethanol toxicity in E. coli, and that adaptive mutations in metJ, rho, and rpsQ help protect these central dogma processes in the presence of ethanol. 
+23203983	 The 20th annual Database Issue of Nucleic Acids Research includes 176 articles, half of which describe new online molecular biology databases and the other half  provide updates on the databases previously featured in NAR and other journals. This year's highlights include two databases of DNA repeat elements; several databases of transcriptional factors and transcriptional factor-binding sites; databases on various aspects of protein structure and protein-protein interactions; databases for metagenomic and rRNA sequence analysis; and four databases specifically dedicated to Escherichia coli. The increased emphasis on using the genome data to improve human health is reflected in the development of  the databases of genomic structural variation (NCBI's dbVar and EBI's DGVa), the  NIH Genetic Testing Registry and several other databases centered on the genetic  basis of human disease, potential drugs, their targets and the mechanisms of protein-ligand binding. Two new databases present genomic and RNAseq data for monkeys, providing wealth of data on our closest relatives for comparative genomics purposes. The NAR online Molecular Biology Database Collection, available at http://www.oxfordjournals.org/nar/database/a/, has been updated and  currently lists 1512 online databases. The full content of the Database Issue is  freely available online on the Nucleic Acids Research website (http://nar.oxfordjournals.org/). 
+22821568	 RNAsnap™ is a simple and novel method that recovers all intracellular RNA quantitatively (>99%), faster (<15 min) and less expensively (∼3 cents/sample) than any of the currently available RNA isolation methods. In fact, none of the bacterial RNA isolation methods, including the commercial kits, are effective in  recovering all species of intracellular RNAs (76-5700 nt) with equal efficiency,  which can lead to biased results in genome-wide studies involving microarray or RNAseq analysis. The RNAsnap™ procedure yields ∼60 µg of RNA from 10(8) Escherichia coli cells that can be used directly for northern analysis without any further purification. Based on a comparative analysis of specific transcripts ranging in size from 76 to 5700 nt, the RNAsnap™ method provided the most accurate measure of the relative amounts of the various intracellular RNAs. Furthermore, the RNAsnap™ RNA was successfully used in enzymatic reactions such as RNA ligation, reverse transcription, primer extension and reverse transcriptase-polymerase chain reaction, following sodium acetate/ethanol precipitation. The RNAsnap™ method can be used to isolate RNA from a wide range of Gram-negative and Gram-positive bacteria as well as yeast. 
+22689638	 Translational efficiency is controlled by tRNAs and other genome-encoded mechanisms. In organelles, translational processes are dramatically altered because of genome shrinkage and horizontal acquisition of gene products. The influence of genome reduction on translation in endosymbionts is largely unknown. Here, we investigate whether divergent lineages of Buchnera aphidicola, the reduced-genome bacterial endosymbiont of aphids, possess altered translational features compared with their free-living relative, Escherichia coli. Our RNAseq data support the hypothesis that translation is less optimal in Buchnera than in  E. coli. We observed a specific, convergent, pattern of tRNA loss in Buchnera and other endosymbionts that have undergone genome shrinkage. Furthermore, many modified nucleoside pathways that are important for E. coli translation are lost  in Buchnera. Additionally, Buchnera's A + T compositional bias has resulted in reduced tRNA thermostability, and may have altered aminoacyl-tRNA synthetase recognition sites. Buchnera tRNA genes are shorter than those of E. coli, as the  majority no longer has a genome-encoded 3' CCA; however, all the expressed, shortened tRNAs undergo 3' CCA maturation. Moreover, expression of tRNA isoacceptors was not correlated with the usage of corresponding codons. Overall,  our data suggest that endosymbiont genome evolution alters tRNA characteristics that are known to influence translational efficiency in their free-living relative. 
--- a/outRNAseq/useless.out 0 → 100644
View file @1e051ed
+++ b/outRNAseq/useless.out 0 → 100644
View file @1e051ed