iarroyof

Final version for abstracts

Model-selection script: vectorizes the abstracts with TF-IDF and grid-searches
several scikit-learn classifiers.

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import (ExtraTreesClassifier, RandomForestClassifier,
                              AdaBoostClassifier, GradientBoostingClassifier)
import joblib
import pandas as pd
from numpy import mean, std

class EstimatorSelectionHelper:
    """Grid-search several models and summarize their scores.

    Adapted from http://www.codiply.com/blog/hyperparameter-grid-search-across-multiple-models-in-scikit-learn/
    """
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s"
                             % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}
        self.best_estimator_ = None

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=True):
        # refit=True so each grid search keeps a fitted best_estimator_.
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        # One row per parameter setting, built from each search's cv_results_.
        rows = []
        for key, gs in self.grid_searches.items():
            results = gs.cv_results_
            for i, params in enumerate(results['params']):
                scores = [results['split%d_test_score' % s][i]
                          for s in range(gs.n_splits_)]
                d = {'estimator': key,
                     'min_score': min(scores),
                     'max_score': max(scores),
                     'mean_score': mean(scores),
                     'std_score': std(scores)}
                d.update(params)
                rows.append(pd.Series(d))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)
        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]
        # Keep the overall winner so it can be serialized after the summary.
        best_key = max(self.grid_searches,
                       key=lambda k: self.grid_searches[k].best_score_)
        self.best_estimator_ = self.grid_searches[best_key].best_estimator_
        return df[columns]

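A minimal smoke test of the helper on random data; toy_models, toy_params and
rng are illustrative names, not part of the repository:

from numpy.random import RandomState
rng = RandomState(0)
toy_models = {'RandomForestClassifier': RandomForestClassifier()}
toy_params = {'RandomForestClassifier': {'n_estimators': [8, 16]}}
helper = EstimatorSelectionHelper(toy_models, toy_params)
helper.fit(rng.rand(40, 5), rng.randint(0, 2, 40), scoring='f1', n_jobs=1)
print(helper.score_summary(sort_by='mean_score').head())
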
def get_abstracts(file_name, label):
    """Parse a plain-text PubMed export into a list of abstract records."""
    extract = {}
    docs = []
    empties = []       # indices of the blank lines seen so far in this record
    copyright = False  # the current record ends with a copyright notice

    with open(file_name) as f:
        lines = f.readlines()

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue
        elif ' doi: ' in ln:
            # The title starts right after the first blank line that follows
            # the journal/doi header.
            for j in range(i, i + 10):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue

        elif 'Copyright ' in ln:
            copyright = True

        elif 'DOI: ' in ln:
            # The DOI block closes a record; recover the PMID first.
            if 'PMCID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
            elif 'PMID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 1].strip().split()[1])

            # The body is the text between the two preceding blank lines; a
            # copyright notice pushes it one block further back.
            if copyright:
                get = slice(empties[-3], empties[-2])
                copyright = False
            else:
                get = slice(empties[-2], empties[-1])

            extract['body'] = " ".join(" ".join(lines[get]).split())
            title = []
            for j in range(title_idx, title_idx + 5):
                if lines[j].strip():
                    title.append(lines[j])
                else:
                    break
            extract['title'] = " ".join(" ".join(title).split())
            extract['topic'] = label
            docs.append(extract)
            empties = []
            extract = {}

    return docs

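For reference, a sketch of the record layout this parser assumes, inferred
from the conditions above rather than copied from the data files:

1. J Example. 2017 Jan 5;1(1):1-10. doi: 10.0000/example.0001.

Title of the abstract, possibly
wrapped over a few lines.

Author A, Author B.

Body of the abstract as one block of text between blank lines.

Copyright notice (optional).

DOI: 10.0000/example.0001
PMCID: PMC0000000
PMID: 00000000
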
filename = "data/ecoli_abstracts/not_useful_abstracts.txt"
labels = ['useless', 'useful']

abstracts = get_abstracts(file_name=filename, label=labels[0])

filename = "data/ecoli_abstracts/useful_abstracts.txt"

abstracts += get_abstracts(file_name=filename, label=labels[1])

X = [x['body'] for x in abstracts]
y = [1 if x['topic'] == 'useful' else 0 for x in abstracts]

models1 = {
    'ExtraTreesClassifier': ExtraTreesClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'AdaBoostClassifier': AdaBoostClassifier(),
    'GradientBoostingClassifier': GradientBoostingClassifier(),
    'SVC': SVC()
}

params1 = {
    'ExtraTreesClassifier': {'n_estimators': [16, 32]},
    'RandomForestClassifier': {'n_estimators': [16, 32]},
    'AdaBoostClassifier': {'n_estimators': [16, 32]},
    'GradientBoostingClassifier': {'n_estimators': [16, 32],
                                   'learning_rate': [0.8, 1.0]},
    # A list of dicts makes GridSearchCV explore each kernel only with the
    # hyperparameters that apply to it.
    'SVC': [
        {'kernel': ['rbf'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
         'gamma': [0.1, 0.01, 0.001, 0.0001, 0.00001]},
        {'kernel': ['poly'], 'C': [1, 10, 100, 150, 200, 300, 350, 400],
         'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 20, 23, 26],
         'coef0': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
    ]
}

clf = EstimatorSelectionHelper(models1, params1)

vectorizer = TfidfVectorizer(binary=True)
#vectorizer = TfidfVectorizer(binary=True, ngram_range=(1, 3))
#vectorizer = HashingVectorizer(non_negative=True)
print(vectorizer)

X = vectorizer.fit_transform(X)

# Earlier experiments kept for reference: dimensionality reduction,
# univariate feature selection, and a held-out split.
#svd = TruncatedSVD(n_components=200, random_state=42, n_iter=20)
#X = svd.fit_transform(X)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
#                                                    random_state=42)
#from sklearn.feature_selection import chi2, SelectKBest
#ch2 = SelectKBest(chi2, k=200)
#X_train = ch2.fit_transform(X_train, y_train)
#X_test = ch2.transform(X_test)
#clf = MultinomialNB(alpha=.01)

clf.fit(X, y, scoring='f1', n_jobs=-1)

#pred = clf.predict(X_test)
#print(metrics.f1_score(y_test, pred, average='macro'))
print(clf.score_summary(sort_by='min_score'))

joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
joblib.dump(vectorizer, 'model/tfidf_model.pkl')
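
A hedged sketch of reloading the two serialized artifacts to classify a new
abstract; the example string is illustrative:

import joblib
clf = joblib.load('model/svm_model.pkl')
vectorizer = joblib.load('model/tfidf_model.pkl')
doc = ["ChIP-exo experiments in Escherichia coli were analyzed ..."]
print(clf.predict(vectorizer.transform(doc)))  # 1 = useful, 0 = useless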

Command-line filter script: trains the SVC from two labeled abstract files, or
splits an input file into useful/useless abstracts with a pretrained model.

from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.svm import SVC
import argparse
import csv
import os
import joblib
from time import time

def get_abstracts(file_name, label):
    """Parse a plain-text PubMed export into a list of abstract records."""
    extract = {}
    docs = []
    empties = []       # indices of the blank lines seen so far in this record
    copyright = False  # the current record ends with a copyright notice

    with open(file_name) as f:
        lines = f.readlines()

    for i, ln in enumerate(lines):
        if not ln.strip():
            empties.append(i)
            continue
        elif ' doi: ' in ln:
            # The title starts right after the first blank line that follows
            # the journal/doi header.
            for j in range(i, i + 10):
                if not lines[j].strip():
                    title_idx = j + 1
                    break
            continue

        elif 'Copyright ' in ln or 'Publish' in ln or u'\N{COPYRIGHT SIGN}' in ln:
            copyright = True

        elif 'DOI: ' in ln:
            # The DOI block closes a record; recover the PMID first.
            if 'PMCID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 2].strip().split()[1])
            elif 'PMID: ' in lines[i + 1]:
                extract['pmid'] = int(lines[i + 1].strip().split()[1])

            # The body is the text between the two preceding blank lines; a
            # copyright notice pushes it one block further back.
            if copyright:
                get = slice(empties[-3], empties[-2])
                copyright = False
            else:
                get = slice(empties[-2], empties[-1])

            extract['body'] = " ".join(" ".join(lines[get]).split())
            title = []
            for j in range(title_idx, title_idx + 5):
                if lines[j].strip():
                    title.append(lines[j])
                else:
                    break
            extract['title'] = " ".join(" ".join(title).split())
            extract['topic'] = label
            docs.append(extract)
            empties = []
            extract = {}

    return docs

parser = argparse.ArgumentParser(
    description="This script separates abstracts of biomedical papers that "
                "report data from biomedical experiments from those that do not.")
parser.add_argument("--input", help="Input file containing the abstracts to "
                    "be predicted.")
parser.add_argument("--classA", help="Input file containing the abstracts of "
                    "class A to be learned.")
parser.add_argument("--classB", help="Input file containing the abstracts of "
                    "class B to be learned.")
parser.add_argument("--out", help="Path to the output directory "
                    "(default='./filter_output')", default="filter_output")
parser.add_argument("--svcmodel", help="Path to a custom pretrained SVC model "
                    "(default='./model/svm_model.pkl')", default="model/svm_model.pkl")

args = parser.parse_args()

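Hedged usage examples; the script name filter_abstracts.py is an assumption,
since the diff does not show the file name:

# Training mode: fit the SVC grid from two labeled abstract files.
python filter_abstracts.py --classA data/ecoli_abstracts/not_useful_abstracts.txt \
                           --classB data/ecoli_abstracts/useful_abstracts.txt

# Prediction mode: split new abstracts into useless.out and useful.out.
python filter_abstracts.py --input new_abstracts.txt --out filter_output
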
labels = {0: 'useless', 1: 'useful'}
vectorizer = TfidfVectorizer(binary=True)
print(vectorizer)

if args.classA and args.classB and not args.input:
    # Training mode: read the hyperparameter grid from model_params.conf,
    # one CSV column per SVC parameter.
    with open("model_params.conf") as f0:
        params = [p for p in csv.DictReader(f0)]
    names = list(params[0].keys())
    model_params = {n: [] for n in names}

    for n in names:
        for d in params:
            for k in d:
                if k == n:
                    try:
                        model_params[n].append(float(d[k]))
                    except ValueError:
                        model_params[n].append(d[k])

    # Deduplicate each parameter's candidate values.
    model_params = {k: list(set(model_params[k])) for k in model_params}
    abstracts = get_abstracts(file_name=args.classA, label=labels[0])
    abstracts += get_abstracts(file_name=args.classB, label=labels[1])

    tfidf_model = vectorizer.fit([x['body'] for x in abstracts])
    X = vectorizer.transform([x['body'] for x in abstracts])
    y = [0 if x['topic'] == 'useless' else 1 for x in abstracts]

    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
    #                                                    random_state=42)

    clf = GridSearchCV(SVC(), cv=3, param_grid=model_params,
                       n_jobs=-1, scoring='f1')
    #clf = RandomizedSearchCV(SVC(), param_distributions=model_params, cv=5,
    #                         n_iter=10, n_jobs=-1, scoring='f1')
    start = time()
    clf.fit(X, y)

    print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
          % (time() - start, len(clf.cv_results_['params'])))

    print(clf.best_estimator_)
    print()
    print(clf.best_score_)
    #print(metrics.f1_score(clf.predict(X_test), y_test))

    joblib.dump(clf.best_estimator_, 'model/svm_model.pkl')
    joblib.dump(tfidf_model, 'model/tfidf_model.pkl')

else:
    # Prediction mode: load the pretrained model and vectorizer, then route
    # each abstract to a per-class output file.
    clf = joblib.load(args.svcmodel)
    vectorizer = joblib.load('model/tfidf_model.pkl')
    abstracts = get_abstracts(file_name=args.input, label='unknown')
    X = vectorizer.transform([x['body'] for x in abstracts])
    classes = clf.predict(X)

    if not os.path.exists(args.out):
        os.makedirs(args.out)
    with open(args.out + "/" + labels[0] + ".out", 'w') as f0, \
         open(args.out + "/" + labels[1] + ".out", 'w') as f1:
        for c, a in zip(classes, abstracts):
            if c == 0:
                f0.write("%d\t%s\n" % (a['pmid'], a['body']))
            elif c == 1:
                f1.write("%d\t%s\n" % (a['pmid'], a['body']))

Hyperparameter grid read by the training branch above (model_params.conf):

kernel,degree,coef0,C,gamma
linear,1,0.5,100,0.0
linear,1,0.5,10,0.0
linear,1,0.5,50,0.0
linear,1,0.5,100,0.0
linear,1,0.5,5,0.0
linear,1,0.5,150,0.0
linear,1,0.5,200,0.0
linear,1,0.5,300,0.0
linear,1,0.5,400,0.0
linear,1,0.5,1.0,0.0
linear,1,0.5,5.0,0.0
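
A sketch of the grid the CSV parsing yields from these rows (floats come from
the float() conversion; set order is unspecified, values sorted here for
readability):

model_params = {
    'kernel': ['linear'],
    'degree': [1.0],
    'coef0': [0.5],
    'C': [1.0, 5.0, 10.0, 50.0, 100.0, 150.0, 200.0, 300.0, 400.0],
    'gamma': [0.0],
}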

Logged output of the model-selection script:

TfidfVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)
Running GridSearchCV for GradientBoostingClassifier.
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Running GridSearchCV for AdaBoostClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for ExtraTreesClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for SVC.
Fitting 3 folds for each of 63 candidates, totalling 189 fits
Running GridSearchCV for RandomForestClassifier.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
    estimator min_score mean_score max_score std_score \
36 SVC 0.69697 0.702911 0.705882 0.00420147
66 SVC 0.69697 0.702911 0.705882 0.00420147
35 SVC 0.69697 0.702911 0.705882 0.00420147
37 SVC 0.69697 0.702911 0.705882 0.00420147
38 SVC 0.69697 0.702911 0.705882 0.00420147
39 SVC 0.69697 0.702911 0.705882 0.00420147
40 SVC 0.69697 0.702911 0.705882 0.00420147
41 SVC 0.69697 0.702911 0.705882 0.00420147
42 SVC 0.69697 0.702911 0.705882 0.00420147
43 SVC 0.69697 0.702911 0.705882 0.00420147
44 SVC 0.69697 0.702911 0.705882 0.00420147
45 SVC 0.69697 0.702911 0.705882 0.00420147
46 SVC 0.69697 0.702911 0.705882 0.00420147
47 SVC 0.69697 0.702911 0.705882 0.00420147
48 SVC 0.69697 0.702911 0.705882 0.00420147
49 SVC 0.69697 0.702911 0.705882 0.00420147
50 SVC 0.69697 0.702911 0.705882 0.00420147
51 SVC 0.69697 0.702911 0.705882 0.00420147
52 SVC 0.69697 0.702911 0.705882 0.00420147
53 SVC 0.69697 0.702911 0.705882 0.00420147
54 SVC 0.69697 0.702911 0.705882 0.00420147
55 SVC 0.69697 0.702911 0.705882 0.00420147
56 SVC 0.69697 0.702911 0.705882 0.00420147
57 SVC 0.69697 0.702911 0.705882 0.00420147
58 SVC 0.69697 0.702911 0.705882 0.00420147
59 SVC 0.69697 0.702911 0.705882 0.00420147
60 SVC 0.69697 0.702911 0.705882 0.00420147
61 SVC 0.69697 0.702911 0.705882 0.00420147
62 SVC 0.69697 0.702911 0.705882 0.00420147
63 SVC 0.69697 0.702911 0.705882 0.00420147
.. ... ... ... ... ...
12 SVC 0.69697 0.702911 0.705882 0.00420147
13 SVC 0.69697 0.702911 0.705882 0.00420147
14 SVC 0.69697 0.702911 0.705882 0.00420147
15 SVC 0.69697 0.702911 0.705882 0.00420147
16 SVC 0.69697 0.702911 0.705882 0.00420147
17 SVC 0.69697 0.702911 0.705882 0.00420147
26 SVC 0.69697 0.702911 0.705882 0.00420147
25 SVC 0.69697 0.702911 0.705882 0.00420147
30 SVC 0.69697 0.702911 0.705882 0.00420147
29 SVC 0.69697 0.702911 0.705882 0.00420147
28 SVC 0.69697 0.702911 0.705882 0.00420147
27 SVC 0.69697 0.702911 0.705882 0.00420147
19 SVC 0.69697 0.702911 0.705882 0.00420147
65 SVC 0.69697 0.702911 0.705882 0.00420147
24 SVC 0.69697 0.702911 0.705882 0.00420147
23 SVC 0.69697 0.702911 0.705882 0.00420147
22 SVC 0.69697 0.702911 0.705882 0.00420147
21 SVC 0.69697 0.702911 0.705882 0.00420147
18 SVC 0.686567 0.693502 0.69697 0.0049038
20 SVC 0.676923 0.691047 0.707692 0.0126874
7 ExtraTreesClassifier 0.619048 0.662524 0.688525 0.0309388
6 ExtraTreesClassifier 0.588235 0.611627 0.655738 0.0312098
1 GradientBoostingClassifier 0.577778 0.595982 0.610169 0.0135256
0 GradientBoostingClassifier 0.5 0.549894 0.596491 0.0394613
71 RandomForestClassifier 0.470588 0.557789 0.625 0.0646035
3 GradientBoostingClassifier 0.454545 0.548927 0.596491 0.0667386
2 GradientBoostingClassifier 0.439024 0.588593 0.701754 0.110305
5 AdaBoostClassifier 0.411765 0.489657 0.618182 0.0915596
4 AdaBoostClassifier 0.4 0.54013 0.655172 0.105673
72 RandomForestClassifier 0.380952 0.504177 0.631579 0.10236

    C degree gamma kernel learning_rate n_estimators
36 100 6 NaN poly NaN NaN
66 200 NaN 0.0001 sigmoid NaN NaN
35 100 5 NaN poly NaN NaN
37 150 2 NaN poly NaN NaN
38 150 3 NaN poly NaN NaN
39 150 4 NaN poly NaN NaN
40 150 5 NaN poly NaN NaN
41 150 6 NaN poly NaN NaN
42 200 2 NaN poly NaN NaN
43 200 3 NaN poly NaN NaN
44 200 4 NaN poly NaN NaN
45 200 5 NaN poly NaN NaN
46 200 6 NaN poly NaN NaN
47 300 2 NaN poly NaN NaN
48 300 3 NaN poly NaN NaN
49 300 4 NaN poly NaN NaN
50 300 5 NaN poly NaN NaN
51 300 6 NaN poly NaN NaN
52 400 2 NaN poly NaN NaN
53 400 3 NaN poly NaN NaN
54 400 4 NaN poly NaN NaN
55 400 5 NaN poly NaN NaN
56 400 6 NaN poly NaN NaN
57 1 NaN 0.001 sigmoid NaN NaN
58 1 NaN 0.0001 sigmoid NaN NaN
59 10 NaN 0.001 sigmoid NaN NaN
60 10 NaN 0.0001 sigmoid NaN NaN
61 100 NaN 0.001 sigmoid NaN NaN
62 100 NaN 0.0001 sigmoid NaN NaN
63 150 NaN 0.001 sigmoid NaN NaN
.. ... ... ... ... ... ...
12 100 NaN 0.001 rbf NaN NaN
13 100 NaN 0.0001 rbf NaN NaN
14 150 NaN 0.001 rbf NaN NaN
15 150 NaN 0.0001 rbf NaN NaN
16 200 NaN 0.001 rbf NaN NaN
17 200 NaN 0.0001 rbf NaN NaN
26 1 6 NaN poly NaN NaN
25 1 5 NaN poly NaN NaN
30 10 5 NaN poly NaN NaN
29 10 4 NaN poly NaN NaN
28 10 3 NaN poly NaN NaN
27 10 2 NaN poly NaN NaN
19 300 NaN 0.0001 rbf NaN NaN
65 200 NaN 0.001 sigmoid NaN NaN
24 1 4 NaN poly NaN NaN
23 1 3 NaN poly NaN NaN
22 1 2 NaN poly NaN NaN
21 400 NaN 0.0001 rbf NaN NaN
18 300 NaN 0.001 rbf NaN NaN
20 400 NaN 0.001 rbf NaN NaN
7 NaN NaN NaN NaN NaN 32
6 NaN NaN NaN NaN NaN 16
1 NaN NaN NaN NaN 0.8 32
0 NaN NaN NaN NaN 0.8 16
71 NaN NaN NaN NaN NaN 16
3 NaN NaN NaN NaN 1 32
2 NaN NaN NaN NaN 1 16
5 NaN NaN NaN NaN NaN 32
4 NaN NaN NaN NaN NaN 16
72 NaN NaN NaN NaN NaN 32

[73 rows x 11 columns]

Sample line from a prediction output file (PMID, then a tab, then the abstract body):

28911122	ChIP-exo/nexus experiments rely on innovative modifications of the commonly used ChIP-seq protocol for high resolution mapping of transcription factor binding sites. Although many aspects of the ChIP-exo data analysis are similar to those of ChIP-seq, these high throughput experiments pose a number of unique quality control and analysis challenges. We develop a novel statistical quality control pipeline and accompanying R/Bioconductor package, ChIPexoQual, to enable exploration and analysis of ChIP-exo and related experiments. ChIPexoQual evaluates a number of key issues including strand imbalance, library complexity, and signal enrichment of data. Assessment of these features are facilitated through diagnostic plots and summary statistics computed over regions of the genome with varying levels of coverage. We evaluated our QC pipeline with both large collections of public ChIP-exo/nexus data and multiple, new ChIP-exo datasets from Escherichia coli. ChIPexoQual analysis of these datasets resulted in guidelines for using these QC metrics across a wide range of sequencing depths and provided further insights for modelling ChIP-exo data.