Created
September 27, 2012 15:45
-
-
Save andrewgiessel/3794727 to your computer and use it in GitHub Desktop.
bayesian block algorithm for finding the optimal number of histogram bins
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def bayesian_blocks(t): | |
"""Bayesian Blocks Implementation | |
By Jake Vanderplas. License: BSD | |
Based on algorithm outlined in http://adsabs.harvard.edu/abs/2012arXiv1207.5578S | |
Parameters | |
---------- | |
t : ndarray, length N | |
data to be histogrammed | |
Returns | |
------- | |
bins : ndarray | |
array containing the (N+1) bin edges | |
Notes | |
----- | |
This is an incomplete implementation: it may fail for some | |
datasets. Alternate fitness functions and prior forms can | |
be found in the paper listed above. | |
""" | |
# copy and sort the array | |
t = np.sort(t) | |
N = t.size | |
# create length-(N + 1) array of cell edges | |
edges = np.concatenate([t[:1], | |
0.5 * (t[1:] + t[:-1]), | |
t[-1:]]) | |
block_length = t[-1] - edges | |
# arrays needed for the iteration | |
nn_vec = np.ones(N) | |
best = np.zeros(N, dtype=float) | |
last = np.zeros(N, dtype=int) | |
#----------------------------------------------------------------- | |
# Start with first data cell; add one cell at each iteration | |
#----------------------------------------------------------------- | |
for K in range(N): | |
# Compute the width and count of the final bin for all possible | |
# locations of the K^th changepoint | |
width = block_length[:K + 1] - block_length[K + 1] | |
count_vec = np.cumsum(nn_vec[:K + 1][::-1])[::-1] | |
# evaluate fitness function for these possibilities | |
fit_vec = count_vec * (np.log(count_vec) - np.log(width)) | |
fit_vec -= 4 # 4 comes from the prior on the number of changepoints | |
fit_vec[1:] += best[:K] | |
# find the max of the fitness: this is the K^th changepoint | |
i_max = np.argmax(fit_vec) | |
last[K] = i_max | |
best[K] = fit_vec[i_max] | |
#----------------------------------------------------------------- | |
# Recover changepoints by iteratively peeling off the last block | |
#----------------------------------------------------------------- | |
change_points = np.zeros(N, dtype=int) | |
i_cp = N | |
ind = N | |
while True: | |
i_cp -= 1 | |
change_points[i_cp] = ind | |
if ind == 0: | |
break | |
ind = last[ind - 1] | |
change_points = change_points[i_cp:] | |
return edges[change_points] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# plot a standard histogram in the background, with alpha transparency | |
H1 = hist(x, bins=200, histtype='stepfilled', | |
alpha=0.2, normed=True) | |
# plot an adaptive-width histogram on top | |
H2 = hist(x, bins=bayesian_blocks(x), color='black', | |
histtype='step', normed=True) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment