Created
June 6, 2011 10:46
-
-
Save GaelVaroquaux/1010064 to your computer and use it in GitHub Desktop.
Computing bins to have a somewhat equal partitioning of the data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
def _compute_bins(x, n_bins=10): | |
""" Find optimal bins from a univariate distribution | |
Parameters | |
=========== | |
x: 1D array-like | |
Samples | |
n_bins: integer | |
Number of bins to return. The algorithm will return at most | |
n_bins, but may return less. | |
Returns | |
======== | |
bins: 1D array, length: n_bins + 1 | |
The bounds of the bins: the first value is the lower bound of | |
the first bins, the second value is the upper bound of the | |
first bin and the lower bound of the second bin... the last | |
value is the upper bound of the last bin. | |
Notes | |
====== | |
This function tries to return bins defined by quantiles: divide | |
the sample set into bins with an equal number of observations. | |
However, if the sample has macroscopicaly occupated state (for | |
instance in the case of a sparse vector, many coefficients are | |
zero), the function will single these out. | |
""" | |
# Use quantiles as bins to equally divide the distribution | |
distribution = np.asarray(x).copy().astype(np.float) | |
distribution.sort() | |
eps = np.finfo(np.float).eps | |
bin_indices = np.linspace(0, (len(distribution)-1), | |
n_bins).astype(np.int) | |
bins = distribution[bin_indices] | |
unique_bins = np.unique(bins) | |
if not unique_bins.size == n_bins+1: | |
# some bins are equal, this can be the case for sparse | |
# distributions, for which many coefficients are near zero. In this | |
# case, we take new quantiles regularly-spaced in between sets of | |
# zero-sized bins | |
posterized_bins = np.searchsorted(unique_bins, bins) | |
edges = np.where(np.diff(posterized_bins, n=2))[0].tolist() | |
edges.insert(0, -1) | |
edges.append(n_bins-2) | |
new_indices = [0] | |
scale = (n_bins+1)/float(unique_bins.size) | |
for index_start, index_end in zip(edges[::2], edges[1::2]): | |
# We are specifying only one value for end of a bin and start | |
# of the next, remove the duplicated value | |
new_indices.pop() | |
new_indices.extend(np.linspace(bin_indices[index_start+1], | |
bin_indices[index_end+1], | |
scale*(index_end - index_start +1))) | |
new_indices[-1] = -1 | |
new_indices = np.array(new_indices, dtype=np.int) | |
bins = distribution[new_indices] | |
# Finally clean up left over duplicated bins, rather than | |
# iterating on this process: | |
bins = np.unique(bins) | |
# If we are still low on bins (because there are very few | |
# unique numbers in our observations), create some more by | |
# subdividing bins | |
if bins.size < .5 * (n_bins + 1): | |
bins = np.r_[bins, .5*(bins[1:] + bins[:-1])] | |
bins.sort() | |
# Avoid aliasing the bins with the data | |
bins += 10*eps | |
bins[-1] = distribution[-1] + .01*max(-distribution[0], distribution[-1]) | |
bins[0] = distribution[0] - eps | |
return bins |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment