Created
April 21, 2011 20:46
-
-
Save gsakkis/935425 to your computer and use it in GitHub Desktop.
automatic binning for integer data histograms
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from itertools import groupby | |
from operator import itemgetter | |
def bincount_histogram(nums, num_bins=sys.maxint, min_bin_count=1): | |
num_bins = max(num_bins, 1) | |
min_bin_count = max(min_bin_count, 1) | |
# initialize the bins of width=1 | |
bin_edges, counts = [], [] | |
for num, group in groupby(sorted(nums)): | |
bin_edges.append(num) | |
counts.append(sum(1 for _ in group)) | |
# append the rightmost edge | |
bin_edges.append(bin_edges[-1] + 1) | |
num_counts = len(counts) | |
while num_counts: | |
# find the least populated bin | |
j, min_count = min(enumerate(counts), key=itemgetter(1)) | |
# stop looping as soon as the bins have been reduced to `num_bins` | |
# (or less) and no bin contains less than `min_bin_count` | |
if num_counts <= num_bins and min_count >= min_bin_count: | |
break | |
# merge the min_count bin with a neighbor | |
if j == 0 or j < num_counts-1 and counts[j-1] > counts[j+1]: | |
# leftmost bin or with left neighbor bigger than right: | |
# merge into right neighbor | |
del bin_edges[j+1] | |
counts[j+1] += min_count | |
else: | |
# rightmost bin or with right neighbor bigger or equal than left: | |
# merge into left neighbor | |
del bin_edges[j] | |
counts[j-1] += min_count | |
del counts[j] | |
num_counts -= 1 | |
return counts, bin_edges | |
def validate_bincount_histogram(nums, num_bins=sys.maxint, min_bin_count=1): | |
assert all(isinstance(x,int) for x in nums) | |
assert len(nums) >= max(1, min_bin_count) | |
counts, bin_edges = bincount_histogram(nums, num_bins, min_bin_count) | |
assert len(bin_edges) == len(counts) + 1 | |
assert bin_edges == sorted(bin_edges) | |
assert sum(counts) == len(nums) | |
assert len(counts) <= num_bins | |
assert all(c>=min_bin_count for c in counts) | |
from numpy import histogram | |
counts2, bin_edges2 = histogram(nums, bins=bin_edges) | |
assert counts == list(counts2) | |
assert bin_edges == list(bin_edges2) | |
if __name__ == '__main__': | |
from numpy.random import randint | |
i = 0 | |
for _ in xrange(100): | |
for size in 10, 100: | |
for num_bins in sys.maxint, randint(1,size+1): | |
for min_bin_count in 1, randint(1,size+1): | |
for k in 0.1, 0.5, 1.0, 2.0, 10: | |
nums = randint(-k*size, k*size, size=size) | |
try: | |
validate_bincount_histogram(nums, num_bins, min_bin_count) | |
i += 1 | |
except AssertionError: | |
print "nums:", nums | |
print "num_bins:", num_bins | |
print "min_bin_count:", min_bin_count | |
raise | |
print '%d tests ran successfully' % i |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment