Skip to content

Instantly share code, notes, and snippets.

@wasade
Last active December 25, 2015 19:48
Show Gist options
  • Save wasade/7029970 to your computer and use it in GitHub Desktop.
Save wasade/7029970 to your computer and use it in GitHub Desktop.
Easy hash buckets. The specific use is in situations where you have a giant single dictionary (e.g., 10s of millions of keys). The motivation is to break up a giant hash table into smaller ones in order to reduce the amount of memory required.
#!/usr/bin/env python27
from random import shuffle
from unittest import TestCase, main
__author__ = "Daniel McDonald"
__credits__ = ["Daniel McDonald"]
__license__ = "CC0 1.0, http://creativecommons.org/publicdomain/zero/1.0/"
__maintainer__ = "Daniel McDonald"
__url__ = "https://gist.github.com/wasade/7029970"
__email__ = "[email protected]"
class DictBucket(dict):
"""Partition up hash space into buckets
keys - the keys that go into each bucket
n - the number of buckets
To use:
>>> d = DictBucket(range(17), 4)
>>> d[16]['asd'] = 10
>>> d
{0: {}, 1: {}, 2: {}, 3: {}, 4: {'asd': 10}, 5: {}, 6: {}, 7: {}, 8: {}, 9: {}, 10: {}, 11: {}, 12: {'asd': 10}, 13: {}, 14: {}, 15: {'asd': 10}, 16: {'asd': 10}}
"""
def __init__(self, keys, n, random_f=shuffle):
super(DictBucket, self).__setattr__('n',n)
super(DictBucket, self).__setattr__('_inner',
[dict() for i in range(n)])
if random_f is not None:
random_f(keys)
for i,k in enumerate(keys):
super(DictBucket, self).__setitem__(k, self._inner[i % n])
def __getitem__(self, key):
if key not in self:
raise KeyError("Unknown bucket key %s!" % key)
else:
return super(DictBucket, self).__getitem__(key)
def __setitem__(self, key, value):
raise TypeError("Bucket keys are immutable")
__delitem__ = __setitem__
class BucketTest(TestCase):
def test_n_buckets(self):
d = DictBucket(range(16), 4)
self.assertEqual(len(d.keys()), 16)
self.assertEqual(len(set(map(id, d.values()))), 4)
d = DictBucket(range(17), 4)
self.assertEqual(len(d.keys()), 17)
self.assertEqual(len(set(map(id, d.values()))), 4)
def test_immutability_of_container(self):
d = DictBucket(range(16), 4)
self.assertRaises(TypeError, d.__setitem__, 'a', 5)
def test_getitem(self):
import types
d = DictBucket(range(16), 4)
self.assertRaises(KeyError, d.__getitem__, 'a')
inner = d[0]
self.assertEqual(type(inner), types.DictType)
def test_punch(self):
# None -> don't shuffle keys
d = DictBucket(range(16), 2, None)
for i in range(16):
for j in range(100):
d[i][j * i] = True
# first bucket
for i in range(2,16)[::2]:
self.assertEqual(d[i-2], d[i])
# second bucket
for i in range(3, 16)[::2]:
self.assertEqual(d[i-2],d[i])
# assert the buckets are not the same
self.assertNotEqual(d[7],d[8])
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment