Created
July 25, 2018 19:20
-
-
Save adambard/243da9241946da8db0d904dbddd3fe70 to your computer and use it in GitHub Desktop.
A Django HLLField (HyperLogLog)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Presented without warranty, but seems to work ok | |
from django.db import models | |
# From https://github.com/ascv/HyperLogLog | |
from HLL import HyperLogLog | |
def init_hll(m, seed, bytes=None):
    """Build a ``HyperLogLog`` and optionally restore previously saved state.

    Args:
        m: First positional argument handed to ``HyperLogLog``.
        seed: Second positional argument handed to ``HyperLogLog``.
        bytes: Optional register data. When given, it is loaded into the new
            counter through ``set_registers``; when ``None`` the counter is
            returned as constructed.

    Returns:
        The newly created ``HyperLogLog`` instance.
    """
    counter = HyperLogLog(m, seed)
    if bytes is None:
        # Nothing to restore: hand back the freshly constructed counter.
        return counter
    counter.set_registers(bytes)
    return counter
class HLLField(models.BinaryField):
    """
    Use a HyperLogLog for efficient counting, storing its state in a binary field.

    On the Python side the field value is a ``HyperLogLog`` instance; on the
    database side it is whatever ``HyperLogLog.registers()`` returns, stored
    through the parent ``BinaryField``.

    Args:
        register_exponent: First argument used when rebuilding the
            ``HyperLogLog`` from stored data.
        seed: Second argument used when rebuilding the ``HyperLogLog``.
        *args, **kwargs: Forwarded unchanged to ``models.BinaryField``.
    """

    # Single source of truth for the defaults, shared by __init__ and
    # deconstruct so the two cannot drift apart.
    DEFAULT_REGISTER_EXPONENT = 5
    DEFAULT_SEED = 314

    def __init__(self, register_exponent=DEFAULT_REGISTER_EXPONENT,
                 seed=DEFAULT_SEED, *args, **kwargs):
        self.register_exponent = register_exponent
        self.seed = seed
        super(HLLField, self).__init__(*args, **kwargs)

    def deconstruct(self):
        """Return the migration description, adding only non-default options."""
        name, path, args, kwargs = super(HLLField, self).deconstruct()
        if self.seed != self.DEFAULT_SEED:
            kwargs['seed'] = self.seed
        if self.register_exponent != self.DEFAULT_REGISTER_EXPONENT:
            kwargs['register_exponent'] = self.register_exponent
        return name, path, args, kwargs

    def _hll_from_raw(self, raw):
        """Rebuild a ``HyperLogLog`` from bytes-like register data."""
        if not isinstance(raw, bytearray):
            raw = bytearray(raw)
        return init_hll(self.register_exponent, self.seed, raw)

    def from_db_value(self, value, expression, connection, context=None):
        """Convert a value loaded from the database into a ``HyperLogLog``.

        ``context`` is optional so the method works both with Django versions
        that still pass it and with versions that call the hook with only
        ``(value, expression, connection)``.
        """
        if value is None:
            return value
        return self._hll_from_raw(value)

    def to_python(self, value):
        """Coerce ``value`` to a ``HyperLogLog`` (``None`` passes through)."""
        if value is None or isinstance(value, HyperLogLog):
            return value
        # Let BinaryField normalise the input first (e.g. decode a serialized
        # string); converting to bytearray beforehand fails for str input.
        raw = super(HLLField, self).to_python(value)
        return self._hll_from_raw(raw)

    def get_prep_value(self, value):
        """Return the register data to store for ``value``.

        Only ``HyperLogLog`` instances are unwrapped; ``None`` and raw
        bytes-like values are handed to ``BinaryField`` untouched instead of
        failing on a missing ``registers`` attribute.
        """
        if isinstance(value, HyperLogLog):
            value = value.registers()
        return super(HLLField, self).get_prep_value(value)

    def value_to_string(self, value):
        """Return a short textual summary of the field's counter.

        Django calls this hook with the model instance, so the counter is
        looked up with ``value_from_object``; a ``HyperLogLog`` passed in
        directly is still accepted.
        """
        if isinstance(value, HyperLogLog):
            hll = value
        else:
            hll = self.value_from_object(value)
        # NOTE(review): this is a human-readable summary, not the register
        # data, so it presumably cannot be loaded back by a deserializer -
        # confirm whether fixtures need to round-trip.
        return "HLL(cardinality={})".format(hll.cardinality())
# Usage | |
class MyModel(models.Model):
    # register_exponent ranges from 2 to 16; the hll gets
    # 2**register_exponent registers (at most 2**16).
    # More registers = slower, but more accurate.
    # seed is just a random number for Murmur; pick something and don't
    # change it.
    hll = HLLField(register_exponent=8, seed=123)
# Create a row with a fresh counter and feed it some values.
m = MyModel(hll=init_hll(8, 123))
for phrase in (
    "Something to count",
    "Something else to count",
    "I guess we're counting sentences now?",
    "Duplicates won't be double-counted",
    "Duplicates won't be double-counted",
):
    m.hll.add(phrase)
m.save()  # HLL is serialized and saved

# Load it back; the field value is a HyperLogLog again.
m2 = MyModel.objects.get()
m2.hll.cardinality()  # 4.0xxx or so

# A second, unsaved counter sharing one value with the first.
m3 = MyModel(hll=init_hll(8, 123))
for phrase in (
    "a",
    "b",
    "Duplicates won't be double-counted",
    "Duplicates won't be double-counted",
):
    m3.hll.add(phrase)

# Merging folds m3's counter into m2's.
m2.hll.merge(m3.hll)
m2.hll.cardinality()  # 6.0xx or so
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment