anatolebeuzon · February 13, 2022 11:43
diff --git a/README.md b/README.md
diff --git a/go.mod b/go.mod
 module github.com/DataDog/experimental/users/anatole.beuzon/large-card-hll

 go 1.17

 require github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316

 require (
 	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 // indirect
 	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 // indirect
 )
diff --git a/go.sum b/go.sum
 github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316 h1:CTR6ylcEcfvmr6xAJwLfKfud8EtIe5eBrVh5ClYj5ow=
 github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316/go.mod h1:hFPkswc42pKhRbeKDKXy05mRi7J1kJ2vMNbvd9erH0M=
 github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 h1:EbzDX8HPk5uE2FsJYxD74QmMw0/3CqSKhEr6teh0ncQ=
 github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49/go.mod h1:SvsjzyJlSg0rKsqYgdcFxeEVflx3ZNAyFfkUHP0TxXg=
 github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 h1:1/c0u68+2LRI+XSpduQpV9BnKx1k1P6GTb3MVxCE3w4=
 github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724/go.mod h1:pTiKQhUCcxt2eQMAnv48oc5nAsmelPm573z44h6PSXc=
diff --git a/main.go b/main.go
 package main

 import (
 	"flag"
 	"fmt"
 	"math"
 	"math/rand"
 	"os"
 	"runtime"
 	"sync"

 	hll "github.com/DataDog/hyperloglog"
 )

 // main feeds `-card` randomly generated values into HyperLogLog counters and
 // prints the resulting cardinality estimate with and without the large range
 // correction, each followed by its relative error against the target.
 //
 // The work is spread over runtime.GOMAXPROCS(0) goroutines, each filling its
 // own counter; the per-goroutine counters are then merged into a final one.
 func main() {
 	var target int
 	flag.IntVar(&target, "card", 0, "target cardinality")
 	flag.Parse()
 	// A missing, zero or negative cardinality cannot be simulated: print the
 	// flag help and exit with a failure status.
 	if target <= 0 {
 		flag.PrintDefaults()
 		os.Exit(1)
 	}

 	// Map
 	procs := runtime.GOMAXPROCS(0)
 	var wg sync.WaitGroup
 	counters := make([]*hll.HyperLogLog, procs)
 	// Split target evenly across workers; the first `extra` workers take one
 	// more value so that exactly `target` values are generated in total
 	// instead of silently dropping the remainder of the division.
 	share, extra := target/procs, target%procs
 	for i := 0; i < procs; i++ {
 		n := share
 		if i < extra {
 			n++
 		}
 		wg.Add(1)
 		go func(i, n int) {
 			defer wg.Done()
 			// Each goroutine writes only to its own slot. Only worker 0
 			// reports progress.
 			counters[i] = genCounts(n, i, i == 0)
 		}(i, n)
 	}
 	wg.Wait()

 	// Reduce
 	finalCounter := newCounter()
 	for _, c := range counters {
 		if err := finalCounter.Merge(c); err != nil {
 			panic(err)
 		}
 	}

 	resWithoutCorrection := count(finalCounter, false)
 	resWithCorrection := count(finalCounter, true)
 	fmt.Printf("Without correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithoutCorrection, pctDist(target, resWithoutCorrection))
 	fmt.Printf("With correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithCorrection, pctDist(target, resWithCorrection))
 }

 // newCounter builds an empty HyperLogLog counter with 2^10 registers. It
 // panics if the library returns an error.
 func newCounter() *hll.HyperLogLog {
 	const registers = uint(1) << 10 // 2^10 registers
 	counter, err := hll.New(registers)
 	if err == nil {
 		return counter
 	}
 	panic(err)
 }

 // genCounts adds n pseudo-random numbers, drawn from a generator seeded with
 // seed, to a fresh counter and returns that counter. When printProgress is
 // true it also prints a completion percentage to stdout as it goes.
 func genCounts(n int, seed int, printProgress bool) *hll.HyperLogLog {
 	r := rand.New(rand.NewSource(int64(seed)))
 	c := newCounter()

 	// Number of iterations between two progress updates. It is clamped to 1
 	// because n/100 is 0 whenever n < 100, and `i % 0` panics with an integer
 	// divide by zero.
 	step := n / 100
 	if step < 1 {
 		step = 1
 	}

 	for i := 0; i < n; i++ {
 		if printProgress && i%step == 0 {
 			// "\r" rewinds the cursor so each update overwrites the last one.
 			fmt.Printf("%d%%\r", 100*i/n)
 		}

 		// Inputs MUST be 64 bits wide so that the input domain of the hash
 		// function (2**64 possibilities) is larger than its output domain
 		// (2**32), to properly model a real application.
 		//
 		// Hashing 32-bit inputs with hll.Murmur32(r.Uint32()) instead would
 		// produce virtually no hash collisions, which is not realistic.
 		c.Add(hll.Murmur64(r.Uint64()))
 	}
 	return c
 }

 // pctDist returns the signed relative error of actual with respect to target,
 // in percent: positive when actual is above target, negative when it is below.
 //
 // The difference is computed in floating point. Converting actual to int64
 // first would wrap values above math.MaxInt64 around to negative numbers and
 // report a bogus negative error for a huge overestimate.
 func pctDist(target int, actual uint64) float64 {
 	want := float64(target)
 	return 100 * (float64(actual) - want) / want
 }

 // count estimates the cardinality recorded in h. It follows
 // (*hll.HyperLogLog).Count() from github.com/DataDog/hyperloglog, with one
 // addition: the large range correction is applied only when
 // withLargeRangeCorrection is true.
 func count(h *hll.HyperLogLog, withLargeRangeCorrection bool) uint64 {
 	const two32 = float64(1 << 32) // 2^32

 	registers := float64(h.M)

 	// Raw estimate: Alpha * M^2 divided by the sum of 2^-register.
 	var invSum float64
 	for _, reg := range h.Registers {
 		invSum += 1 / math.Pow(2, float64(reg))
 	}
 	result := h.Alpha * registers * registers / invSum

 	switch {
 	case result <= 5.0/2.0*registers:
 		// Small range correction, driven by the number of registers still at zero.
 		empty := 0
 		for _, reg := range h.Registers {
 			if reg == 0 {
 				empty++
 			}
 		}
 		if empty != 0 {
 			result = registers * math.Log(registers/float64(empty))
 		}
 	case withLargeRangeCorrection && result > 1.0/30.0*two32:
 		// Large range correction, only when the caller asked for it.
 		result = -two32 * math.Log(1-result/two32)
 	}
 	return uint64(result)
 }
	module github.com/DataDog/experimental/users/anatole.beuzon/large-card-hll

	go 1.17

	require github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316

	require (
	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 // indirect
	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 // indirect
	)
	github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316 h1:CTR6ylcEcfvmr6xAJwLfKfud8EtIe5eBrVh5ClYj5ow=
	github.com/DataDog/hyperloglog v0.0.0-20200325135234-98c95166e316/go.mod h1:hFPkswc42pKhRbeKDKXy05mRi7J1kJ2vMNbvd9erH0M=
	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49 h1:EbzDX8HPk5uE2FsJYxD74QmMw0/3CqSKhEr6teh0ncQ=
	github.com/DataDog/mmh3 v0.0.0-20210722141835-012dc69a9e49/go.mod h1:SvsjzyJlSg0rKsqYgdcFxeEVflx3ZNAyFfkUHP0TxXg=
	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724 h1:1/c0u68+2LRI+XSpduQpV9BnKx1k1P6GTb3MVxCE3w4=
	github.com/dustin/randbo v0.0.0-20140428231429-7f1b564ca724/go.mod h1:pTiKQhUCcxt2eQMAnv48oc5nAsmelPm573z44h6PSXc=
	package main

	import (
	"flag"
	"fmt"
	"math"
	"math/rand"
	"os"
	"runtime"
	"sync"

	hll "github.com/DataDog/hyperloglog"
	)

	// main feeds `-card` randomly generated values into HyperLogLog counters and
	// prints the resulting cardinality estimate with and without the large range
	// correction, each followed by its relative error against the target.
	//
	// The work is spread over runtime.GOMAXPROCS(0) goroutines, each filling its
	// own counter; the per-goroutine counters are then merged into a final one.
	func main() {
		var target int
		flag.IntVar(&target, "card", 0, "target cardinality")
		flag.Parse()
		// A missing, zero or negative cardinality cannot be simulated: print the
		// flag help and exit with a failure status.
		if target <= 0 {
			flag.PrintDefaults()
			os.Exit(1)
		}

		// Map
		procs := runtime.GOMAXPROCS(0)
		var wg sync.WaitGroup
		counters := make([]*hll.HyperLogLog, procs)
		// Split target evenly across workers; the first `extra` workers take one
		// more value so that exactly `target` values are generated in total
		// instead of silently dropping the remainder of the division.
		share, extra := target/procs, target%procs
		for i := 0; i < procs; i++ {
			n := share
			if i < extra {
				n++
			}
			wg.Add(1)
			go func(i, n int) {
				defer wg.Done()
				// Each goroutine writes only to its own slot. Only worker 0
				// reports progress.
				counters[i] = genCounts(n, i, i == 0)
			}(i, n)
		}
		wg.Wait()

		// Reduce
		finalCounter := newCounter()
		for _, c := range counters {
			if err := finalCounter.Merge(c); err != nil {
				panic(err)
			}
		}

		resWithoutCorrection := count(finalCounter, false)
		resWithCorrection := count(finalCounter, true)
		fmt.Printf("Without correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithoutCorrection, pctDist(target, resWithoutCorrection))
		fmt.Printf("With correction:\t%d\t(inaccuracy: %+.0f%%)\n", resWithCorrection, pctDist(target, resWithCorrection))
	}

	// newCounter builds an empty HyperLogLog counter with 2^10 registers. It
	// panics if the library returns an error.
	func newCounter() *hll.HyperLogLog {
		const registers = uint(1) << 10 // 2^10 registers
		counter, err := hll.New(registers)
		if err == nil {
			return counter
		}
		panic(err)
	}

	// genCounts adds n pseudo-random numbers, drawn from a generator seeded with
	// seed, to a fresh counter and returns that counter. When printProgress is
	// true it also prints a completion percentage to stdout as it goes.
	func genCounts(n int, seed int, printProgress bool) *hll.HyperLogLog {
		r := rand.New(rand.NewSource(int64(seed)))
		c := newCounter()

		// Number of iterations between two progress updates. It is clamped to 1
		// because n/100 is 0 whenever n < 100, and `i % 0` panics with an integer
		// divide by zero.
		step := n / 100
		if step < 1 {
			step = 1
		}

		for i := 0; i < n; i++ {
			if printProgress && i%step == 0 {
				// "\r" rewinds the cursor so each update overwrites the last one.
				fmt.Printf("%d%%\r", 100*i/n)
			}

			// Inputs MUST be 64 bits wide so that the input domain of the hash
			// function (2**64 possibilities) is larger than its output domain
			// (2**32), to properly model a real application.
			//
			// Hashing 32-bit inputs with hll.Murmur32(r.Uint32()) instead would
			// produce virtually no hash collisions, which is not realistic.
			c.Add(hll.Murmur64(r.Uint64()))
		}
		return c
	}

	// pctDist returns the signed relative error of actual with respect to target,
	// in percent: positive when actual is above target, negative when it is below.
	//
	// The difference is computed in floating point. Converting actual to int64
	// first would wrap values above math.MaxInt64 around to negative numbers and
	// report a bogus negative error for a huge overestimate.
	func pctDist(target int, actual uint64) float64 {
		want := float64(target)
		return 100 * (float64(actual) - want) / want
	}

	// count estimates the cardinality recorded in h. It follows
	// (*hll.HyperLogLog).Count() from github.com/DataDog/hyperloglog, with one
	// addition: the large range correction is applied only when
	// withLargeRangeCorrection is true.
	func count(h *hll.HyperLogLog, withLargeRangeCorrection bool) uint64 {
		const two32 = float64(1 << 32) // 2^32

		registers := float64(h.M)

		// Raw estimate: Alpha * M^2 divided by the sum of 2^-register.
		var invSum float64
		for _, reg := range h.Registers {
			invSum += 1 / math.Pow(2, float64(reg))
		}
		result := h.Alpha * registers * registers / invSum

		switch {
		case result <= 5.0/2.0*registers:
			// Small range correction, driven by the number of registers still at zero.
			empty := 0
			for _, reg := range h.Registers {
				if reg == 0 {
					empty++
				}
			}
			if empty != 0 {
				result = registers * math.Log(registers/float64(empty))
			}
		case withLargeRangeCorrection && result > 1.0/30.0*two32:
			// Large range correction, only when the caller asked for it.
			result = -two32 * math.Log(1-result/two32)
		}
		return uint64(result)
	}