athomason · October 30, 2019 09:25
diff --git a/cardapprox.go b/cardapprox.go
 // An implementation of https://arxiv.org/abs/1512.07901
 //
 // Simple set cardinality estimation through random sampling
 //
 // We present a simple algorithm that estimates the cardinality n of a set V
 // when allowed to sample elements of V uniformly and independently at random.
 // Our algorithm with probability (1−δ) returns a (1±ε)−approximation of n...

 package main

 import (
 	"flag"
 	"fmt"
 	"math"
 	"math/rand"
 	"time"
 )

 // ApproximateCardinality estimates the cardinality of the set that is probed
 // by sampler. With error rate δ, it returns a value within a factor 1±ε of the
 // actual cardinality.
 func ApproximateCardinality(sampler func() int, ε float64, δ float64, realN int) int {
 	var (
 		seen            = make(map[int]struct{})                                // 1: 𝑆 ← ∅
 		weightedSamples = 0                                                     // 2: 𝑤 ← 0
 		repeats         = 0                                                     // 3: 𝑟 ← 0
 		threshold       = int(math.Ceil((2 + 4.4*ε) / (ε * ε) * math.Log(3/δ))) // 4: 𝑘 ← ⌈(2+4.4𝜖)/𝜖²⋅ln(3/δ)⌉
 	)
 	for i := 0; ; i++ { // 5: 𝐫𝐞𝐩𝐞𝐚𝐭
 		weightedSamples += len(seen) // 6: 𝑤 ← 𝑤 + |𝑆|
 		e := sampler()               // 7: 𝑒 ← 𝐒𝐀𝐌𝐏𝐋𝐄(𝑉)
 		if _, ok := seen[e]; ok {
 			repeats++ // 8: 𝑟 ← 𝑟 + |𝑆 ∩ 𝑒|
 		} else {
 			seen[e] = struct{}{} // 9: 𝑆 ← 𝑆 ∪ 𝑒
 		}
 		if repeats >= threshold { // 10: 𝐮𝐧𝐭𝐢𝐥 𝑟≥𝑘
 			if false {
 				// CARDAPPROX... invokes 𝐒𝐀𝐌𝐏𝐋𝐄(𝑉) at most min(𝑛, 2⌈√𝑘𝑛⌉) + 𝑘 times
 				maxSamples := int(
 					math.Min(
 						float64(realN),
 						2*math.Ceil(math.Sqrt(float64(threshold*realN))),
 					),
 				) + threshold
 				fmt.Printf("took %d, expected %d\n", i, maxSamples)
 			}
 			return int(float64(weightedSamples) / float64(repeats)) // 11: 𝐫𝐞𝐭𝐮𝐫𝐧 𝑤/𝑟
 		}
 	}
 }

 func main() {
 	var (
 		maxN   = flag.Int("n", 64, "maximum cardinality")
 		trials = flag.Int("trials", 1000, "trials")
 		ε      = flag.Float64("epsilon", 1./64, "epsilon")       // accurate to one host in a mega64
 		δ      = flag.Float64("delta", .01, "probability bound") // with 99% probability
 	)
 	flag.Parse()
 	rand.Seed(time.Now().UnixNano())

 	mispredictions := 0            // count errors
 	for i := 0; i < *trials; i++ { // through many attempts
 		n := 1 + rand.Intn(*maxN)                     // pick N≥1
 		sampler := func() int { return rand.Intn(n) } // returns N distinct values
 		guess := ApproximateCardinality(sampler, *ε, *δ, n)
 		fmt.Printf("actual=%d -> guess=%d (%d)\n", n, guess, n-guess)
 		if n != guess {
 			mispredictions++
 		}
 	}
 	// error rate should be less than δ
 	fmt.Printf("error rate: %d/%d = %.1f%%\n",
 		mispredictions, *trials, 100*float64(mispredictions)/float64(*trials))
 }
	// An implementation of https://arxiv.org/abs/1512.07901
	//
	// Simple set cardinality estimation through random sampling
	//
	// We present a simple algorithm that estimates the cardinality n of a set V
	// when allowed to sample elements of V uniformly and independently at random.
	// Our algorithm with probability (1−δ) returns a (1±ε)−approximation of n...

	package main

	import (
	"flag"
	"fmt"
	"math"
	"math/rand"
	"time"
	)

	// ApproximateCardinality estimates the cardinality of the set that is probed
	// by sampler. With error rate δ, it returns a value within a factor 1±ε of the
	// actual cardinality.
	func ApproximateCardinality(sampler func() int, ε float64, δ float64, realN int) int {
	var (
	seen = make(map[int]struct{}) // 1: 𝑆 ← ∅
	weightedSamples = 0 // 2: 𝑤 ← 0
	repeats = 0 // 3: 𝑟 ← 0
	threshold = int(math.Ceil((2 + 4.4ε) / (ε ε) * math.Log(3/δ))) // 4: 𝑘 ← ⌈(2+4.4𝜖)/𝜖²⋅ln(3/δ)⌉
	)
	for i := 0; ; i++ { // 5: 𝐫𝐞𝐩𝐞𝐚𝐭
	weightedSamples += len(seen) // 6: 𝑤 ← 𝑤 + \|𝑆\|
	e := sampler() // 7: 𝑒 ← 𝐒𝐀𝐌𝐏𝐋𝐄(𝑉)
	if _, ok := seen[e]; ok {
	repeats++ // 8: 𝑟 ← 𝑟 + \|𝑆 ∩ 𝑒\|
	} else {
	seen[e] = struct{}{} // 9: 𝑆 ← 𝑆 ∪ 𝑒
	}
	if repeats >= threshold { // 10: 𝐮𝐧𝐭𝐢𝐥 𝑟≥𝑘
	if false {
	// CARDAPPROX... invokes 𝐒𝐀𝐌𝐏𝐋𝐄(𝑉) at most min(𝑛, 2⌈√𝑘𝑛⌉) + 𝑘 times
	maxSamples := int(
	math.Min(
	float64(realN),
	2math.Ceil(math.Sqrt(float64(thresholdrealN))),
	),
	) + threshold
	fmt.Printf("took %d, expected %d\n", i, maxSamples)
	}
	return int(float64(weightedSamples) / float64(repeats)) // 11: 𝐫𝐞𝐭𝐮𝐫𝐧 𝑤/𝑟
	}
	}
	}

	func main() {
	var (
	maxN = flag.Int("n", 64, "maximum cardinality")
	trials = flag.Int("trials", 1000, "trials")
	ε = flag.Float64("epsilon", 1./64, "epsilon") // accurate to one host in a mega64
	δ = flag.Float64("delta", .01, "probability bound") // with 99% probability
	)
	flag.Parse()
	rand.Seed(time.Now().UnixNano())

	mispredictions := 0 // count errors
	for i := 0; i < *trials; i++ { // through many attempts
	n := 1 + rand.Intn(*maxN) // pick N≥1
	sampler := func() int { return rand.Intn(n) } // returns N distinct values
	guess := ApproximateCardinality(sampler, ε, δ, n)
	fmt.Printf("actual=%d -> guess=%d (%d)\n", n, guess, n-guess)
	if n != guess {
	mispredictions++
	}
	}
	// error rate should be less than δ
	fmt.Printf("error rate: %d/%d = %.1f%%\n",
	mispredictions, trials, 100float64(mispredictions)/float64(*trials))
	}