Created
December 17, 2022 01:37
-
-
Save prateek/3ea7d540f1cb751bad44f6cab5338996 to your computer and use it in GitHub Desktop.
TestKeysDistribution demonstrates why you have to be careful when picking hash functions while double hashing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"hash/crc32" | |
"math" | |
"testing" | |
) | |
/* | |
TestKeysDistribution demonstrates why you have to be careful when picking | |
hash functions while double hashing. | |
❯ GO111MODULE=off go run . -test.v | |
=== RUN TestKeysDistribution | |
=== RUN TestKeysDistribution/different_hash_functions | |
=== RUN TestKeysDistribution/same_hash_functions | |
main.go:51: instance 0 used 8192 out 16384 hash slots | |
main.go:59: instance 0 had 61.035400 keys/slot which is not within 10% of 30.517578 | |
main.go:51: instance 1 used 8192 out 16384 hash slots | |
main.go:59: instance 1 had 61.034912 keys/slot which is not within 10% of 30.517578 | |
--- FAIL: TestKeysDistribution (0.18s) | |
--- PASS: TestKeysDistribution/different_hash_functions (0.09s) | |
--- FAIL: TestKeysDistribution/same_hash_functions (0.09s) | |
FAIL | |
exit status 1 | |
*/ | |
const ( | |
_numTestKeys = 1000 * 1000 | |
_numRedisInstances = 2 | |
_numHashSlots = 16384 | |
) | |
func TestKeysDistribution(t *testing.T) { | |
for _, tt := range []struct { | |
name string | |
pickInstanceHashFn func(key []byte) uint32 | |
pickSlotHashFn func(key []byte) uint32 | |
}{ | |
{"different hash functions", crc32.ChecksumIEEE, _crc32Castagnoli}, | |
{"same hash functions", crc32.ChecksumIEEE, crc32.ChecksumIEEE}, | |
} { | |
t.Run(tt.name, func(t *testing.T) { | |
instanceDispersionCounts := make(map[int]*dispersionCounter) | |
for i := 0; i < _numRedisInstances; i++ { | |
instanceDispersionCounts[i] = newDispersionCounter(_numHashSlots) | |
} | |
for _, key := range _randomKeys { | |
i := int(tt.pickInstanceHashFn([]byte(key)) % _numRedisInstances) | |
slot := tt.pickSlotHashFn([]byte(key)) % _numHashSlots | |
instanceDispersionCounts[i].incr(slot) | |
} | |
// assert properties on the distribution | |
for i, instance := range instanceDispersionCounts { | |
summary := instance.summary() | |
// each slot in each instance must be used | |
if slotsUsed := int(summary.count); slotsUsed < _numHashSlots { | |
t.Errorf("instance %d used %d out %d hash slots", i, slotsUsed, _numHashSlots) | |
} | |
// if we have a uniform distribution, we expect to divide the keys roughly equally | |
expectedNumKeysPerSlot := float64(_numTestKeys) / float64(_numHashSlots) / float64(_numRedisInstances) | |
// assert we're within 10% of fully uniform distribution | |
if summary.mean < 0.9*expectedNumKeysPerSlot || summary.mean > 1.1*expectedNumKeysPerSlot { | |
t.Errorf("instance %d had %f keys/slot which is not within 10%% of %f", i, summary.mean, expectedNumKeysPerSlot) | |
} | |
} | |
}) | |
} | |
} | |
type dispersionCounter struct { | |
numSlots uint32 | |
counts map[uint32]int | |
} | |
func newDispersionCounter(numSlots uint32) *dispersionCounter { | |
return &dispersionCounter{ | |
numSlots: numSlots, | |
counts: make(map[uint32]int, numSlots), | |
} | |
} | |
func (d *dispersionCounter) incr(slot uint32) { | |
if slot >= d.numSlots { | |
panic("slot >= numSlots") | |
} | |
d.counts[slot] = d.counts[slot] + 1 | |
} | |
func (d *dispersionCounter) summary() summaryStats { | |
summary := summaryStats{ | |
max: -1 * math.MaxFloat64, | |
min: math.MaxFloat64, | |
count: 0, | |
} | |
sum := 0.0 | |
for _, count := range d.counts { | |
c := float64(count) | |
summary.max = math.Max(summary.max, c) | |
summary.min = math.Min(summary.min, c) | |
summary.count = summary.count + 1 | |
sum += c | |
} | |
if summary.count > 0 { | |
summary.mean = sum / summary.count | |
} | |
return summary | |
} | |
type summaryStats struct { | |
count float64 | |
mean float64 | |
max float64 | |
min float64 | |
} | |
func newRandomKeys(num int) [][]byte { | |
ret := make([][]byte, 0, num) | |
for i := 0; i < num; i++ { | |
ret = append(ret, []byte(fmt.Sprintf("random-string-id-%d", i))) | |
} | |
return ret | |
} | |
var ( | |
_randomKeys = newRandomKeys(_numTestKeys) | |
_castagnoliTable = crc32.MakeTable(crc32.Castagnoli) | |
_crc32Castagnoli = func(b []byte) uint32 { | |
return crc32.Checksum(b, _castagnoliTable) | |
} | |
) | |
func matchString(a, b string) (bool, error) { | |
return a == b, nil | |
} | |
func main() { | |
testSuite := []testing.InternalTest{ | |
{ | |
Name: "TestKeysDistribution", | |
F: TestKeysDistribution, | |
}, | |
} | |
testing.Main(matchString, testSuite, nil, nil) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment