Skip to content

Instantly share code, notes, and snippets.

@edef1c
Created February 18, 2022 14:27
Show Gist options
  • Save edef1c/0ce6721e59857ed7e427319c14efc0b2 to your computer and use it in GitHub Desktop.
Save edef1c/0ce6721e59857ed7e427319c14efc0b2 to your computer and use it in GitHub Desktop.
BuzHash chunker in 80 LoC
package chunker
import (
"math/bits"
)
const windowSize = 48
type hasher struct {
x uint32
i uint32
window [windowSize]byte
// window[i] is the least recently written byte
// window[i-1] is the most recently written byte
}
func newHasher(data []byte) (h hasher) {
n := len(data) - windowSize
copy(h.window[:], data[n:][:windowSize])
for i, b := range h.window {
h.x ^= bits.RotateLeft32(table[b], windowSize-i-1)
}
return
}
func (h *hasher) push(in byte) {
out := h.window[h.i]
h.window[h.i] = in
h.i = (h.i + 1) % windowSize
h.x = bits.RotateLeft32(h.x, 1) ^
bits.RotateLeft32(table[out], windowSize) ^
table[in]
}
func (h *hasher) judge(discriminator uint32) bool {
return h.x%discriminator == discriminator-1
}
var table = [256]uint32{
0x458be752, 0xc10748cc, 0xfbbcdbb8, 0x6ded5b68,
0xb10a82b5, 0x20d75648, 0xdfc5665f, 0xa8428801,
0x7ebf5191, 0x841135c7, 0x65cc53b3, 0x280a597c,
0x16f60255, 0xc78cbc3e, 0x294415f5, 0xb938d494,
0xec85c4e6, 0xb7d33edc, 0xe549b544, 0xfdeda5aa,
0x882bf287, 0x3116737c, 0x05569956, 0xe8cc1f68,
0x0806ac5e, 0x22a14443, 0x15297e10, 0x50d090e7,
0x4ba60f6f, 0xefd9f1a7, 0x5c5c885c, 0x82482f93,
0x9bfd7c64, 0x0b3e7276, 0xf2688e77, 0x8fad8abc,
0xb0509568, 0xf1ada29f, 0xa53efdfe, 0xcb2b1d00,
0xf2a9e986, 0x6463432b, 0x95094051, 0x5a223ad2,
0x9be8401b, 0x61e579cb, 0x1a556a14, 0x5840fdc2,
0x9261ddf6, 0xcde002bb, 0x52432bb0, 0xbf17373e,
0x7b7c222f, 0x2955ed16, 0x9f10ca59, 0xe840c4c9,
0xccabd806, 0x14543f34, 0x1462417a, 0x0d4a1f9c,
0x087ed925, 0xd7f8f24c, 0x7338c425, 0xcf86c8f5,
0xb19165cd, 0x9891c393, 0x325384ac, 0x0308459d,
0x86141d7e, 0xc922116a, 0xe2ffa6b6, 0x53f52aed,
0x2cd86197, 0xf5b9f498, 0xbf319c8f, 0xe0411fae,
0x977eb18c, 0xd8770976, 0x9833466a, 0xc674df7f,
0x8c297d45, 0x8ca48d26, 0xc49ed8e2, 0x7344f874,
0x556f79c7, 0x6b25eaed, 0xa03e2b42, 0xf68f66a4,
0x8e8b09a2, 0xf2e0e62a, 0x0d3a9806, 0x9729e493,
0x8c72b0fc, 0x160b94f6, 0x450e4d3d, 0x7a320e85,
0xbef8f0e1, 0x21d73653, 0x4e3d977a, 0x1e7b3929,
0x1cc6c719, 0xbe478d53, 0x8d752809, 0xe6d8c2c6,
0x275f0892, 0xc8acc273, 0x4cc21580, 0xecc4a617,
0xf5f7be70, 0xe795248a, 0x375a2fe9, 0x425570b6,
0x8898dcf8, 0xdc2d97c4, 0x0106114b, 0x364dc22f,
0x1e0cad1f, 0xbe63803c, 0x5f69fac2, 0x4d5afa6f,
0x1bc0dfb5, 0xfb273589, 0x0ea47f7b, 0x3c1c2b50,
0x21b2a932, 0x6b1223fd, 0x2fe706a8, 0xf9bd6ce2,
0xa268e64e, 0xe987f486, 0x3eacf563, 0x1ca2018c,
0x65e18228, 0x2207360a, 0x57cf1715, 0x34c37d2b,
0x1f8f3cde, 0x93b657cf, 0x31a019fd, 0xe69eb729,
0x8bca7b9b, 0x4c9d5bed, 0x277ebeaf, 0xe0d8f8ae,
0xd150821c, 0x31381871, 0xafc3f1b0, 0x927db328,
0xe95effac, 0x305a47bd, 0x426ba35b, 0x1233af3f,
0x686a5b83, 0x50e072e5, 0xd9d3bb2a, 0x8befc475,
0x487f0de6, 0xc88dff89, 0xbd664d5e, 0x971b5d18,
0x63b14847, 0xd7d3c1ce, 0x7f583cf3, 0x72cbcb09,
0xc0d0a81c, 0x7fa3429b, 0xe9158a1b, 0x225ea19a,
0xd8ca9ea3, 0xc763b282, 0xbb0c6341, 0x020b8293,
0xd4cd299d, 0x58cfa7f8, 0x91b4ee53, 0x37e4d140,
0x95ec764c, 0x30f76b06, 0x5ee68d24, 0x679c8661,
0xa41979c2, 0xf2b61284, 0x4fac1475, 0x0adb49f9,
0x19727a23, 0x15a7e374, 0xc43a18d5, 0x3fb1aa73,
0x342fc615, 0x924c0793, 0xbee2d7f0, 0x8a279de9,
0x4aa2d70c, 0xe24dd37f, 0xbe862c0b, 0x177c22c2,
0x5388e5ee, 0xcd8a7510, 0xf901b4fd, 0xdbc13dbc,
0x6c0bae5b, 0x64efe8c7, 0x48b02079, 0x80331a49,
0xca3d8ae6, 0xf3546190, 0xfed7108b, 0xc49b941b,
0x32baf4a9, 0xeb833a4a, 0x88a3f1a5, 0x3a91ce0a,
0x3cc27da1, 0x7112e684, 0x4a3096b1, 0x3794574c,
0xa3c8b6f3, 0x1d213941, 0x6e0a2e00, 0x233479f1,
0x0f4cd82f, 0x6093edd2, 0x5d7d209e, 0x464fe319,
0xd4dcac9e, 0x0db845cb, 0xfb5e4bc3, 0xe0256ce1,
0x09fb4ed1, 0x0914be1e, 0xa5bdb2c3, 0xc6eb57bb,
0x30320350, 0x3f397e91, 0xa67791bc, 0x86bc0e2c,
0xefa0a7e2, 0xe9ff7543, 0xe733612c, 0xd185897b,
0x329e5388, 0x91dd236b, 0x2ecb0d93, 0xf4d82a3d,
0x35b5c03f, 0xe4e606f0, 0x05b21843, 0x37b45964,
0x5eff22f4, 0x6027f4cc, 0x77178b3c, 0xae507131,
0x7bf7cabc, 0xf9c18d66, 0x593ade65, 0xd95ddf11,
}
package chunker
import (
"bytes"
"io"
)
const (
MinChunk = AvgChunk / 4
AvgChunk = 64 * 1024
MaxChunk = AvgChunk * 4
)
var discriminator = discriminatorFromAverage(AvgChunk)
func discriminatorFromAverage(avg uint64) uint32 {
return uint32(float64(avg) / (-1.42888852e-7*float64(avg) + 1.33237515))
}
type Chunker struct {
b bytes.Buffer
r io.Reader
}
func NewChunker(r io.Reader) *Chunker {
return &Chunker{r: r}
}
// Next returns the next chunk from the underlying reader.
// The returned slice is valid until the method is called again.
// Any errors are from the underlying reader.
func (c *Chunker) Next() (chunk []byte, err error) {
buffer, err := c.fill()
if err != nil {
return nil, err
}
if len(buffer) <= MinChunk {
// underfill, give up
return c.b.Next(MinChunk), nil
}
h := newHasher(buffer[:MinChunk])
for i, b := range buffer[MinChunk:] {
h.push(b)
if h.judge(discriminator) {
// split point, got 'em
return c.b.Next(MinChunk + i + 1), nil
}
}
// full chunk, give up
return c.b.Next(MaxChunk), nil
}
func (c *Chunker) fill() (buffer []byte, err error) {
need := MaxChunk - c.b.Len()
_, err = c.b.ReadFrom(io.LimitReader(c.r, int64(need)))
if err != nil {
return nil, err
}
if c.b.Len() == 0 {
return nil, io.EOF
}
return c.b.Bytes(), nil
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment