Skip to content

Instantly share code, notes, and snippets.

@on99
Last active June 7, 2020 08:34
Show Gist options
  • Save on99/9fd158deae2b4bd4c9c0528ae0a464e1 to your computer and use it in GitHub Desktop.
Save on99/9fd158deae2b4bd4c9c0528ae0a464e1 to your computer and use it in GitHub Desktop.
package main
import (
"bufio"
"bytes"
"fmt"
"log"
"os"
"strconv"
"github.com/RoaringBitmap/roaring"
"github.com/adtalos/gocandy/must"
mapset "github.com/deckarep/golang-set"
"github.com/joeljunstrom/go-luhn"
"github.com/segmentio/fasthash/fnv1a"
"github.com/spaolacci/murmur3"
)
// // used to generate imei file
func init() {
// generateImeiFile()
// generateHashFile()
// os.Exit(1)
}
func generateImeiFile() {
file, err := os.Create("imei.txt")
must.BeNilError(err)
buf := bytes.NewBuffer(nil)
for i := 0; i < 1<<30; i++ {
deviceid := luhn.Generate(16) // generate imei
must.BeNil(buf.WriteString(deviceid + "\n"))
if buf.Len() > 1<<20 {
must.BeNil(buf.WriteTo(file))
buf.Reset()
log.Println("write to file")
}
}
must.BeNilError(file.Close())
}
func generateHashFile() {
imeiFile, err := os.Open("imei.txt")
must.BeNilError(err)
hashFile, err := os.Create("hashes.txt")
must.BeNilError(err)
set := mapset.NewThreadUnsafeSet()
buf := bytes.NewBuffer(nil)
scanner := bufio.NewScanner(imeiFile)
for scanner.Scan() {
str := scanner.Text()
if set.Contains(str) { // deduplicate
continue
}
set.Add(str)
i, j := fnv1a.HashString32(str), murmur3.Sum32([]byte(str))
must.BeNil(buf.WriteString(fmt.Sprintf("%d\n%d\n", i, j)))
if buf.Len() > 1<<20 {
must.BeNil(buf.WriteTo(hashFile))
buf.Reset()
log.Println("write to file")
}
}
must.BeNilError(scanner.Err())
must.BeNilError(imeiFile.Close())
must.BeNilError(hashFile.Close())
}
type wrap struct {
bucket int
bitmaps []*roaring.Bitmap
}
func newWrap(bucket int) *wrap {
w := &wrap{
bucket: bucket,
}
for i := 0; i < bucket; i++ {
w.bitmaps = append(w.bitmaps, roaring.New())
}
return w
}
func (w *wrap) add(i, j uint32) {
w.bitmaps[int(i)%w.bucket].Add(j)
}
func (w *wrap) generateReport(total int) {
var cardinality uint64
for _, v := range w.bitmaps {
cardinality += v.GetCardinality()
}
fmt.Println(w.bucket, cardinality, float64(cardinality)/float64(total)*100)
}
func (w *wrap) persist() {
os.MkdirAll(fmt.Sprintf("bitmaps/%d", w.bucket), os.ModePerm)
for i, v := range w.bitmaps {
file, err := os.Create(fmt.Sprintf("bitmaps/%d/%d", w.bucket, i))
must.BeNilError(err)
must.BeNil(v.WriteTo(file))
must.BeNilError(file.Close())
}
}
func (w *wrap) memoryReport() {
var total uint64
for _, v := range w.bitmaps {
total += v.GetSizeInBytes()
}
fmt.Println(w.bucket, total)
}
func main() {
log.Println("start")
for shift := 0; shift < 10; shift++ {
file, err := os.Open("hashes.txt")
must.BeNilError(err)
w := newWrap(1 << shift)
total := 0
scanner := bufio.NewScanner(file)
var i, j uint32
for scanner.Scan() {
str := scanner.Text()
n, err := strconv.ParseUint(str, 10, 32)
must.BeNilError(err)
total++
switch total % 2 {
case 1:
i = uint32(n)
case 0:
j = uint32(n)
w.add(i, j)
}
}
must.BeNilError(scanner.Err())
total = total / 2
fmt.Println("total", total)
fmt.Println("accuracy report")
w.generateReport(total)
fmt.Println("persist")
w.persist()
fmt.Println("memory report")
w.memoryReport()
}
log.Println("finished")
}
@on99
Copy link
Author

on99 commented Jun 5, 2020

如果只用一个 bitmap 来保存很多设备号, 无论选择什么 hash 碰撞率都不低, 所以打算用多个 bitmap 来保存, 以降低碰撞率, 提高准确率. 第一次 hash 选择 bitmap, 第二次 hash 保存到 bitmap.

结果:

均匀分布, 每个 bitmap 的 cardinality 都差不多.

hashes.txt 包含 844919415 个不重复的 imei

bucket cardinality accuracy memory disk
1 767018244 90.780047% 537001992 513M
2 804699026 95.239736% 1074003984 1.1G
4 824477124 97.580563% 1649478568 1.6G
8 834614465 98.780362% 1670277570 1.6G
16 839747719 99.387906% 1681592718 1.6G
32 842328022 99.693297% 1688850604 1.6G
64 843622908 99.846552% 1695634936 1.7G
128 844269472 99.923076% 1705317184 1.7G
256 844594313 99.961522% 1722745106 1.7G
512 844756468 99.980714% 1756625896 1.9G

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment