Last active
June 7, 2020 08:34
-
-
Save on99/9fd158deae2b4bd4c9c0528ae0a464e1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"bytes" | |
"fmt" | |
"log" | |
"os" | |
"strconv" | |
"github.com/RoaringBitmap/roaring" | |
"github.com/adtalos/gocandy/must" | |
mapset "github.com/deckarep/golang-set" | |
"github.com/joeljunstrom/go-luhn" | |
"github.com/segmentio/fasthash/fnv1a" | |
"github.com/spaolacci/murmur3" | |
) | |
// // used to generate imei file | |
func init() { | |
// generateImeiFile() | |
// generateHashFile() | |
// os.Exit(1) | |
} | |
func generateImeiFile() { | |
file, err := os.Create("imei.txt") | |
must.BeNilError(err) | |
buf := bytes.NewBuffer(nil) | |
for i := 0; i < 1<<30; i++ { | |
deviceid := luhn.Generate(16) // generate imei | |
must.BeNil(buf.WriteString(deviceid + "\n")) | |
if buf.Len() > 1<<20 { | |
must.BeNil(buf.WriteTo(file)) | |
buf.Reset() | |
log.Println("write to file") | |
} | |
} | |
must.BeNilError(file.Close()) | |
} | |
func generateHashFile() { | |
imeiFile, err := os.Open("imei.txt") | |
must.BeNilError(err) | |
hashFile, err := os.Create("hashes.txt") | |
must.BeNilError(err) | |
set := mapset.NewThreadUnsafeSet() | |
buf := bytes.NewBuffer(nil) | |
scanner := bufio.NewScanner(imeiFile) | |
for scanner.Scan() { | |
str := scanner.Text() | |
if set.Contains(str) { // deduplicate | |
continue | |
} | |
set.Add(str) | |
i, j := fnv1a.HashString32(str), murmur3.Sum32([]byte(str)) | |
must.BeNil(buf.WriteString(fmt.Sprintf("%d\n%d\n", i, j))) | |
if buf.Len() > 1<<20 { | |
must.BeNil(buf.WriteTo(hashFile)) | |
buf.Reset() | |
log.Println("write to file") | |
} | |
} | |
must.BeNilError(scanner.Err()) | |
must.BeNilError(imeiFile.Close()) | |
must.BeNilError(hashFile.Close()) | |
} | |
type wrap struct { | |
bucket int | |
bitmaps []*roaring.Bitmap | |
} | |
func newWrap(bucket int) *wrap { | |
w := &wrap{ | |
bucket: bucket, | |
} | |
for i := 0; i < bucket; i++ { | |
w.bitmaps = append(w.bitmaps, roaring.New()) | |
} | |
return w | |
} | |
func (w *wrap) add(i, j uint32) { | |
w.bitmaps[int(i)%w.bucket].Add(j) | |
} | |
func (w *wrap) generateReport(total int) { | |
var cardinality uint64 | |
for _, v := range w.bitmaps { | |
cardinality += v.GetCardinality() | |
} | |
fmt.Println(w.bucket, cardinality, float64(cardinality)/float64(total)*100) | |
} | |
func (w *wrap) persist() { | |
os.MkdirAll(fmt.Sprintf("bitmaps/%d", w.bucket), os.ModePerm) | |
for i, v := range w.bitmaps { | |
file, err := os.Create(fmt.Sprintf("bitmaps/%d/%d", w.bucket, i)) | |
must.BeNilError(err) | |
must.BeNil(v.WriteTo(file)) | |
must.BeNilError(file.Close()) | |
} | |
} | |
func (w *wrap) memoryReport() { | |
var total uint64 | |
for _, v := range w.bitmaps { | |
total += v.GetSizeInBytes() | |
} | |
fmt.Println(w.bucket, total) | |
} | |
func main() { | |
log.Println("start") | |
for shift := 0; shift < 10; shift++ { | |
file, err := os.Open("hashes.txt") | |
must.BeNilError(err) | |
w := newWrap(1 << shift) | |
total := 0 | |
scanner := bufio.NewScanner(file) | |
var i, j uint32 | |
for scanner.Scan() { | |
str := scanner.Text() | |
n, err := strconv.ParseUint(str, 10, 32) | |
must.BeNilError(err) | |
total++ | |
switch total % 2 { | |
case 1: | |
i = uint32(n) | |
case 0: | |
j = uint32(n) | |
w.add(i, j) | |
} | |
} | |
must.BeNilError(scanner.Err()) | |
total = total / 2 | |
fmt.Println("total", total) | |
fmt.Println("accuracy report") | |
w.generateReport(total) | |
fmt.Println("persist") | |
w.persist() | |
fmt.Println("memory report") | |
w.memoryReport() | |
} | |
log.Println("finished") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
如果只用一个 bitmap 来保存很多设备号, 无论选择什么 hash 碰撞率都不低, 所以打算用多个 bitmap 来保存, 以降低碰撞率, 提高准确率. 第一次 hash 选择 bitmap, 第二次 hash 保存到 bitmap.
结果:
均匀分布, 每个 bitmap 的 cardinality 都差不多.
hashes.txt 包含 844919415 个不重复的 imei