Created
November 2, 2016 17:17
-
-
Save dgryski/a4942251c222aced8e4a30151d0a2841 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"errors" | |
"flag" | |
"fmt" | |
"log" | |
"os" | |
"runtime" | |
"runtime/pprof" | |
"time" | |
"github.com/dgryski/go-bloomindex" | |
"github.com/dgryski/trifles/repl" | |
"github.com/dustin/go-humanize" | |
) | |
func main() { | |
cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file") | |
interactive := flag.Bool("interactive", true, "interactive") | |
benchmark := flag.Bool("benchmark", true, "benchmark") | |
ngram := flag.Int("ngram", 3, "n-gram size") | |
flag.Parse() | |
if *cpuprofile != "" { | |
f, err := os.Create(*cpuprofile) | |
if err != nil { | |
log.Fatal(err) | |
} | |
pprof.StartCPUProfile(f) | |
defer pprof.StopCPUProfile() | |
} | |
var docs []string | |
var idx *bloomindex.Index | |
var ids []bloomindex.DocID | |
cIndex := func(args []string) error { | |
if len(args) < 1 { | |
return errors.New("missing argument") | |
} | |
fname := args[0] | |
f, err := os.Open(fname) | |
if err != nil { | |
return err | |
} | |
scanner := bufio.NewScanner(f) | |
if len(docs) != 0 { | |
docs = docs[:0] | |
} | |
var mem runtime.MemStats | |
runtime.ReadMemStats(&mem) | |
log.Println(humanize.Bytes(mem.Alloc)) | |
// idx = bloomindex.NewShardedIndex() //Index(256, 65536, 4) | |
idx = bloomindex.NewIndex(256, 65536, 4) | |
t0 := time.Now() | |
var tokens []T | |
var skipped int | |
for scanner.Scan() { | |
d := scanner.Text() | |
var toks []uint32 | |
for _, t := range Extract(d, *ngram, tokens[:0]) { | |
toks = append(toks, jenkinsShift(uint32(t))) | |
} | |
if len(toks) > 0 { | |
docs = append(docs, d) | |
idx.AddDocument(toks) | |
} | |
} | |
if err := scanner.Err(); err != nil { | |
fmt.Println("error during scan: ", err) | |
} | |
fmt.Printf("indexed %d documents in %s (skipped %d)\n", len(docs), time.Since(t0), skipped) | |
runtime.GC() | |
runtime.ReadMemStats(&mem) | |
log.Println(humanize.Bytes(mem.Alloc)) | |
return nil | |
} | |
cPrint := func(args []string) error { | |
for _, id := range ids { | |
fmt.Printf("%05d: %q\n", id, docs[id]) | |
} | |
return nil | |
} | |
cSearch := func(args []string) error { | |
if idx == nil { | |
return errors.New("no index loaded") | |
} | |
var tokens []T | |
var toks []uint32 | |
for _, arg := range args { | |
for _, tok := range Extract(arg, *ngram, tokens[:0]) { | |
toks = append(toks, jenkinsShift(uint32(tok))) | |
} | |
} | |
if !*benchmark { | |
t0 := time.Now() | |
ids = idx.Query(toks) | |
fmt.Printf("found %d documents in %v\n", len(ids), time.Since(t0)) | |
} else { | |
bloomindex.Log = false | |
var total time.Duration | |
for i := 0; i < 30; i++ { | |
t0 := time.Now() | |
for j := 0; j < 100; j++ { | |
ids = idx.Query(toks) | |
} | |
total += time.Since(t0) | |
fmt.Println(time.Since(t0)) | |
} | |
total /= 3000 | |
fmt.Println("QPS: ", int(time.Second/total)) | |
} | |
return nil | |
} | |
bloomindex.Log = true | |
if !*interactive { | |
cIndex([]string{"metrics.txt"}) | |
cSearch([]string{"terms", "for", "query"}) | |
} else { | |
repl.Run("bindex> ", | |
map[string]repl.Cmd{ | |
"index": cIndex, | |
"print": cPrint, | |
"search": cSearch, | |
}, | |
) | |
} | |
} | |
type T uint64 | |
// Extract returns a list of all the unique trigrams in s | |
func Extract(s string, ngram int, trigrams []T) []T { | |
for i := 0; i <= len(s)-ngram; i++ { | |
var t T | |
for j := 0; j < ngram; j++ { | |
t = (t << 8) | T(s[i+j]) | |
} | |
trigrams = appendIfUnique(trigrams, t) | |
} | |
return trigrams | |
} | |
func appendIfUnique(t []T, n T) []T { | |
for _, v := range t { | |
if v == n { | |
return t | |
} | |
} | |
return append(t, n) | |
} | |
func jenkinsShift(a uint32) uint32 { | |
a -= (a << 6) | |
a ^= (a >> 17) | |
a -= (a << 9) | |
a ^= (a << 4) | |
a -= (a << 3) | |
a ^= (a << 10) | |
a ^= (a >> 15) | |
return a | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment