Last active
August 29, 2015 14:10
-
-
Save dgryski/d8f1c8cb9c8df5a438a3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"io" | |
"os" | |
"sort" | |
"strconv" | |
"strings" | |
"time" | |
"github.com/dgryski/go-trigram" | |
"github.com/peterh/liner" | |
) | |
func main() { | |
var docs []string | |
var ids []trigram.DocID | |
var idx trigram.Index | |
line := liner.NewLiner() | |
defer line.Close() | |
REPL: | |
for { | |
var err error | |
var command string | |
command, err = line.Prompt("trigram> ") | |
if err == io.EOF { | |
break | |
} | |
if err != nil { | |
fmt.Println("error reading line: ", err) | |
continue | |
} | |
fields := strings.Fields(command) | |
if len(fields) == 0 { | |
continue | |
} | |
line.AppendHistory(command) | |
switch fields[0] { | |
case "brute", "b": | |
if idx == nil { | |
fmt.Println("no index loaded") | |
break | |
} | |
if len(fields) == 1 { | |
fmt.Println("missing argument") | |
break | |
} | |
patterns := fields[1:] | |
if len(ids) != 0 { | |
ids = ids[:0] | |
} | |
t0 := time.Now() | |
for i, s := range docs { | |
var mismatch = false | |
search: | |
for _, pat := range patterns { | |
if !strings.Contains(s, pat) { | |
mismatch = true | |
break search | |
} | |
} | |
if !mismatch { | |
ids = append(ids, trigram.DocID(i)) | |
} | |
} | |
fmt.Println("found", len(ids), "documents in", time.Since(t0)) | |
case "f", "filter": | |
if idx == nil { | |
fmt.Println("no index loaded") | |
break | |
} | |
var ts []trigram.T | |
for _, f := range fields[1:] { | |
ts = trigram.Extract(f, ts) | |
} | |
t0 := time.Now() | |
ids = idx.Filter(ids, ts) | |
fmt.Println("filtered", len(ids), "documents in", time.Since(t0)) | |
case "h", "help": | |
fmt.Println("b[rute] pattern -- brute force search for `pattern`") | |
fmt.Println("f[ilter] pat1 pat2... -- filter current matches with additional trigrams") | |
fmt.Println("h[elp] -- this help") | |
fmt.Println("index file.txt -- load a file into the index") | |
fmt.Println("p[rint] -- print current matches") | |
fmt.Println("q[uit] -- quit") | |
fmt.Println("s[earch] pat1 pat2... -- trigram search for docs containing the specified patterns") | |
fmt.Println("t[rigram] pat1 pat2... -- show trigram frequencies for the given patterns") | |
case "index": | |
if len(fields) != 2 { | |
fmt.Println("missing argument") | |
break | |
} | |
fname := fields[1] | |
f, err := os.Open(fname) | |
if err != nil { | |
fmt.Println(err) | |
break | |
} | |
scanner := bufio.NewScanner(f) | |
if len(docs) != 0 { | |
docs = docs[:0] | |
} | |
idx = trigram.NewIndex(nil) | |
t0 := time.Now() | |
for scanner.Scan() { | |
d := scanner.Text() | |
docs = append(docs, d) | |
// add the trigrams | |
idx.Add(d) | |
} | |
fmt.Printf("indexed %d documents in %s\n", len(docs), time.Since(t0)) | |
case "p", "print": | |
for _, id := range ids { | |
fmt.Printf("%05d: %q\n", id, docs[id]) | |
} | |
case "prune": | |
if idx == nil { | |
fmt.Println("no index loaded") | |
break | |
} | |
if len(fields) != 2 { | |
} | |
pct, _ := strconv.Atoi(fields[1]) | |
pruned := idx.Prune(float64(pct) / 100) | |
fmt.Println("pruned", pruned, "at", pct) | |
case "q", "quit": | |
break REPL | |
case "s", "search": | |
if idx == nil { | |
fmt.Println("no index loaded") | |
break | |
} | |
var ts []trigram.T | |
for _, f := range fields[1:] { | |
ts = trigram.Extract(f, ts) | |
} | |
t0 := time.Now() | |
ids = idx.QueryTrigrams(ts) | |
fmt.Println("found", len(ids), "documents in", time.Since(t0)) | |
case "top": | |
var freq []int | |
for _, v := range idx { | |
freq = append(freq, len(v)) | |
} | |
sort.Ints(freq) | |
for i := 0; i < 100; i++ { | |
fmt.Println(freq[len(freq)-1-i]) | |
} | |
case "t", "tri", "trigram", "trigrams": | |
if idx == nil { | |
fmt.Println("no index loaded") | |
break | |
} | |
var ts []trigram.T | |
for _, f := range fields[1:] { | |
ts = trigram.Extract(f, ts) | |
} | |
for _, t := range ts { | |
fmt.Printf("%q: %d\n", t, len(idx[t])) | |
} | |
default: | |
fmt.Println("unknown command, try `help`") | |
} | |
} | |
fmt.Println("bye") | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment