Skip to content

Instantly share code, notes, and snippets.

@kevinclark
Last active December 19, 2015 18:19
Show Gist options
  • Save kevinclark/5998123 to your computer and use it in GitHub Desktop.
Save kevinclark/5998123 to your computer and use it in GitHub Desktop.
I'd like to support switching out the indexing.DocId typedef (see indexing/index.go). Currently my tests (indexing/index_test.go) assume a uint64 DocId but the code is currently defined as having a uint32 DocId. I want to be able to switch between the two in tests so that I can see that changing the typedef actually changes the output format.
package indexing
import (
"bufio"
"encoding/binary"
"errors"
"fmt"
"io"
"strings"
)
const (
magic = "searchme"
)
// I'd like to be able to change this typedef during tests (or execution even, I guess).
type DocId uint32
type Index struct {
DocCounter DocId
docIdToPath map[DocId]string
pathToDocId map[string]DocId
postings map[string][]DocId
}
func NewIndex() *Index {
var index Index
index.postings = make(map[string][]DocId)
index.docIdToPath = make(map[DocId]string)
index.pathToDocId = make(map[string]DocId)
return &index
}
func (i *Index) Add(doc *Document) error {
termSet := make(map[string]struct{})
for _, term := range TokenizeString(doc.Content) {
termSet[term] = struct{}{}
}
docId := i.AssignDocId(doc.Path)
for term, _ := range termSet {
i.postings[term] = append(i.postings[term], docId)
}
return nil
}
func (i *Index) AssignDocId(path string) DocId {
docId, contains := i.pathToDocId[path]
if !contains {
docId = i.DocCounter
i.pathToDocId[path] = docId
i.docIdToPath[docId] = path
i.DocCounter++
}
return docId
}
func (index *Index) TermPaths(term string) []string {
docs, found := index.postings[strings.ToLower(term)]
if !found {
return make([]string, 0)
}
results := make([]string, len(docs))
for i, docId := range docs {
results[i] = index.docIdToPath[docId]
}
return results
}
func (index *Index) Write(w io.Writer) {
// List of files, null terminated. Doc ids correspond to index. Ends with an empty filename.
io.WriteString(w, magic+"\x00")
for i := DocId(0); i < index.DocCounter; i++ {
io.WriteString(w, index.docIdToPath[i]+"\x00")
}
io.WriteString(w, "\x00")
for term, docIds := range index.postings {
// TERM \x00 32-bit-number-of-docs 64bit doc ids until
io.WriteString(w, term)
io.WriteString(w, "\x00")
binary.Write(w, binary.BigEndian, uint32(len(docIds)))
for _, id := range docIds {
binary.Write(w, binary.BigEndian, id)
}
}
}
func stripNull(b []byte) string {
return string(b[:len(b)-1])
}
func LoadIndex(reader io.Reader) (*Index, error) {
index := NewIndex()
r := bufio.NewReader(reader)
magic, _ := r.ReadBytes('\x00')
if stripNull(magic) != "searchme" {
return nil, errors.New(fmt.Sprintf("Bad format. Magic bytes not detected: %q", magic))
}
// Read docs
var i DocId
// TODO(kev): Don't ignore errors
for p, _ := r.ReadBytes('\x00'); len(p) >= 2; p, _ = r.ReadBytes('\x00') {
path := stripNull(p)
index.docIdToPath[i] = path
index.pathToDocId[path] = i
i++
}
// Read terms
for {
t, err := r.ReadBytes('\x00')
if err != nil {
return index, nil
}
term := stripNull(t)
var size uint32
binary.Read(r, binary.BigEndian, &size)
docs := make([]DocId, size)
for j := uint32(0); j < size; j++ {
var docId DocId
binary.Read(r, binary.BigEndian, &docId)
docs[j] = docId
}
index.postings[term] = docs
}
return index, nil
}
package indexing
import (
"bytes"
"testing"
)
// All of the serialized data here assumes we're writing uint64 DocIds.
// So something like:
var testWrites = []struct {
docs []Document
out string
}{
{[]Document{},
// Header | End of Docs
"searchme\x00\x00"}, // terminator + doc terminator
{[]Document{{"path", "content"}},
// Header | Doc paths + Terminator | term + terminator + num-docs + doc id (0)
"searchme\x00path\x00\x00content\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"},
}
func TestWrite64(t *testing.T) {
// Set our tests to use 64 bit ids:
indexing.DocId = uint64
for testNum, testCase := range testWrites {
i := NewIndex()
buf := new(bytes.Buffer)
for _, doc := range testCase.docs {
i.Add(&doc)
}
i.Write(buf)
result := string(buf.Bytes())
if testCase.out != result {
t.Fatalf("%d. Expected: %q Actual %q", testNum, testCase.out, result)
}
}
}
// Then imagine another version of TestWrite that sets type to uint32
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment