Last active
December 19, 2015 18:19
-
-
Save kevinclark/5998123 to your computer and use it in GitHub Desktop.
I'd like to support switching out the indexing.DocId typedef (see indexing/index.go). My tests (indexing/index_test.go) assume a uint64 DocId, but the code currently defines DocId as uint32. I want to be able to switch between the two in tests so that I can verify that changing the typedef actually changes the output format.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package indexing | |
import ( | |
"bufio" | |
"encoding/binary" | |
"errors" | |
"fmt" | |
"io" | |
"strings" | |
) | |
// magic is the marker written, followed by a NUL byte, at the very start of a
// serialized index.
const magic = "searchme"
// DocId identifies a document inside an Index. Ids are handed out from a
// counter that starts at zero.
//
// TODO(author's note): it should be possible to change this type for tests
// (or even at run time). A Go type definition is fixed at compile time, so
// that would need something like build-constrained files.
type DocId uint32
type Index struct { | |
DocCounter DocId | |
docIdToPath map[DocId]string | |
pathToDocId map[string]DocId | |
postings map[string][]DocId | |
} | |
func NewIndex() *Index { | |
var index Index | |
index.postings = make(map[string][]DocId) | |
index.docIdToPath = make(map[DocId]string) | |
index.pathToDocId = make(map[string]DocId) | |
return &index | |
} | |
func (i *Index) Add(doc *Document) error { | |
termSet := make(map[string]struct{}) | |
for _, term := range TokenizeString(doc.Content) { | |
termSet[term] = struct{}{} | |
} | |
docId := i.AssignDocId(doc.Path) | |
for term, _ := range termSet { | |
i.postings[term] = append(i.postings[term], docId) | |
} | |
return nil | |
} | |
func (i *Index) AssignDocId(path string) DocId { | |
docId, contains := i.pathToDocId[path] | |
if !contains { | |
docId = i.DocCounter | |
i.pathToDocId[path] = docId | |
i.docIdToPath[docId] = path | |
i.DocCounter++ | |
} | |
return docId | |
} | |
func (index *Index) TermPaths(term string) []string { | |
docs, found := index.postings[strings.ToLower(term)] | |
if !found { | |
return make([]string, 0) | |
} | |
results := make([]string, len(docs)) | |
for i, docId := range docs { | |
results[i] = index.docIdToPath[docId] | |
} | |
return results | |
} | |
func (index *Index) Write(w io.Writer) { | |
// List of files, null terminated. Doc ids correspond to index. Ends with an empty filename. | |
io.WriteString(w, magic+"\x00") | |
for i := DocId(0); i < index.DocCounter; i++ { | |
io.WriteString(w, index.docIdToPath[i]+"\x00") | |
} | |
io.WriteString(w, "\x00") | |
for term, docIds := range index.postings { | |
// TERM \x00 32-bit-number-of-docs 64bit doc ids until | |
io.WriteString(w, term) | |
io.WriteString(w, "\x00") | |
binary.Write(w, binary.BigEndian, uint32(len(docIds))) | |
for _, id := range docIds { | |
binary.Write(w, binary.BigEndian, id) | |
} | |
} | |
} | |
// stripNull converts b to a string, dropping a single trailing NUL byte when
// one is present.
//
// Input that is empty, or that does not end in NUL (for example a final read
// cut short by EOF), is returned unchanged instead of panicking or losing its
// last byte.
func stripNull(b []byte) string {
	if n := len(b); n > 0 && b[n-1] == 0 {
		return string(b[:n-1])
	}
	return string(b)
}
func LoadIndex(reader io.Reader) (*Index, error) { | |
index := NewIndex() | |
r := bufio.NewReader(reader) | |
magic, _ := r.ReadBytes('\x00') | |
if stripNull(magic) != "searchme" { | |
return nil, errors.New(fmt.Sprintf("Bad format. Magic bytes not detected: %q", magic)) | |
} | |
// Read docs | |
var i DocId | |
// TODO(kev): Don't ignore errors | |
for p, _ := r.ReadBytes('\x00'); len(p) >= 2; p, _ = r.ReadBytes('\x00') { | |
path := stripNull(p) | |
index.docIdToPath[i] = path | |
index.pathToDocId[path] = i | |
i++ | |
} | |
// Read terms | |
for { | |
t, err := r.ReadBytes('\x00') | |
if err != nil { | |
return index, nil | |
} | |
term := stripNull(t) | |
var size uint32 | |
binary.Read(r, binary.BigEndian, &size) | |
docs := make([]DocId, size) | |
for j := uint32(0); j < size; j++ { | |
var docId DocId | |
binary.Read(r, binary.BigEndian, &docId) | |
docs[j] = docId | |
} | |
index.postings[term] = docs | |
} | |
return index, nil | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package indexing | |
import ( | |
"bytes" | |
"testing" | |
) | |
// All of the serialized data here assumes we're writing uint64 DocIds. | |
// So something like: | |
var testWrites = []struct { | |
docs []Document | |
out string | |
}{ | |
{[]Document{}, | |
// Header | End of Docs | |
"searchme\x00\x00"}, // terminator + doc terminator | |
{[]Document{{"path", "content"}}, | |
// Header | Doc paths + Terminator | term + terminator + num-docs + doc id (0) | |
"searchme\x00path\x00\x00content\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00"}, | |
} | |
func TestWrite64(t *testing.T) { | |
// Set our tests to use 64 bit ids: | |
indexing.DocId = uint64 | |
for testNum, testCase := range testWrites { | |
i := NewIndex() | |
buf := new(bytes.Buffer) | |
for _, doc := range testCase.docs { | |
i.Add(&doc) | |
} | |
i.Write(buf) | |
result := string(buf.Bytes()) | |
if testCase.out != result { | |
t.Fatalf("%d. Expected: %q Actual %q", testNum, testCase.out, result) | |
} | |
} | |
} | |
// Then imagine another version of TestWrite that sets type to uint32 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment