Created
February 6, 2015 18:57
-
-
Save husio/ed0e241c99550fd20065 to your computer and use it in GitHub Desktop.
index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"io" | |
"unicode" | |
) | |
const ( | |
ngramSize = 3 | |
) | |
type cursor struct { | |
// line number | |
LineNo uint16 | |
// number of bytes from new line, because of unicode | |
LinePos uint8 | |
Doc string | |
} | |
type Index struct { | |
ngrams map[string][]*cursor | |
} | |
func NewIndex() *Index { | |
return &Index{ngrams: make(map[string][]*cursor)} | |
} | |
func (idx *Index) Index(r io.Reader) error { | |
s := newScanner(r, ngramSize) | |
for { | |
if err := s.Next(); err != nil { | |
if err == io.EOF { | |
return nil | |
} | |
return err | |
} | |
ngs := string(s.Ngram) | |
ngrams, ok := idx.ngrams[ngs] | |
if !ok { | |
ngrams = make([]*cursor, 0) | |
} | |
c := &cursor{ | |
LineNo: uint16(s.LineNo), | |
LinePos: uint8(s.RdOff - len(ngs)), | |
Doc: "", | |
} | |
idx.ngrams[ngs] = append(ngrams, c) | |
} | |
} | |
func (idx *Index) Search(query string) { | |
// XXX | |
} | |
type scanner struct { | |
rd *bufio.Reader | |
Ngram []rune | |
LineNo int | |
RdOff int | |
} | |
func newScanner(r io.Reader, ngramSize int) *scanner { | |
s := &scanner{ | |
rd: bufio.NewReader(r), | |
Ngram: make([]rune, ngramSize), | |
LineNo: 0, | |
RdOff: 0, | |
} | |
s.initFill() | |
return s | |
} | |
func (s *scanner) initFill() { | |
if err := s.skipNoise(' ', 0); err != nil { | |
return | |
} | |
for i := len(s.Ngram) - 1; i > 0; i-- { | |
s.Ngram[i] = s.Ngram[i-1] | |
} | |
} | |
func (s *scanner) Next() error { | |
for i := 0; i < len(s.Ngram)-1; i++ { | |
s.Ngram[i] = s.Ngram[i+1] | |
} | |
for { | |
r, n, err := s.rd.ReadRune() | |
if err != nil { | |
return err | |
} | |
if isNoise(r) { | |
if err := s.skipNoise(r, n); err != nil { | |
return err | |
} | |
continue | |
} | |
s.Ngram[len(s.Ngram)-1] = r | |
s.RdOff += n | |
return nil | |
} | |
} | |
func (s *scanner) skipNoise(r rune, n int) error { | |
s.RdOff += n | |
if r == '\n' { | |
s.LineNo++ | |
s.RdOff = 0 | |
} | |
for i := 0; i < len(s.Ngram)-1; i++ { | |
r, n, err := s.rd.ReadRune() | |
if err != nil { | |
return err | |
} | |
if isNoise(r) { | |
return s.skipNoise(r, n) | |
} | |
s.Ngram[i] = r | |
s.RdOff += n | |
} | |
return nil | |
} | |
func isNoise(r rune) bool { | |
return unicode.IsSpace(r) || unicode.IsSymbol(r) || unicode.IsPunct(r) | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"flag" | |
"fmt" | |
"io" | |
"log" | |
"os" | |
"strings" | |
"unicode" | |
) | |
const ( | |
ngramSize = 3 | |
) | |
type ngram struct { | |
// line number | |
lineno uint16 | |
// number of bytes from new line, because of unicode | |
linepos uint8 | |
} | |
type Index struct { | |
ngrams map[string][]*ngram | |
} | |
func NewIndex() *Index { | |
return &Index{ngrams: make(map[string][]*ngram)} | |
} | |
func (idx *Index) Index(r io.Reader) error { | |
s := newScanner(r, ngramSize) | |
for { | |
if err := s.Next(); err != nil { | |
if err == io.EOF { | |
return nil | |
} | |
return err | |
} | |
ngs := string(s.Ngram) | |
ngrams, ok := idx.ngrams[ngs] | |
if !ok { | |
ngrams = make([]*ngram, 0) | |
} | |
ng := &ngram{ | |
lineno: uint16(s.LineNo), | |
linepos: uint8(s.RdOff - len(ngs)), | |
} | |
idx.ngrams[ngs] = append(ngrams, ng) | |
} | |
} | |
func (idx *Index) String() string { | |
lines := make([]string, 0, len(idx.ngrams)) | |
for s, ngrams := range idx.ngrams { | |
chunks := []string{s} | |
for _, ng := range ngrams { | |
chunks = append(chunks, fmt.Sprintf("%d:%d", ng.lineno, ng.linepos)) | |
} | |
lines = append(lines, strings.Join(chunks, " ")) | |
} | |
return strings.Join(lines, "\n") | |
} | |
func (idx *Index) addNgram(lineno uint16, linepos uint8, gram string) { | |
ngrams, ok := idx.ngrams[gram] | |
if !ok { | |
ngrams = make([]*ngram, 0) | |
} | |
idx.ngrams[gram] = append(ngrams, &ngram{lineno: lineno, linepos: linepos}) | |
} | |
func isNoise(r rune) bool { | |
return unicode.IsSpace(r) || unicode.IsSymbol(r) || unicode.IsPunct(r) | |
} | |
type scanner struct { | |
rd *bufio.Reader | |
Ngram []rune | |
LineNo int | |
RdOff int | |
} | |
func newScanner(r io.Reader, ngramSize int) *scanner { | |
s := &scanner{ | |
rd: bufio.NewReader(r), | |
Ngram: make([]rune, ngramSize), | |
LineNo: 0, | |
RdOff: 0, | |
} | |
s.initFill() | |
return s | |
} | |
func (s *scanner) initFill() { | |
if err := s.skipNoise(' ', 0); err != nil { | |
return | |
} | |
for i := len(s.Ngram) - 1; i > 0; i-- { | |
s.Ngram[i] = s.Ngram[i-1] | |
} | |
} | |
func (s *scanner) Next() error { | |
for i := 0; i < len(s.Ngram)-1; i++ { | |
s.Ngram[i] = s.Ngram[i+1] | |
} | |
for { | |
r, n, err := s.rd.ReadRune() | |
if err != nil { | |
return err | |
} | |
if isNoise(r) { | |
if err := s.skipNoise(r, n); err != nil { | |
return err | |
} | |
continue | |
} | |
s.Ngram[len(s.Ngram)-1] = r | |
s.RdOff += n | |
return nil | |
} | |
} | |
func (s *scanner) skipNoise(r rune, n int) error { | |
s.RdOff += n | |
if r == '\n' { | |
s.LineNo++ | |
s.RdOff = 0 | |
} | |
for i := 0; i < len(s.Ngram)-1; i++ { | |
r, n, err := s.rd.ReadRune() | |
if err != nil { | |
return err | |
} | |
if isNoise(r) { | |
return s.skipNoise(r, n) | |
} | |
s.Ngram[i] = r | |
s.RdOff += n | |
} | |
return nil | |
} | |
func main() { | |
flag.Parse() | |
index := NewIndex() | |
for _, src := range flag.Args() { | |
fd, err := os.Open(src) | |
if err != nil { | |
log.Printf("cannot open %s: %s", src, err) | |
continue | |
} | |
defer fd.Close() | |
if err := index.Index(fd); err != nil { | |
log.Printf("cannot parse %s: %s", src, err) | |
} | |
} | |
log.Println(index) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment