Skip to content

Instantly share code, notes, and snippets.

@husio
Created February 6, 2015 18:57
Show Gist options
  • Save husio/ed0e241c99550fd20065 to your computer and use it in GitHub Desktop.
Save husio/ed0e241c99550fd20065 to your computer and use it in GitHub Desktop.
index
package main
import (
"bufio"
"io"
"unicode"
)
const (
ngramSize = 3
)
type cursor struct {
// line number
LineNo uint16
// number of bytes from new line, because of unicode
LinePos uint8
Doc string
}
type Index struct {
ngrams map[string][]*cursor
}
func NewIndex() *Index {
return &Index{ngrams: make(map[string][]*cursor)}
}
func (idx *Index) Index(r io.Reader) error {
s := newScanner(r, ngramSize)
for {
if err := s.Next(); err != nil {
if err == io.EOF {
return nil
}
return err
}
ngs := string(s.Ngram)
ngrams, ok := idx.ngrams[ngs]
if !ok {
ngrams = make([]*cursor, 0)
}
c := &cursor{
LineNo: uint16(s.LineNo),
LinePos: uint8(s.RdOff - len(ngs)),
Doc: "",
}
idx.ngrams[ngs] = append(ngrams, c)
}
}
func (idx *Index) Search(query string) {
// XXX
}
type scanner struct {
rd *bufio.Reader
Ngram []rune
LineNo int
RdOff int
}
func newScanner(r io.Reader, ngramSize int) *scanner {
s := &scanner{
rd: bufio.NewReader(r),
Ngram: make([]rune, ngramSize),
LineNo: 0,
RdOff: 0,
}
s.initFill()
return s
}
func (s *scanner) initFill() {
if err := s.skipNoise(' ', 0); err != nil {
return
}
for i := len(s.Ngram) - 1; i > 0; i-- {
s.Ngram[i] = s.Ngram[i-1]
}
}
func (s *scanner) Next() error {
for i := 0; i < len(s.Ngram)-1; i++ {
s.Ngram[i] = s.Ngram[i+1]
}
for {
r, n, err := s.rd.ReadRune()
if err != nil {
return err
}
if isNoise(r) {
if err := s.skipNoise(r, n); err != nil {
return err
}
continue
}
s.Ngram[len(s.Ngram)-1] = r
s.RdOff += n
return nil
}
}
func (s *scanner) skipNoise(r rune, n int) error {
s.RdOff += n
if r == '\n' {
s.LineNo++
s.RdOff = 0
}
for i := 0; i < len(s.Ngram)-1; i++ {
r, n, err := s.rd.ReadRune()
if err != nil {
return err
}
if isNoise(r) {
return s.skipNoise(r, n)
}
s.Ngram[i] = r
s.RdOff += n
}
return nil
}
func isNoise(r rune) bool {
return unicode.IsSpace(r) || unicode.IsSymbol(r) || unicode.IsPunct(r)
}
package main
import (
"bufio"
"flag"
"fmt"
"io"
"log"
"os"
"strings"
"unicode"
)
const (
ngramSize = 3
)
type ngram struct {
// line number
lineno uint16
// number of bytes from new line, because of unicode
linepos uint8
}
type Index struct {
ngrams map[string][]*ngram
}
func NewIndex() *Index {
return &Index{ngrams: make(map[string][]*ngram)}
}
func (idx *Index) Index(r io.Reader) error {
s := newScanner(r, ngramSize)
for {
if err := s.Next(); err != nil {
if err == io.EOF {
return nil
}
return err
}
ngs := string(s.Ngram)
ngrams, ok := idx.ngrams[ngs]
if !ok {
ngrams = make([]*ngram, 0)
}
ng := &ngram{
lineno: uint16(s.LineNo),
linepos: uint8(s.RdOff - len(ngs)),
}
idx.ngrams[ngs] = append(ngrams, ng)
}
}
func (idx *Index) String() string {
lines := make([]string, 0, len(idx.ngrams))
for s, ngrams := range idx.ngrams {
chunks := []string{s}
for _, ng := range ngrams {
chunks = append(chunks, fmt.Sprintf("%d:%d", ng.lineno, ng.linepos))
}
lines = append(lines, strings.Join(chunks, " "))
}
return strings.Join(lines, "\n")
}
func (idx *Index) addNgram(lineno uint16, linepos uint8, gram string) {
ngrams, ok := idx.ngrams[gram]
if !ok {
ngrams = make([]*ngram, 0)
}
idx.ngrams[gram] = append(ngrams, &ngram{lineno: lineno, linepos: linepos})
}
func isNoise(r rune) bool {
return unicode.IsSpace(r) || unicode.IsSymbol(r) || unicode.IsPunct(r)
}
type scanner struct {
rd *bufio.Reader
Ngram []rune
LineNo int
RdOff int
}
func newScanner(r io.Reader, ngramSize int) *scanner {
s := &scanner{
rd: bufio.NewReader(r),
Ngram: make([]rune, ngramSize),
LineNo: 0,
RdOff: 0,
}
s.initFill()
return s
}
func (s *scanner) initFill() {
if err := s.skipNoise(' ', 0); err != nil {
return
}
for i := len(s.Ngram) - 1; i > 0; i-- {
s.Ngram[i] = s.Ngram[i-1]
}
}
func (s *scanner) Next() error {
for i := 0; i < len(s.Ngram)-1; i++ {
s.Ngram[i] = s.Ngram[i+1]
}
for {
r, n, err := s.rd.ReadRune()
if err != nil {
return err
}
if isNoise(r) {
if err := s.skipNoise(r, n); err != nil {
return err
}
continue
}
s.Ngram[len(s.Ngram)-1] = r
s.RdOff += n
return nil
}
}
func (s *scanner) skipNoise(r rune, n int) error {
s.RdOff += n
if r == '\n' {
s.LineNo++
s.RdOff = 0
}
for i := 0; i < len(s.Ngram)-1; i++ {
r, n, err := s.rd.ReadRune()
if err != nil {
return err
}
if isNoise(r) {
return s.skipNoise(r, n)
}
s.Ngram[i] = r
s.RdOff += n
}
return nil
}
func main() {
flag.Parse()
index := NewIndex()
for _, src := range flag.Args() {
fd, err := os.Open(src)
if err != nil {
log.Printf("cannot open %s: %s", src, err)
continue
}
defer fd.Close()
if err := index.Index(fd); err != nil {
log.Printf("cannot parse %s: %s", src, err)
}
}
log.Println(index)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment