Created
March 10, 2014 05:00
-
-
Save lestrrat/9459727 to your computer and use it in GitHub Desktop.
俺俺 golang mecabバインディング(mecabで日本語をトークナイズするためのミニマルなヤツ)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package tokenizer | |
/* | |
#cgo CFLAGS: XXX CHANGE ME XXX | |
#cgo LDFLAGS: XXX CHANGE ME XXX
#include <stdlib.h>
#include <mecab.h>
struct mecab_t {}
*/
import "C"
import (
	"errors"
	"unsafe"
)
// ErrNoMoreTokens is the sentinel error that TokenizeMecabIter.Next returns
// when the iterator has no current node left to yield.
var ErrNoMoreTokens = errors.New("No more tokens")
// Iterator yields tokens one at a time.
//
// Next returns the next token together with an error value. The
// implementation in this file signals exhaustion by returning an empty string
// and ErrNoMoreTokens.
type Iterator interface {
	Next() (string, error)
}
type Tokenizer interface { | |
Tokenize(string) Iterator | |
} | |
type TokenizeMecab struct { | |
mecab *C.mecab_t | |
} | |
type TokenizeMecabIter interface { | |
root *C.mecab_node_t | |
current *C.struct_mecab_node_t | |
} | |
func NewMecab(s string) *TokenizeMecab { | |
return &TokenizeMecab { C.mecab_new2(C.CString(s)) } | |
} | |
func (t *TokenizeMecab) Tokenize(input string) *TokenizeMecabIter { | |
p := C.CString(input) | |
node := C.mecab_sparse_tonode(t.mecab, p) | |
return &TokenizeMecabIterator { node, node.next } | |
} | |
func (iter *TokenizeMecabIter) Next() (string, error) { | |
if t.current == nil { | |
return "", ErrNoMoreTokens | |
} | |
node := iter.current | |
iter.current = iter.current.next | |
s := C.GoString(node.surface) | |
return s[:int(node.length)], nil | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment