Last active
January 1, 2016 07:19
-
-
Save momchil-velikov/8111004 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"io" | |
"strings" | |
) | |
const ( | |
None = iota | |
Cat | |
Alt | |
ZeroOrMore | |
OneOrMore | |
ZeroOrOne | |
CharClass | |
CharSeq | |
) | |
type operands [2]*ast | |
type ast struct { | |
op byte | |
txt string | |
operand operands | |
} | |
type parse_error struct { | |
nested error | |
msg string | |
} | |
func (e *parse_error) Error() string { | |
if e.nested == nil { | |
return e.msg | |
} else { | |
return fmt.Sprintf("parse error: %s: %s", e.msg, e.nested.Error()) | |
} | |
} | |
func (t *ast) dot() { | |
if t == nil { | |
return | |
} | |
fmt.Printf("n%p [label=\"", t) | |
switch t.op { | |
case Cat: | |
fmt.Printf("concat") | |
case Alt: | |
fmt.Printf("alter") | |
case ZeroOrMore: | |
fmt.Printf("zero-or-more") | |
case OneOrMore: | |
fmt.Printf("one-or-more") | |
case ZeroOrOne: | |
fmt.Printf("zero-or-one") | |
case CharClass: | |
fmt.Printf("char-class: %s", t.txt) | |
case CharSeq: | |
fmt.Printf("char-seq: %s", t.txt) | |
} | |
fmt.Printf("\" ]\n") | |
if t.operand[0] != nil { | |
fmt.Printf("n%p -> n%p\n", t, t.operand[0]) | |
t.operand[0].dot() | |
} | |
if t.operand[1] != nil { | |
fmt.Printf("n%p -> n%p\n", t, t.operand[1]) | |
t.operand[1].dot() | |
} | |
} | |
func (t *ast) Dot() { | |
fmt.Println("digraph AST {\n") | |
t.dot() | |
fmt.Println("}\n") | |
} | |
func (t *ast) compact() *ast { | |
op0, op1 := t.operand[0], t.operand[1] | |
if op0 != nil { | |
op0 = op0.compact() | |
} | |
if op1 != nil { | |
op1 = op1.compact() | |
} | |
t.operand[0] = op0 | |
t.operand[1] = op1 | |
if op0 == nil || op1 == nil { | |
return t | |
} | |
if t.op == Cat { | |
if op0.op == CharSeq && op1.op == CharSeq { | |
return &ast{CharSeq, op0.txt + op1.txt, operands{nil, nil}} | |
} else if op0.op == CharSeq && op1.op == Cat && op1.operand[0].op == CharSeq { | |
p := &ast{CharSeq, op0.txt + op1.operand[0].txt, operands{nil, nil}} | |
return &ast{Cat, "", operands{p, op1.operand[1]}} | |
} else if op0.op == Cat && op0.operand[1].op == CharSeq && op1.op == CharSeq { | |
p := &ast{CharSeq, op0.operand[1].txt + op1.txt, operands{nil, nil}} | |
return &ast{Cat, "", operands{op0.operand[0], p}} | |
} | |
} | |
return t | |
} | |
// re := concat '|' concat | |
// re := concat | |
// concat := closure concat | |
// concat := closure | |
// closure := term '*' | |
// closure := term '?' | |
// closure := term '+' | |
// closure := term | |
// term := '(' re ')' | |
// term := '[' char-seq ']' | |
// term := char | |
func match(ch byte, r io.ByteScanner) error { | |
c, e := r.ReadByte() | |
if e == nil { | |
if c == ch { | |
return nil | |
} else { | |
r.UnreadByte() | |
return &parse_error{nil, fmt.Sprintf("expected '%c', got '%c'", ch, c)} | |
} | |
} else { | |
return e | |
} | |
} | |
func is_reserved(ch byte) bool { | |
return ch == '|' || ch == '*' || ch == '+' || ch == '?' || ch == '[' || | |
ch == ']' || ch == '(' || ch == ')' | |
} | |
func parse_char_class(r io.ByteScanner) (*ast, error) { | |
txt := []byte{} | |
ch, e := r.ReadByte() | |
for e == nil { | |
if is_reserved(ch) { | |
r.UnreadByte() | |
break | |
} | |
txt = append(txt, ch) | |
ch, e = r.ReadByte() | |
} | |
if len(txt) == 0 { | |
return nil, &parse_error{nil, "Empty character class"} | |
} else { | |
return &ast{CharClass, string(txt), operands{nil, nil}}, nil | |
} | |
} | |
func parse_term(r io.ByteScanner) (*ast, error) { | |
ch, e := r.ReadByte() | |
if e != nil { | |
return nil, e | |
} | |
switch ch { | |
case '(': | |
if t, e := parse_re(r); e != nil { | |
return nil, e | |
} else if e = match(')', r); e != nil { | |
return nil, e | |
} else { | |
return t, nil | |
} | |
case '[': | |
if t, e := parse_char_class(r); e != nil { | |
return nil, e | |
} else if e = match(']', r); e != nil { | |
return nil, e | |
} else { | |
return t, nil | |
} | |
default: | |
if is_reserved(ch) { | |
return nil, &parse_error{nil, | |
fmt.Sprintf("Unexpected character: '%c'", ch)} | |
} else { | |
return &ast{CharSeq, string(ch), operands{nil, nil}}, nil | |
} | |
} | |
} | |
func parse_closure(r io.ByteScanner) (*ast, error) { | |
t, e := parse_term(r) | |
if e != nil { | |
return t, e | |
} | |
ch, e := r.ReadByte() | |
if e != nil { | |
return t, e | |
} | |
switch ch { | |
case '*': | |
return &ast{ZeroOrMore, "", operands{t, nil}}, nil | |
case '+': | |
return &ast{OneOrMore, "", operands{t, nil}}, nil | |
case '?': | |
return &ast{ZeroOrOne, "", operands{t, nil}}, nil | |
default: | |
r.UnreadByte() | |
return t, nil | |
} | |
} | |
func parse_concat(r io.ByteScanner) (*ast, error) { | |
t, e := parse_closure(r) | |
if e != nil { | |
return t, e | |
} | |
var ch byte | |
for ch, e = r.ReadByte(); e == nil; ch, e = r.ReadByte() { | |
r.UnreadByte() | |
if ch != '(' && ch != '[' && is_reserved(ch) { | |
return t, nil | |
} else { | |
p, e := parse_closure(r) | |
if e == nil || e == io.EOF { | |
t = &ast{Cat, "", operands{t, p}} | |
} else { | |
return nil, e | |
} | |
} | |
} | |
return t, e | |
} | |
func parse_re(r io.ByteScanner) (*ast, error) { | |
t, e := parse_concat(r) | |
if e != nil { | |
return t, e | |
} | |
var ch byte | |
for ch, e = r.ReadByte(); e == nil; ch, e = r.ReadByte() { | |
if ch == '|' { | |
p, e := parse_concat(r) | |
if e == nil || e == io.EOF { | |
t = &ast{Alt, "", operands{t, p}} | |
} else { | |
return nil, e | |
} | |
} else { | |
r.UnreadByte() | |
return t, nil | |
} | |
} | |
return t, e | |
} | |
func Parse(r io.ByteScanner) (*ast, error) { | |
t, e := parse_re(r) | |
if e == io.EOF { | |
return t.compact(), nil | |
} else { | |
return nil, e | |
} | |
} | |
type nfa_edge struct { | |
label string | |
dst int | |
} | |
type nfa_state struct { | |
succ []nfa_edge | |
} | |
type nfa struct { | |
start, end int | |
states []nfa_state | |
} | |
func (a *nfa) add_edge(from, to int, label string) { | |
s := append(a.states[from].succ, nfa_edge{label, to}) | |
a.states[from].succ = s | |
} | |
func (a *nfa) add_state() int { | |
s := len(a.states) | |
a.states = append(a.states, nfa_state{[]nfa_edge(nil)}) | |
return s | |
} | |
func (a *nfa) build_nfa(t *ast, start, end int) { | |
switch t.op { | |
case Cat: | |
m := a.add_state() | |
a.build_nfa(t.operand[0], start, m) | |
a.build_nfa(t.operand[1], m, end) | |
case Alt: | |
s0, e0 := a.add_state(), a.add_state() | |
a.build_nfa(t.operand[0], s0, e0) | |
s1, e1 := a.add_state(), a.add_state() | |
a.build_nfa(t.operand[1], s1, e1) | |
a.add_edge(start, s0, "") | |
a.add_edge(start, s1, "") | |
a.add_edge(e0, end, "") | |
a.add_edge(e1, end, "") | |
case ZeroOrMore: | |
s, e := a.add_state(), a.add_state() | |
a.build_nfa(t.operand[0], s, e) | |
a.add_edge(e, s, "") | |
a.add_edge(start, s, "") | |
a.add_edge(e, end, "") | |
a.add_edge(start, end, "") | |
case OneOrMore: | |
a.build_nfa(t.operand[0], start, end) | |
a.add_edge(end, start, "") | |
case ZeroOrOne: | |
s, e := a.add_state(), a.add_state() | |
a.build_nfa(t.operand[0], s, e) | |
a.add_edge(start, s, "") | |
a.add_edge(e, end, "") | |
a.add_edge(start, end, "") | |
case CharClass: | |
a.add_edge(start, end, "["+t.txt+"]") | |
case CharSeq: | |
a.add_edge(start, end, t.txt) | |
} | |
} | |
func (t *ast) BuildNFA() *nfa { | |
a := &nfa{} | |
a.start, a.end = a.add_state(), a.add_state() | |
a.build_nfa(t, a.start, a.end) | |
return a | |
} | |
func (s *nfa_state) dot(n int, a *nfa) { | |
fmt.Printf("n%d", n) | |
if n == a.start { | |
fmt.Printf("[color=green]") | |
} else if n == a.end { | |
fmt.Printf("[color=red]") | |
} | |
fmt.Println("") | |
for _, e := range s.succ { | |
fmt.Printf("n%d -> n%d [label=\"%s\"]\n", n, e.dst, e.label) | |
} | |
} | |
func (a *nfa) Dot() { | |
fmt.Println("digraph NFA { rankdir=LR;") | |
for i, s := range a.states { | |
s.dot(i, a) | |
} | |
fmt.Println("}") | |
} | |
func main() { | |
// t, err := Parse(strings.NewReader("(a|b)+ab")) | |
t, err := Parse(strings.NewReader("((ab|b*)+cd)+")) | |
// t, err := Parse(strings.NewReader("((a*b)c)+cd")) | |
// t, err := Parse(strings.NewReader("d+(.d*)?(E(p|m)?d+)?")) | |
// t, err := Parse(strings.NewReader("[0123456789]+(.[0123456789]*)?(E-?[0123456789]+)?")) | |
if err == nil { | |
t.Dot() | |
a := t.BuildNFA() | |
a.Dot() | |
} else { | |
fmt.Printf("error: %s\n", err.Error()) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment