Created
December 9, 2011 18:18
-
-
Save taylor/1452682 to your computer and use it in GitHub Desktop.
python regex to (simple) NFA parsing code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
From: https://github.com/darius/sketchbook/blob/master/regex/integrated1.py | |
Integrate the right-to-left top-down operator-precedence parser with | |
the simplest terminating NFA code. | |
""" | |
def match(re, s): return run(prepare(re), s) | |
def run(states, s): | |
for c in s: | |
states = set.union(*[state(c) for state in states]) | |
return accepting_state in states | |
def accepting_state(c): return set() | |
def expecting_state(char, k): return lambda c: k(set()) if c == char else set() | |
def state_node(state): return lambda seen: set([state]) | |
def alt_node(k1, k2): return lambda seen: k1(seen) | k2(seen) | |
def loop_node(k, make_k): | |
def loop(seen): | |
if loop in seen: return set() | |
seen.add(loop) | |
return k(seen) | looping(seen) | |
looping = make_k(loop) | |
return loop | |
def prepare(re): | |
ts = list(re) | |
def parse_expr(precedence, k): | |
rhs = parse_factor(k) | |
while ts: | |
if ts[-1] == '(': break | |
prec = 2 if ts[-1] == '|' else 4 | |
if prec < precedence: break | |
if chomp('|'): | |
rhs = alt_node(parse_expr(prec + 1, k), rhs) | |
else: | |
rhs = parse_expr(prec + 1, rhs) | |
return rhs | |
def parse_factor(k): | |
if not ts or ts[-1] in '|(': | |
return k | |
elif chomp(')'): | |
e = parse_expr(0, k) | |
assert chomp('(') | |
return e | |
elif chomp('*'): | |
return loop_node(k, lambda loop: parse_expr(6, loop)) | |
else: | |
return state_node(expecting_state(ts.pop(), k)) | |
def chomp(token): | |
matches = (ts and ts[-1] == token) | |
if matches: ts.pop() | |
return matches | |
k = parse_expr(0, state_node(accepting_state)) | |
assert not ts | |
return k(set()) | |
## match('', '') | |
#. True | |
## match('', 'a') | |
#. False | |
## match('a', '') | |
#. False | |
## match('a', 'a') | |
#. True | |
## match('a', 'aa') | |
#. False | |
## match('ab', '') | |
#. False | |
## match('ab', 'ab') | |
#. True | |
## match('abc', 'ab') | |
#. False | |
## match('abc', 'abc') | |
#. True | |
## match('abc', 'abcd') | |
#. False | |
## match('a|b|c', '') | |
#. False | |
## match('a|b|c', 'a') | |
#. True | |
## match('a|b|c', 'b') | |
#. True | |
## match('a|b|c', 'c') | |
#. True | |
## match('a|b|c', 'd') | |
#. False | |
## match('ab||c', '') | |
#. True | |
## match('ab||c', 'a') | |
#. False | |
## match('ab||c', 'b') | |
#. False | |
## match('ab||c', 'c') | |
#. True | |
## match('ab||c', 'ab') | |
#. True | |
## match('a*', '') | |
#. True | |
## match('a*', 'a') | |
#. True | |
## match('a*', 'aa') | |
#. True | |
## match('a*', 'aab') | |
#. False | |
## match('ab*c', '') | |
#. False | |
## match('ab*c', 'ab') | |
#. False | |
## match('ab*c', 'ac') | |
#. True | |
## match('ab*c', 'abc') | |
#. True | |
## match('ab*c', 'abbbbc') | |
#. True | |
## match('ab*c', 'abbbbd') | |
#. False | |
## match('a(bc|d|)*e', '') | |
#. False | |
## match('a(bc|d|)*e', 'a') | |
#. False | |
## match('a(bc|d|)*e', 'ae') | |
#. True | |
## match('a(bc|d|)*e', 'abe') | |
#. False | |
## match('a(bc|d|)*e', 'ade') | |
#. True | |
## match('a(bc|d|)*e', 'abce') | |
#. True | |
## match('a(bc|d|)*e', 'abcde') | |
#. True | |
## match('(ab*c)', '') | |
#. False | |
## match('(ab*c)', 'abbbbc') | |
#. True | |
## match('()', '') | |
#. True | |
## match('()', 'a') | |
#. False | |
## match('()', 'a') | |
#. False | |
## match('()*', 'a') | |
#. False | |
## match('()*', '') | |
#. True | |
## match('a**', 'aaa') | |
#. True | |
## match('a**', 'b') | |
#. False |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment