Skip to content

Instantly share code, notes, and snippets.

@vishwakt
Forked from taylor/regexnfaparser.py
Created August 28, 2014 06:55
Show Gist options
  • Save vishwakt/7d1be367ad980b8c7f26 to your computer and use it in GitHub Desktop.
Save vishwakt/7d1be367ad980b8c7f26 to your computer and use it in GitHub Desktop.
"""
From: https://github.com/darius/sketchbook/blob/master/regex/integrated1.py
Integrate the right-to-left top-down operator-precedence parser with
the simplest terminating NFA code.
"""
def match(re, s): return run(prepare(re), s)
def run(states, s):
for c in s:
states = set.union(*[state(c) for state in states])
return accepting_state in states
def accepting_state(c): return set()
def expecting_state(char, k): return lambda c: k(set()) if c == char else set()
def state_node(state): return lambda seen: set([state])
def alt_node(k1, k2): return lambda seen: k1(seen) | k2(seen)
def loop_node(k, make_k):
def loop(seen):
if loop in seen: return set()
seen.add(loop)
return k(seen) | looping(seen)
looping = make_k(loop)
return loop
def prepare(re):
ts = list(re)
def parse_expr(precedence, k):
rhs = parse_factor(k)
while ts:
if ts[-1] == '(': break
prec = 2 if ts[-1] == '|' else 4
if prec < precedence: break
if chomp('|'):
rhs = alt_node(parse_expr(prec + 1, k), rhs)
else:
rhs = parse_expr(prec + 1, rhs)
return rhs
def parse_factor(k):
if not ts or ts[-1] in '|(':
return k
elif chomp(')'):
e = parse_expr(0, k)
assert chomp('(')
return e
elif chomp('*'):
return loop_node(k, lambda loop: parse_expr(6, loop))
else:
return state_node(expecting_state(ts.pop(), k))
def chomp(token):
matches = (ts and ts[-1] == token)
if matches: ts.pop()
return matches
k = parse_expr(0, state_node(accepting_state))
assert not ts
return k(set())
## match('', '')
#. True
## match('', 'a')
#. False
## match('a', '')
#. False
## match('a', 'a')
#. True
## match('a', 'aa')
#. False
## match('ab', '')
#. False
## match('ab', 'ab')
#. True
## match('abc', 'ab')
#. False
## match('abc', 'abc')
#. True
## match('abc', 'abcd')
#. False
## match('a|b|c', '')
#. False
## match('a|b|c', 'a')
#. True
## match('a|b|c', 'b')
#. True
## match('a|b|c', 'c')
#. True
## match('a|b|c', 'd')
#. False
## match('ab||c', '')
#. True
## match('ab||c', 'a')
#. False
## match('ab||c', 'b')
#. False
## match('ab||c', 'c')
#. True
## match('ab||c', 'ab')
#. True
## match('a*', '')
#. True
## match('a*', 'a')
#. True
## match('a*', 'aa')
#. True
## match('a*', 'aab')
#. False
## match('ab*c', '')
#. False
## match('ab*c', 'ab')
#. False
## match('ab*c', 'ac')
#. True
## match('ab*c', 'abc')
#. True
## match('ab*c', 'abbbbc')
#. True
## match('ab*c', 'abbbbd')
#. False
## match('a(bc|d|)*e', '')
#. False
## match('a(bc|d|)*e', 'a')
#. False
## match('a(bc|d|)*e', 'ae')
#. True
## match('a(bc|d|)*e', 'abe')
#. False
## match('a(bc|d|)*e', 'ade')
#. True
## match('a(bc|d|)*e', 'abce')
#. True
## match('a(bc|d|)*e', 'abcde')
#. True
## match('(ab*c)', '')
#. False
## match('(ab*c)', 'abbbbc')
#. True
## match('()', '')
#. True
## match('()', 'a')
#. False
## match('()', 'a')
#. False
## match('()*', 'a')
#. False
## match('()*', '')
#. True
## match('a**', 'aaa')
#. True
## match('a**', 'b')
#. False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment