Last active
February 2, 2018 11:52
-
-
Save pervognsen/b95f6eb73ded6abb7a99e9c73ea16c7e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def record(name, fields=''): | |
fields = fields.split() | |
class cls: | |
def __init__(self, *args): | |
for key, value in zip(fields, args): | |
self.__dict__[key] = value | |
def __repr__(self): | |
return '%s(%s)' % (name, ', '.join(repr(self.__dict__[key]) for key in fields)) | |
cls.__name__ = name | |
return cls | |
def memoize1(f): | |
memo = {} | |
def f_memo(x): | |
try: x_memo = memo[x] | |
except: x_memo = memo[x] = f(x) | |
return x_memo | |
f_memo.memo = memo | |
return f_memo | |
def memoize2(f): | |
memo = {} | |
def f_memo(x, y): | |
try: x_memo = memo[x] | |
except: x_memo = memo[x] = {} | |
try: xy_memo = x_memo[y] | |
except: xy_memo = x_memo[y] = f(x, y) | |
return xy_memo | |
f_memo.memo = memo | |
return f_memo | |
null = record('null')() | |
epsilon = record('epsilon')() | |
Alt = record('alt', 'e1 e2') | |
Seq = record('seq', 'e1 e2') | |
Star = record('star', 'e1') | |
@memoize2 | |
def alt(e1, e2): | |
if id(e2) < id(e1): | |
return alt(e2, e1) | |
elif e1 is null: | |
return e2 | |
elif e2 is null: | |
return e1 | |
elif e1 is e2: | |
return e1 | |
elif isinstance(e1, Alt): | |
return alt(e1.e1, alt(e1.e2, e2)) | |
else: | |
return Alt(e1, e2) | |
@memoize2 | |
def seq(e1, e2): | |
if e1 is null or e2 is null: | |
return null | |
elif e1 is epsilon: | |
return e2 | |
elif e2 is epsilon: | |
return e1 | |
elif isinstance(e1, Seq): | |
return seq(e1.e1, seq(e1.e2, e2)) | |
else: | |
return Seq(e1, e2) | |
@memoize1 | |
def star(e): | |
if e is null or e is epsilon: | |
return epsilon | |
elif isinstance(e, Star): | |
return e.e1 | |
else: | |
return Star(e) | |
def opt(e): | |
return alt(epsilon, e) | |
def plus(e): | |
return seq(e, star(e)) | |
@memoize1 | |
def nullable(e): | |
if e is null or isinstance(e, str): | |
return False | |
elif e is epsilon or isinstance(e, Star): | |
return True | |
elif isinstance(e, Alt): | |
return nullable(e.e1) or nullable(e.e2) | |
elif isinstance(e, Seq): | |
return nullable(e.e1) and nullable(e.e2) | |
@memoize2 | |
def derive(e, c): | |
if e is null or e is epsilon: | |
return null | |
elif isinstance(e, str): | |
return epsilon if e == c else null | |
elif isinstance(e, Alt): | |
return alt(derive(e.e1, c), derive(e.e2, c)) | |
elif isinstance(e, Seq): | |
d = seq(derive(e.e1, c), e.e2) | |
return alt(d, derive(e.e2, c)) if nullable(e.e1) else d | |
elif isinstance(e, Star): | |
return seq(derive(e.e1, c), e) | |
def re_match(e, s): | |
for c in s: | |
e = derive(e, c) | |
return nullable(e) | |
def make_dfa(e): | |
transitions = [] | |
final = [] | |
def new_state(): | |
transitions.append(None) | |
final.append(None) | |
return len(transitions)-1 | |
visited = {} | |
def visit(e): | |
if e not in visited: | |
state = visited[e] = new_state() | |
transitions[state] = [visit(derive(e, chr(n))) for n in range(256)] | |
final[state] = int(nullable(e)) | |
return visited[e] | |
visit(e) | |
return transitions, final | |
def dfa_match(dfa, s): | |
transitions, final = dfa | |
state = 0 | |
for c in s: | |
state = transitions[state][ord(c)] | |
return final[state] | |
def dfa_generate_c(dfa): | |
transitions, final = dfa | |
states = range(len(transitions)) | |
print("%s transitions[][256] = {" % ("char" if len(states) <= 256 else "short")) | |
for state in states: | |
print(" {%s}," % ','.join(str(transitions[state][n]) for n in range(256))) | |
print("};") | |
print("char final[] = {%s};" % ', '.join(str(final[state]) for state in states)) | |
from timeit import timeit | |
s = "x"*10_000_000 + "y" | |
e = seq(plus(seq(plus('x'), plus('x'))), 'y') | |
print(timeit(lambda: print(re_match(e, s)), number=1)) | |
dfa = make_dfa(e) | |
print(timeit(lambda: print(dfa_match(dfa, s)), number=1)) | |
dfa_generate_c(dfa) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Haha, I know what this is!
I was looking through your gists since I saw your tweet re conic sections https://t.co/Utq9RtmvWx
I really like the application of derivatives in parsing, whether or not they are a performant solution