Skip to content

Instantly share code, notes, and snippets.

@tommy-mor
Created May 7, 2020 03:43
Show Gist options
  • Save tommy-mor/3d659aa6d0b1d9b3488b3b47ab47f0dc to your computer and use it in GitHub Desktop.
Save tommy-mor/3d659aa6d0b1d9b3488b3b47ab47f0dc to your computer and use it in GitHub Desktop.
import re
import sre_parse
import pprint
import random
pprint = pprint.PrettyPrinter(4).pprint
def test_qc(func1, func2):
for x in range(10000):
string = (
bin(random.randint(0, 2**random.randint(2, 25)))
.replace('0b', '')
.replace('0', 'a')
.replace('1', 'b')
)
if func1(string) == func2(string):
pass
else:
print('function %s or %s failed on %s' % (func1.__name__,
func2.__name__, string))
# - Create a DFA for strings over the alphabet {'a', 'b'} that contain the
# string "abba" somewhere in them. Draw this on paper, and work through
# it by hand for a few examples
abba_re = re.compile(r'.*abba.*')
baabaa_re = re.compile(r'.*baabaa.*')
test_qc(lambda s: abba_re.match(s) is None,
lambda s: re.compile('abba').search(s) is None)
def abba_re_fn(s):
return abba_re.match(s) is not None
def baabaa_re_fn(s):
return baabaa_re.match(s) is None
def testabba(func):
assert(func("abba"))
assert(func("aaabba"))
assert(func("aaabbaaaaaa"))
assert(not func("aaabaaaaaa"))
assert(not func("bbaaaaaa"))
assert(not func("aaaa"))
assert(not func(""))
test_qc(abba_re_fn, func)
def testbaabaa(func):
assert(func("abba"))
assert(func("aaabba"))
assert(func("aaabbaaaaaa"))
assert(func("aaabaaaaaa"))
assert(func("bbaaaaaa"))
assert(func("aaaa"))
assert(func(""))
assert(not func("baabaa"))
assert(not func("abaabaaa"))
assert(not func("bbbbbbbaabaabbb"))
assert(func("bbbbbbbababaabbb"))
test_qc(baabaa_re_fn, func)
def dfa_abba(string):
state = 0
while len(string) != 0:
c = string[0]
string = string[1:]
if state == 0:
if c == 'a':
state = 1 # move forward
else:
state = 0
elif state == 1:
if c == 'b':
state = 2
else:
state = 1
elif state == 2:
if c == 'b':
state = 3
else:
state = 1
elif state == 3:
if c == 'a':
state = 4
else:
state = 0
elif state == 4:
state = 4 # transition in place for all inputs
return state == 4
testabba(dfa_abba)
def dfa_abba_better(string):
# the transition table is a function from
# (current state, 'character') -> next state
# so each entry in the dictionary is a line.
trans = {(0, 'a'): 1, (0, 'b'): 0,
(1, 'b'): 2, (1, 'a'): 1,
(2, 'b'): 3, (2, 'a'): 1,
(3, 'a'): 4, (3, 'b'): 0,
(4, 'a'): 4, (4, 'b'): 4}
state = 0
while(len(string) != 0):
c, string = string[0], string[1:]
state = trans.get((state, c))
return state in [4]
testabba(dfa_abba_better)
def dfa_baabaa_not(string):
trans = {(0, 'b'): 1, (0, 'a'): 0,
(1, 'a'): 2, (1, 'b'): 1,
(2, 'a'): 3, (2, 'b'): 1,
(3, 'b'): 4, (3, 'a'): 0,
(4, 'a'): 5, (4, 'b'): 1,
(5, 'a'): 6, (5, 'b'): 1,
(6, 'b'): 6, (6, 'a'): 6}
state = 0
while(len(string) != 0):
c, string = string[0], string[1:]
state = trans[(state, c)]
return state in [0, 1, 2, 3, 4, 5] # feels like cheating
testbaabaa(dfa_baabaa_not)
def dfa_rec(trans, initial, final):
# transition table, initial state, set of final states
def dfa_internal(string, state):
if (len(string) == 0):
return state in final
c, string = string[0], string[1:]
return dfa_internal(string, trans[(state, c)])
return lambda s: dfa_internal(s, initial)
trans_baabaa = {(0, 'b'): 1, (0, 'a'): 0,
(1, 'a'): 2, (1, 'b'): 1,
(2, 'a'): 3, (2, 'b'): 1,
(3, 'b'): 4, (3, 'a'): 0,
(4, 'a'): 5, (4, 'b'): 1,
(5, 'a'): 6, (5, 'b'): 1,
(6, 'b'): 6, (6, 'a'): 6}
trans_abba = {(0, 'a'): 1, (0, 'b'): 0,
(1, 'b'): 2, (1, 'a'): 1,
(2, 'b'): 3, (2, 'a'): 1,
(3, 'a'): 4, (3, 'b'): 0,
(4, 'b'): 4, (4, 'a'): 4}
dfa_abba_rec = dfa_rec(trans_abba, 0, [4])
dfa_baabaa_not_rec = dfa_rec(trans_baabaa, 0, [0, 1, 2, 3, 4, 5])
testbaabaa(dfa_baabaa_not_rec)
testabba(dfa_abba_rec)
# not sure if each function should be a character in the alphabet
# or if each function should be one of the states.
# im going to go with states, because in general there are fewer
def split(s):
return s[0], s[1:]
def dfa_abba_zero(string):
if string is '': return False
c, string = split(string)
if c == 'a':
return dfa_abba_one(string)
else:
return dfa_abba_zero(string)
def dfa_abba_one(string):
if string is '': return False
c, string = split(string)
if c == 'b':
return dfa_abba_two(string)
else:
return dfa_abba_one(string)
def dfa_abba_two(string):
if string is '': return False
c, string = split(string)
if c == 'b':
return dfa_abba_three(string)
else:
return dfa_abba_one(string)
def dfa_abba_three(string):
if string is '': return False
c, string = split(string)
if c == 'a':
return dfa_abba_four(string)
else:
return dfa_abba_zero(string)
def dfa_abba_four(string):
if string is '': return True
_, string = split(string)
return dfa_abba_four(string)
def dfa_abba_mutrec(string):
return dfa_abba_zero(string)
testabba(dfa_abba_mutrec)
#test_qc(abba_re_fn, dfa_abba_mutrec)
#test_qc(abba_re_fn, dfa_abba)
#test_qc(abba_re_fn, dfa_abba_rec)
test_qc(abba_re_fn, dfa_abba_better)
abba_ast = sre_parse.parse(r'ab(aaaa|c)ba')
# I don't feel like writing the sixth one one
# TODO write the NFA and regex
# TODO read the brown homie paper more
# TODO use python metaprogramming to do what his thing does more or less?
# TODO use julia macros to do the brown paper
# Reject, Epsilon, Character, Sequence, Alternation, or Repeat.
# ok notes. big difference is that NFA's have espsilon, which means they don't
# have toconsume a specific input, which means they can choose to go down
# epsilon paths or not.
# transform sre_parse ast into easier format
# my format
# sequence ("SEQ", [items...])
# alternative ("ALT", [items...])
# subpattern(parens) ("SUB", [items...])
# please forgive me
literal = sre_parse.parse('a')[0][0]
subp = sre_parse.parse('(a)')[0][0]
branch = sre_parse.parse('(aa|a)')[0][1][3][1][0]
def transform(ast):
if isinstance(ast, sre_parse.SubPattern):
return ("SEQ", [transform(x) for x in ast.data])
elif isinstance(ast, tuple):
if ast[0] == literal:
return ("CHAR", chr(ast[1]))
elif ast[0] == subp:
return ("SUB", transform(ast[1][3]))
elif ast[0] == branch:
return ("ALT", [transform(x) for x in ast[1][1]])
else:
raise Exception(ast)
else:
raise Exception(ast)
def log(f):
def newfn(*args):
ret = f(*args)
print("%s: %s -> %s" % (f.__name__, ','.join(map(str, args)), ret))
return ret
return newfn
def newstate(trans):
# first new state is 1, 0 is starting state
return max([n for (n, _) in trans.keys()] + [n for n in trans.values()], default=0) + 1
assert(newstate({(0, 'a'): 4, (1, 'a'): 3}) == 5)
assert(newstate({}) == 1)
def construct_char(fromstate, char, trans):
# each char gets its own state.
state = newstate(trans)
if(fromstate == state):
print(fromstate, state)
assert(False)
# adds a transition from fromstate to our state
trans[(fromstate, char)] = state
# TODO we must change our runner to map to initial state when theres
# no transition in table
# (go to initial for finding any string, go to false for matching)
# returns the updated dict (not sure if mutation lets us avoid this)
# and our state
return (state, trans)
def construct_seq(fromstate, listofthings, trans):
# weaves together a list of parsers
# could be done as a recursive function, loop works fine
for thing in listofthings:
(fromstate, trans) = construct(fromstate, thing, trans)
return (fromstate, trans)
def construct_alt(fromstate, listofalts, transto be able ):
# produces nodes on top of eachother
# /> c \>
# a -> b -> -> d -> -> f
# \> e />
tostates = []
for alt in listofalts:
(tostate, trans) = construct(fromstate, alt, trans)
tostates.append(tostate)
# ok this is fucked. I need iron out what I want the argument/return value to be
# think of a couple examples. can't rn
for tostate
return (fromstate, trans)
def construct(fromstate, ast, trans):
if ast[0] == "SEQ":
return construct_seq(fromstate, ast[1], trans)
elif ast[0] == "CHAR":
return construct_char(fromstate, ast[1], trans)
elif ast[0] == "ALT":
return construct_alt(fromstate, ast[1], trans)
else:
raise Exception(ast[0])
def dfa_create(ast):
initial = 0
trans = {}
(fromstate, finaltrans) = construct(initial, ast, trans)
pprint(finaltrans)
# test if python semantics is doing what I think
assert(trans == finaltrans)
final = [fromstate] # not sure if right
return (finaltrans, initial, final)
# adapted from code above, changed so that in can handle the case where
# there is no transition. will update later to be able to handle NDA's
# then handle
def dfa_run(trans, initial, final):
# transition table, initial state, set of final states
def dfa_internal(string, state):
if (len(string) == 0):
return state in final
c, string = string[0], string[1:]
# by default, go to state zero
ret = dfa_internal(string, trans.get((state, c), None))
if ret is None:
return False # we reached a dead path
else:
return ret
return lambda s: dfa_internal(s, initial)
def comp(string):
# parse regex
res = transform(sre_parse.parse(string))
pprint(res)
# create transition table, initial state, final state
# create function that runs transition tab
trans, initial, final = dfa_create(res)
return dfa_run(trans, initial, final)
def test_regex(pattern):
test_qc(lambda s: re.compile(pattern).match(s) is not None, comp(pattern))
test_regex('abba')
#test_qc(a, abba_re_fn)
# QUESTION. i can see there being a compiler from matching re to DFA.
# is there a compiler from seaching RE to DFA?
# could you make a compiler for more complex strings than abba?
# idea: maybe nfa->dfa is that anytime there are two epsilon nodes going to a different states, they must actually go to a new, same state, until they actually difference
# like the backtracking i was imagining with long words with lots of reperated letters, that idea is actually what the algorithm must emulate.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment