tommy-mor · May 7, 2020 03:43
diff --git a/dfa.py b/dfa.py
 import re
 import sre_parse
 import pprint
 import random
 pprint = pprint.PrettyPrinter(4).pprint


 def test_qc(func1, func2):
    for x in range(10000):
        string = (
            bin(random.randint(0, 2**random.randint(2, 25)))
            .replace('0b', '')
            .replace('0', 'a')
            .replace('1', 'b')
        )
        if func1(string) == func2(string):
            pass
        else:
            print('function %s or %s failed on %s' % (func1.__name__,
                                                      func2.__name__, string))

 # - Create a DFA for strings over the alphabet {'a', 'b'} that contain the
 # string "abba" somewhere in them.  Draw this on paper, and work through
 # it by hand for a few examples


 abba_re = re.compile(r'.*abba.*')
 baabaa_re = re.compile(r'.*baabaa.*')

 test_qc(lambda s: abba_re.match(s) is None,
        lambda s: re.compile('abba').search(s) is None)


 def abba_re_fn(s):
    return abba_re.match(s) is not None


 def baabaa_re_fn(s):
    return baabaa_re.match(s) is None


 def testabba(func):
    assert(func("abba"))
    assert(func("aaabba"))
    assert(func("aaabbaaaaaa"))
    assert(not func("aaabaaaaaa"))
    assert(not func("bbaaaaaa"))
    assert(not func("aaaa"))
    assert(not func(""))
    test_qc(abba_re_fn, func)


 def testbaabaa(func):
    assert(func("abba"))
    assert(func("aaabba"))
    assert(func("aaabbaaaaaa"))
    assert(func("aaabaaaaaa"))
    assert(func("bbaaaaaa"))
    assert(func("aaaa"))
    assert(func(""))
    assert(not func("baabaa"))
    assert(not func("abaabaaa"))
    assert(not func("bbbbbbbaabaabbb"))
    assert(func("bbbbbbbababaabbb"))
    test_qc(baabaa_re_fn, func)


 def dfa_abba(string):
    state = 0
    while len(string) != 0:
        c = string[0]
        string = string[1:]
        if state == 0:
            if c == 'a':
                state = 1  # move forward
            else:
                state = 0
        elif state == 1:
            if c == 'b':
                state = 2
            else:
                state = 1
        elif state == 2:
            if c == 'b':
                state = 3
            else:
                state = 1
        elif state == 3:
            if c == 'a':
                state = 4
            else:
                state = 0
        elif state == 4:
            state = 4  # transition in place for all inputs
    return state == 4


 testabba(dfa_abba)


 def dfa_abba_better(string):
    # the transition table is a function from
    # (current state, 'character') -> next state
    # so each entry in the dictionary is a line.
    trans = {(0, 'a'): 1, (0, 'b'): 0,
             (1, 'b'): 2, (1, 'a'): 1,
             (2, 'b'): 3, (2, 'a'): 1,
             (3, 'a'): 4, (3, 'b'): 0,
             (4, 'a'): 4,  (4, 'b'): 4}

    state = 0
    while(len(string) != 0):
        c, string = string[0], string[1:]
        state = trans.get((state, c))

    return state in [4]


 testabba(dfa_abba_better)


 def dfa_baabaa_not(string):
    trans = {(0, 'b'): 1, (0, 'a'): 0,
             (1, 'a'): 2, (1, 'b'): 1,
             (2, 'a'): 3, (2, 'b'): 1,
             (3, 'b'): 4, (3, 'a'): 0,
             (4, 'a'): 5, (4, 'b'): 1,
             (5, 'a'): 6, (5, 'b'): 1,
             (6, 'b'): 6, (6, 'a'): 6}

    state = 0
    while(len(string) != 0):
        c, string = string[0], string[1:]
        state = trans[(state, c)]

    return state in [0, 1, 2, 3, 4, 5]  # feels like cheating


 testbaabaa(dfa_baabaa_not)


 def dfa_rec(trans, initial, final):
    # transition table, initial state, set of final states
    def dfa_internal(string, state):
        if (len(string) == 0):
            return state in final

        c, string = string[0], string[1:]
        return dfa_internal(string, trans[(state, c)])
    return lambda s: dfa_internal(s, initial)


 trans_baabaa = {(0, 'b'): 1, (0, 'a'): 0,
                (1, 'a'): 2, (1, 'b'): 1,
                (2, 'a'): 3, (2, 'b'): 1,
                (3, 'b'): 4, (3, 'a'): 0,
                (4, 'a'): 5, (4, 'b'): 1,
                (5, 'a'): 6, (5, 'b'): 1,
                (6, 'b'): 6, (6, 'a'): 6}

 trans_abba = {(0, 'a'): 1, (0, 'b'): 0,
              (1, 'b'): 2, (1, 'a'): 1,
              (2, 'b'): 3, (2, 'a'): 1,
              (3, 'a'): 4, (3, 'b'): 0,
              (4, 'b'): 4, (4, 'a'): 4}

 dfa_abba_rec = dfa_rec(trans_abba, 0, [4])
 dfa_baabaa_not_rec = dfa_rec(trans_baabaa, 0, [0, 1, 2, 3, 4, 5])


 testbaabaa(dfa_baabaa_not_rec)
 testabba(dfa_abba_rec)

 # not sure if each function should be a character in the alphabet
 # or if each function should be one of the states.
 # im going to go with states, because in general there are fewer


 def split(s):
    return s[0], s[1:]


 def dfa_abba_zero(string):
    if string is '': return False
    c, string = split(string)
    if c == 'a':
        return dfa_abba_one(string)
    else:
        return dfa_abba_zero(string)


 def dfa_abba_one(string):
    if string is '': return False
    c, string = split(string)
    if c == 'b':
        return dfa_abba_two(string)
    else:
        return dfa_abba_one(string)


 def dfa_abba_two(string):
    if string is '': return False
    c, string = split(string)
    if c == 'b':
        return dfa_abba_three(string)
    else:
        return dfa_abba_one(string)


 def dfa_abba_three(string):
    if string is '': return False
    c, string = split(string)
    if c == 'a':
        return dfa_abba_four(string)
    else:
        return dfa_abba_zero(string)


 def dfa_abba_four(string):
    if string is '': return True
    _, string = split(string)
    return dfa_abba_four(string)


 def dfa_abba_mutrec(string):
    return dfa_abba_zero(string)


 testabba(dfa_abba_mutrec)



 #test_qc(abba_re_fn, dfa_abba_mutrec)
 #test_qc(abba_re_fn, dfa_abba)
 #test_qc(abba_re_fn, dfa_abba_rec)
 test_qc(abba_re_fn, dfa_abba_better)


 abba_ast = sre_parse.parse(r'ab(aaaa|c)ba')


 # I don't feel like writing the sixth one one

 # TODO write the NFA and regex

 # TODO read the brown homie paper more
 # TODO use python metaprogramming to do what his thing does more or less?

 # TODO use julia macros to do the brown paper

 # Reject, Epsilon, Character, Sequence, Alternation, or Repeat.

 # ok notes. big difference is that NFA's have espsilon, which means they don't
 # have toconsume a specific input, which means they can choose to go down
 # epsilon paths or not.

 # transform sre_parse ast into easier format
 # my format

 # sequence ("SEQ", [items...])
 # alternative ("ALT", [items...])
 # subpattern(parens) ("SUB", [items...])

 # please forgive me
 literal = sre_parse.parse('a')[0][0]
 subp = sre_parse.parse('(a)')[0][0]
 branch = sre_parse.parse('(aa|a)')[0][1][3][1][0]


 def transform(ast):
    if isinstance(ast, sre_parse.SubPattern):
        return ("SEQ", [transform(x) for x in ast.data])
    elif isinstance(ast, tuple):
        if ast[0] == literal:
            return ("CHAR", chr(ast[1]))
        elif ast[0] == subp:
            return ("SUB", transform(ast[1][3]))
        elif ast[0] == branch:
            return ("ALT", [transform(x) for x in ast[1][1]])
        else:
            raise Exception(ast)
    else:
        raise Exception(ast)


 def log(f):
    def newfn(*args):
        ret = f(*args)
        print("%s: %s -> %s" % (f.__name__, ','.join(map(str, args)), ret))
        return ret
    return newfn


 def newstate(trans):
    # first new state is 1, 0 is starting state
    return max([n for (n, _) in trans.keys()] + [n for n in trans.values()], default=0) + 1


 assert(newstate({(0, 'a'): 4, (1, 'a'): 3}) == 5)
 assert(newstate({}) == 1)


 def construct_char(fromstate, char, trans):
    # each char gets its own state.
    state = newstate(trans)

    if(fromstate == state):
        print(fromstate, state)
        assert(False)
    # adds a transition from fromstate to our state
    trans[(fromstate, char)] = state

    # TODO we must change our runner to map to initial state when theres
    # no transition in table
    # (go to initial for finding any string, go to false for matching)

    # returns the updated dict (not sure if mutation lets us avoid this)
    # and our state
    return (state, trans)


 def construct_seq(fromstate, listofthings, trans):
    # weaves together a list of parsers
    # could be done as a recursive function, loop works fine
    for thing in listofthings:
        (fromstate, trans) = construct(fromstate, thing, trans)

    return (fromstate, trans)


 def construct_alt(fromstate, listofalts, transto be able ):
    # produces nodes on top of eachother
    #           /> c \>
    # a -> b -> -> d -> -> f
    #           \> e />

    tostates = []
    for alt in listofalts:
        (tostate, trans) = construct(fromstate, alt, trans)
        tostates.append(tostate)
    # ok this is fucked. I need iron out what I want the argument/return value to be
    # think of a couple examples. can't rn
    for tostate 

    return (fromstate, trans)


 def construct(fromstate, ast, trans):
    if ast[0] == "SEQ":
        return construct_seq(fromstate, ast[1], trans)
    elif ast[0] == "CHAR":
        return construct_char(fromstate, ast[1], trans)
    elif ast[0] == "ALT":
        return construct_alt(fromstate, ast[1], trans)
    else:
        raise Exception(ast[0])


 def dfa_create(ast):
    initial = 0
    trans = {}
    (fromstate, finaltrans) = construct(initial, ast, trans)

    pprint(finaltrans)

    # test if python semantics is doing what I think
    assert(trans == finaltrans)

    final = [fromstate]  # not sure if right
    return (finaltrans, initial, final)


 # adapted from code above, changed so that in can handle the case where
 # there is no transition. will update later to be able to handle NDA's
 # then handle
 def dfa_run(trans, initial, final):
    # transition table, initial state, set of final states
    def dfa_internal(string, state):
        if (len(string) == 0):
            return state in final

        c, string = string[0], string[1:]

        # by default, go to state zero

        ret = dfa_internal(string, trans.get((state, c), None))
        if ret is None:
            return False  # we reached a dead path
        else:
            return ret

    return lambda s: dfa_internal(s, initial)


 def comp(string):
    # parse regex
    res = transform(sre_parse.parse(string))
    pprint(res)

    # create transition table, initial state, final state

    # create function that runs transition tab
    trans, initial, final = dfa_create(res)

    return dfa_run(trans, initial, final)


 def test_regex(pattern):
    test_qc(lambda s: re.compile(pattern).match(s) is not None, comp(pattern))


 test_regex('abba')
 #test_qc(a, abba_re_fn)

 # QUESTION. i can see there being a compiler from matching re to DFA.
 # is there a compiler from seaching RE to DFA?
 # could you make a compiler for more complex strings than abba?

 # idea: maybe nfa->dfa is that anytime there are two epsilon nodes going to a different states, they must actually go to a new, same state, until they actually difference
 # like the backtracking i was imagining with long words with lots of reperated letters, that idea is actually what the algorithm must emulate.
	import re
	import sre_parse
	import pprint
	import random
	pprint = pprint.PrettyPrinter(4).pprint


	def test_qc(func1, func2):
	for x in range(10000):
	string = (
	bin(random.randint(0, 2**random.randint(2, 25)))
	.replace('0b', '')
	.replace('0', 'a')
	.replace('1', 'b')
	)
	if func1(string) == func2(string):
	pass
	else:
	print('function %s or %s failed on %s' % (func1.__name__,
	func2.__name__, string))

	# - Create a DFA for strings over the alphabet {'a', 'b'} that contain the
	# string "abba" somewhere in them. Draw this on paper, and work through
	# it by hand for a few examples


	abba_re = re.compile(r'.abba.')
	baabaa_re = re.compile(r'.baabaa.')

	test_qc(lambda s: abba_re.match(s) is None,
	lambda s: re.compile('abba').search(s) is None)


	def abba_re_fn(s):
	return abba_re.match(s) is not None


	def baabaa_re_fn(s):
	return baabaa_re.match(s) is None


	def testabba(func):
	assert(func("abba"))
	assert(func("aaabba"))
	assert(func("aaabbaaaaaa"))
	assert(not func("aaabaaaaaa"))
	assert(not func("bbaaaaaa"))
	assert(not func("aaaa"))
	assert(not func(""))
	test_qc(abba_re_fn, func)


	def testbaabaa(func):
	assert(func("abba"))
	assert(func("aaabba"))
	assert(func("aaabbaaaaaa"))
	assert(func("aaabaaaaaa"))
	assert(func("bbaaaaaa"))
	assert(func("aaaa"))
	assert(func(""))
	assert(not func("baabaa"))
	assert(not func("abaabaaa"))
	assert(not func("bbbbbbbaabaabbb"))
	assert(func("bbbbbbbababaabbb"))
	test_qc(baabaa_re_fn, func)


	def dfa_abba(string):
	state = 0
	while len(string) != 0:
	c = string[0]
	string = string[1:]
	if state == 0:
	if c == 'a':
	state = 1 # move forward
	else:
	state = 0
	elif state == 1:
	if c == 'b':
	state = 2
	else:
	state = 1
	elif state == 2:
	if c == 'b':
	state = 3
	else:
	state = 1
	elif state == 3:
	if c == 'a':
	state = 4
	else:
	state = 0
	elif state == 4:
	state = 4 # transition in place for all inputs
	return state == 4


	testabba(dfa_abba)


	def dfa_abba_better(string):
	# the transition table is a function from
	# (current state, 'character') -> next state
	# so each entry in the dictionary is a line.
	trans = {(0, 'a'): 1, (0, 'b'): 0,
	(1, 'b'): 2, (1, 'a'): 1,
	(2, 'b'): 3, (2, 'a'): 1,
	(3, 'a'): 4, (3, 'b'): 0,
	(4, 'a'): 4, (4, 'b'): 4}

	state = 0
	while(len(string) != 0):
	c, string = string[0], string[1:]
	state = trans.get((state, c))

	return state in [4]


	testabba(dfa_abba_better)


	def dfa_baabaa_not(string):
	trans = {(0, 'b'): 1, (0, 'a'): 0,
	(1, 'a'): 2, (1, 'b'): 1,
	(2, 'a'): 3, (2, 'b'): 1,
	(3, 'b'): 4, (3, 'a'): 0,
	(4, 'a'): 5, (4, 'b'): 1,
	(5, 'a'): 6, (5, 'b'): 1,
	(6, 'b'): 6, (6, 'a'): 6}

	state = 0
	while(len(string) != 0):
	c, string = string[0], string[1:]
	state = trans[(state, c)]

	return state in [0, 1, 2, 3, 4, 5] # feels like cheating


	testbaabaa(dfa_baabaa_not)


	def dfa_rec(trans, initial, final):
	# transition table, initial state, set of final states
	def dfa_internal(string, state):
	if (len(string) == 0):
	return state in final

	c, string = string[0], string[1:]
	return dfa_internal(string, trans[(state, c)])
	return lambda s: dfa_internal(s, initial)


	trans_baabaa = {(0, 'b'): 1, (0, 'a'): 0,
	(1, 'a'): 2, (1, 'b'): 1,
	(2, 'a'): 3, (2, 'b'): 1,
	(3, 'b'): 4, (3, 'a'): 0,
	(4, 'a'): 5, (4, 'b'): 1,
	(5, 'a'): 6, (5, 'b'): 1,
	(6, 'b'): 6, (6, 'a'): 6}

	trans_abba = {(0, 'a'): 1, (0, 'b'): 0,
	(1, 'b'): 2, (1, 'a'): 1,
	(2, 'b'): 3, (2, 'a'): 1,
	(3, 'a'): 4, (3, 'b'): 0,
	(4, 'b'): 4, (4, 'a'): 4}

	dfa_abba_rec = dfa_rec(trans_abba, 0, [4])
	dfa_baabaa_not_rec = dfa_rec(trans_baabaa, 0, [0, 1, 2, 3, 4, 5])


	testbaabaa(dfa_baabaa_not_rec)
	testabba(dfa_abba_rec)

	# not sure if each function should be a character in the alphabet
	# or if each function should be one of the states.
	# im going to go with states, because in general there are fewer


	def split(s):
	return s[0], s[1:]


	def dfa_abba_zero(string):
	if string is '': return False
	c, string = split(string)
	if c == 'a':
	return dfa_abba_one(string)
	else:
	return dfa_abba_zero(string)


	def dfa_abba_one(string):
	if string is '': return False
	c, string = split(string)
	if c == 'b':
	return dfa_abba_two(string)
	else:
	return dfa_abba_one(string)


	def dfa_abba_two(string):
	if string is '': return False
	c, string = split(string)
	if c == 'b':
	return dfa_abba_three(string)
	else:
	return dfa_abba_one(string)


	def dfa_abba_three(string):
	if string is '': return False
	c, string = split(string)
	if c == 'a':
	return dfa_abba_four(string)
	else:
	return dfa_abba_zero(string)


	def dfa_abba_four(string):
	if string is '': return True
	_, string = split(string)
	return dfa_abba_four(string)


	def dfa_abba_mutrec(string):
	return dfa_abba_zero(string)


	testabba(dfa_abba_mutrec)



	#test_qc(abba_re_fn, dfa_abba_mutrec)
	#test_qc(abba_re_fn, dfa_abba)
	#test_qc(abba_re_fn, dfa_abba_rec)
	test_qc(abba_re_fn, dfa_abba_better)


	abba_ast = sre_parse.parse(r'ab(aaaa\|c)ba')


	# I don't feel like writing the sixth one one

	# TODO write the NFA and regex

	# TODO read the brown homie paper more
	# TODO use python metaprogramming to do what his thing does more or less?

	# TODO use julia macros to do the brown paper

	# Reject, Epsilon, Character, Sequence, Alternation, or Repeat.

	# ok notes. big difference is that NFA's have espsilon, which means they don't
	# have toconsume a specific input, which means they can choose to go down
	# epsilon paths or not.

	# transform sre_parse ast into easier format
	# my format

	# sequence ("SEQ", [items...])
	# alternative ("ALT", [items...])
	# subpattern(parens) ("SUB", [items...])

	# please forgive me
	literal = sre_parse.parse('a')[0][0]
	subp = sre_parse.parse('(a)')[0][0]
	branch = sre_parse.parse('(aa\|a)')[0][1][3][1][0]


	def transform(ast):
	if isinstance(ast, sre_parse.SubPattern):
	return ("SEQ", [transform(x) for x in ast.data])
	elif isinstance(ast, tuple):
	if ast[0] == literal:
	return ("CHAR", chr(ast[1]))
	elif ast[0] == subp:
	return ("SUB", transform(ast[1][3]))
	elif ast[0] == branch:
	return ("ALT", [transform(x) for x in ast[1][1]])
	else:
	raise Exception(ast)
	else:
	raise Exception(ast)


	def log(f):
	def newfn(*args):
	ret = f(*args)
	print("%s: %s -> %s" % (f.__name__, ','.join(map(str, args)), ret))
	return ret
	return newfn


	def newstate(trans):
	# first new state is 1, 0 is starting state
	return max([n for (n, _) in trans.keys()] + [n for n in trans.values()], default=0) + 1


	assert(newstate({(0, 'a'): 4, (1, 'a'): 3}) == 5)
	assert(newstate({}) == 1)


	def construct_char(fromstate, char, trans):
	# each char gets its own state.
	state = newstate(trans)

	if(fromstate == state):
	print(fromstate, state)
	assert(False)
	# adds a transition from fromstate to our state
	trans[(fromstate, char)] = state

	# TODO we must change our runner to map to initial state when theres
	# no transition in table
	# (go to initial for finding any string, go to false for matching)

	# returns the updated dict (not sure if mutation lets us avoid this)
	# and our state
	return (state, trans)


	def construct_seq(fromstate, listofthings, trans):
	# weaves together a list of parsers
	# could be done as a recursive function, loop works fine
	for thing in listofthings:
	(fromstate, trans) = construct(fromstate, thing, trans)

	return (fromstate, trans)


	def construct_alt(fromstate, listofalts, transto be able ):
	# produces nodes on top of eachother
	# /> c \>
	# a -> b -> -> d -> -> f
	# \> e />

	tostates = []
	for alt in listofalts:
	(tostate, trans) = construct(fromstate, alt, trans)
	tostates.append(tostate)
	# ok this is fucked. I need iron out what I want the argument/return value to be
	# think of a couple examples. can't rn
	for tostate

	return (fromstate, trans)


	def construct(fromstate, ast, trans):
	if ast[0] == "SEQ":
	return construct_seq(fromstate, ast[1], trans)
	elif ast[0] == "CHAR":
	return construct_char(fromstate, ast[1], trans)
	elif ast[0] == "ALT":
	return construct_alt(fromstate, ast[1], trans)
	else:
	raise Exception(ast[0])


	def dfa_create(ast):
	initial = 0
	trans = {}
	(fromstate, finaltrans) = construct(initial, ast, trans)

	pprint(finaltrans)

	# test if python semantics is doing what I think
	assert(trans == finaltrans)

	final = [fromstate] # not sure if right
	return (finaltrans, initial, final)


	# adapted from code above, changed so that in can handle the case where
	# there is no transition. will update later to be able to handle NDA's
	# then handle
	def dfa_run(trans, initial, final):
	# transition table, initial state, set of final states
	def dfa_internal(string, state):
	if (len(string) == 0):
	return state in final

	c, string = string[0], string[1:]

	# by default, go to state zero

	ret = dfa_internal(string, trans.get((state, c), None))
	if ret is None:
	return False # we reached a dead path
	else:
	return ret

	return lambda s: dfa_internal(s, initial)


	def comp(string):
	# parse regex
	res = transform(sre_parse.parse(string))
	pprint(res)

	# create transition table, initial state, final state

	# create function that runs transition tab
	trans, initial, final = dfa_create(res)

	return dfa_run(trans, initial, final)


	def test_regex(pattern):
	test_qc(lambda s: re.compile(pattern).match(s) is not None, comp(pattern))


	test_regex('abba')
	#test_qc(a, abba_re_fn)

	# QUESTION. i can see there being a compiler from matching re to DFA.
	# is there a compiler from seaching RE to DFA?
	# could you make a compiler for more complex strings than abba?

	# idea: maybe nfa->dfa is that anytime there are two epsilon nodes going to a different states, they must actually go to a new, same state, until they actually difference
	# like the backtracking i was imagining with long words with lots of reperated letters, that idea is actually what the algorithm must emulate.