Created
August 2, 2022 13:12
-
-
Save p7g/d2c99b185059b54f9fb41ebfe4734041 to your computer and use it in GitHub Desktop.
Python regular expressions that can be composed into bigger regular expressions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
This is a really basic and dumb regexp engine to play with composable regular | |
expressions. | |
A Re object is basically just a pointer to some initial state and to some | |
accepting state. These states can each point to other states, forming an NFA-ε, | |
which can then be evaluated directly. | |
The cool thing is that you can just combine Re objects like this: | |
hello_re = literal("hello") | |
world_re = literal("world") | |
space_re = literal(" ") | literal("\t") | |
spaces_re = space_re + space_re.closure() | |
hello_world_re = hello_re + spaces_re + world_re | |
assert hello_world_re.evaluate("hello \t world") | |
""" | |
from __future__ import annotations | |
from collections import defaultdict, deque | |
class _Epsilon: | |
def __repr__(self) -> str: | |
return "ε" | |
epsilon = _Epsilon() | |
class State: | |
def __init__(self, transitions: dict[str | _Epsilon, list[State]]) -> None: | |
self.transitions = defaultdict(list, transitions) | |
def _copy(self, accepting_state_mapping, seen=None) -> State: | |
if self is accepting_state_mapping[0]: | |
return accepting_state_mapping[1] | |
seen = seen or {} | |
copy = State({}) | |
seen[self] = copy | |
for input_, next_states in self.transitions.items(): | |
for next_state in next_states: | |
if next_state in seen: | |
next_state = seen[next_state] | |
else: | |
next_state = seen[next_state] = next_state._copy(accepting_state_mapping, seen) | |
copy.transitions[input_].append(next_state) | |
return copy | |
def __repr__(self) -> str: | |
return f"State({list(self.transitions.keys())})" | |
class Re: | |
def __init__(self, initial_state: State, accepting_state: State): | |
self.initial_state = initial_state | |
self.accepting_state = accepting_state | |
def evaluate(self, text: str) -> bool: | |
states = {self.initial_state} | |
for c in text: | |
new_states: set[State] = set() | |
work = deque(states) | |
while work: | |
state = work.popleft() | |
if c in state.transitions: | |
new_states.update(state.transitions[c]) | |
if epsilon in state.transitions: | |
work.extend(state.transitions[epsilon]) | |
states = new_states | |
work = deque(states) | |
states = set() | |
while work: | |
state = work.popleft() | |
if epsilon in state.transitions: | |
work.extend(state.transitions[epsilon]) | |
else: | |
states.add(state) | |
return self.accepting_state in states | |
def _copy(self) -> Re: | |
accepting_state = State({}) | |
initial_state = self.initial_state._copy((self.accepting_state, accepting_state)) | |
return Re(initial_state, accepting_state) | |
def __or__(self, other): | |
if isinstance(other, str): | |
other = literal(other) | |
elif isinstance(other, Re): | |
pass | |
else: | |
return NotImplemented | |
return alternation(self, other) | |
def __add__(self, other): | |
if isinstance(other, str): | |
other = literal(other) | |
elif isinstance(other, Re): | |
pass | |
else: | |
return NotImplemented | |
return concatenation(self, other) | |
def closure(self) -> Re: | |
return closure(self) | |
def literal(text: str) -> Re: | |
current_state = initial_state = State({}) | |
for c in text: | |
next_state = State({}) | |
current_state.transitions[c] = [next_state] | |
current_state = next_state | |
return Re(initial_state, current_state) | |
def alternation(a: Re, b: Re) -> Re: | |
a = a._copy() | |
b = b._copy() | |
initial_state = State( | |
{ | |
epsilon: [ | |
a.initial_state, | |
b.initial_state, | |
] | |
}, | |
) | |
accepting_state = State({}) | |
a.accepting_state.transitions[epsilon] = [accepting_state] | |
b.accepting_state.transitions[epsilon] = [accepting_state] | |
return Re(initial_state, accepting_state) | |
def concatenation(a: Re, b: Re) -> Re: | |
a = a._copy() | |
b = b._copy() | |
initial_state = State({epsilon: [a.initial_state]}) | |
a.accepting_state.transitions[epsilon] = [b.initial_state] | |
accepting_state = State({}) | |
b.accepting_state.transitions[epsilon] = [accepting_state] | |
return Re(initial_state, accepting_state) | |
def closure(a: Re) -> Re: | |
a = a._copy() | |
accepting_state = State({}) | |
initial_state = State({epsilon: [a.initial_state, accepting_state]}) | |
a.accepting_state.transitions[epsilon] = [accepting_state, a.initial_state] | |
return Re(initial_state, accepting_state) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment