Last active
November 15, 2024 15:29
-
-
Save dmitmel/a3be2af47a5d4e7ea7ee2087a4ba016f to your computer and use it in GitHub Desktop.
RE => NFA => DFA => min-DFA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import OrderedDict | |
from dataclasses import dataclass, field | |
from enum import Enum | |
import random | |
import sys | |
import typing as T | |
import graphviz | |
EPSILON = "\u03b5" | |
EMPTY_SET = "\u2205" | |
CENTERED_ASTERISK = "\u2217" | |
THIN_SPACE = "\u2009" | |
HAIR_SPACE = "\u200a" | |
FONT_SIZE = 12 | |
PLUS_IS_UNION = True | |
RENDER_GRAPHS = False | |
def add_visual_attrs_to_graph( | |
graph: graphviz.Digraph | graphviz.Graph, font_size: int = FONT_SIZE | |
) -> None: | |
# <https://tex.stackexchange.com/a/642722> | |
# <https://www.overleaf.com/learn/latex/Questions%2FWhich_OTF_or_TTF_fonts_are_supported_via_fontspec%3F#Serif_fonts> | |
font_attrs = {"fontsize": f"{font_size}pt", "fontname": "CMU Serif"} | |
graph.attr("graph", font_attrs, margin="0") | |
graph.attr("node", font_attrs) | |
graph.attr("edge", font_attrs) | |
def format_symbol(text: str): | |
return f"<I>{text}</I>" if not text.isnumeric() else text | |
@dataclass | |
class RegexAtom: | |
start_pos: int | |
end_pos: int | |
grouped: bool = field(default=False, init=False) | |
@dataclass | |
class RegexSequence(RegexAtom): | |
atoms: list[RegexAtom] | |
@dataclass | |
class RegexUnion(RegexAtom): | |
atoms: list[RegexAtom] | |
@dataclass | |
class RegexRepetition(RegexAtom): | |
atom: RegexAtom | |
zero: bool | |
@dataclass | |
class RegexSymbol(RegexAtom): | |
character: str | |
class RegexParser: | |
# expression ::= union ; | |
# union ::= sequence ( "|" sequence )* ; | |
# sequence ::= ( qualified-atom )* ; | |
# qualified-atom ::= atom ( "*" | "+" )? ; | |
# atom ::= symbol | group ; | |
# group ::= "(" expression ")" ; | |
class Token(Enum): | |
GROUP_START = 1 | |
GROUP_END = 2 | |
UNION = 3 | |
ONE_OR_MORE = 4 | |
ZERO_OR_MORE = 5 | |
LITERAL = 6 | |
METACHARACTER_TOKENS: dict[str, Token] = { | |
"(": Token.GROUP_START, | |
")": Token.GROUP_END, | |
"|": Token.UNION if not PLUS_IS_UNION else Token.LITERAL, | |
"+": Token.ONE_OR_MORE if not PLUS_IS_UNION else Token.UNION, | |
"*": Token.ZERO_OR_MORE, | |
} | |
def __init__(self, text: str) -> None: | |
self.text = text | |
self.pos = 0 | |
def parse(self) -> RegexAtom: | |
atom = self.parse_union() | |
if self.pos != len(self.text): | |
raise Exception("Unexpected input") | |
return atom | |
def parse_union(self) -> RegexUnion | RegexAtom: | |
start_pos = self.pos | |
atoms: list[RegexAtom] = [self.parse_sequence()] | |
while self.match(self.Token.UNION): | |
atoms.append(self.parse_sequence()) | |
return RegexUnion(start_pos, self.pos, atoms) if len(atoms) > 1 else atoms[0] | |
def parse_sequence(self) -> RegexSequence | RegexAtom: | |
start_pos = self.pos | |
atoms: list[RegexAtom] = [] | |
while atom := self.parse_quantified_atom(): | |
atoms.append(atom) | |
return RegexSequence(start_pos, self.pos, atoms) if len(atoms) != 1 else atoms[0] | |
def parse_quantified_atom(self) -> RegexRepetition | RegexAtom | None: | |
start_pos = self.pos | |
atom = self.parse_atom() | |
if char := self.match(self.Token.ZERO_OR_MORE, self.Token.ONE_OR_MORE): | |
if atom is None: | |
raise Exception("Nothing to repeat") | |
return RegexRepetition(start_pos, self.pos, atom, zero=(char == "*")) | |
else: | |
return atom | |
def parse_atom(self) -> RegexSymbol | RegexAtom | None: | |
start_pos = self.pos | |
if matched_char := self.match(self.Token.LITERAL): | |
return RegexSymbol(start_pos, self.pos, matched_char) | |
elif self.match(self.Token.GROUP_START): | |
atom = self.parse_union() | |
atom.grouped = True | |
if not self.match(self.Token.GROUP_END): | |
raise Exception("Expected ')'") | |
return atom | |
else: | |
return None | |
def match(self, *expected_tokens: Token) -> str | None: | |
if 0 <= self.pos < len(self.text): | |
char = self.text[self.pos] | |
token = self.METACHARACTER_TOKENS.get(char, self.Token.LITERAL) | |
if token in expected_tokens: | |
self.pos += 1 | |
return char | |
return None | |
@dataclass | |
class NFAState: | |
name: int | |
chain_group: int | |
@dataclass | |
class NFATransition: | |
from_state: NFAState | |
to_state: NFAState | |
symbol: str | |
atom_type: type[RegexAtom] | |
class _Sortable(T.Protocol): | |
def __lt__(self, other: T.Any, /) -> bool: ... | |
_State = T.TypeVar("_State", bound=_Sortable) | |
_Symbol = T.TypeVar("_Symbol", bound=_Sortable) | |
class DFA(T.Generic[_State, _Symbol]): | |
__slots__ = "_transition_table", "_symbols", "_states", "_starting_state", "_final_states" | |
def __init__(self) -> None: | |
self._transition_table = dict[tuple[_State, _Symbol], _State]() | |
self._symbols = set[_Symbol]() | |
self._states = set[_State]() | |
self._starting_state: _State | None = None | |
self._final_states = set[_State]() | |
@property | |
def symbols(self) -> T.AbstractSet[_Symbol]: | |
return self._symbols | |
@property | |
def states(self) -> T.AbstractSet[_State]: | |
return self._states | |
@property | |
def starting_state(self) -> _State | None: | |
return self._starting_state | |
@property | |
def final_states(self) -> T.AbstractSet[_State]: | |
return self._final_states | |
def add_symbol(self, symbol: _Symbol) -> None: | |
assert symbol not in self._symbols | |
self._symbols.add(symbol) | |
def add_symbols(self, symbols: T.Iterable[_Symbol]) -> None: | |
assert self._symbols.isdisjoint(symbols) | |
self._symbols.update(symbols) | |
def add_state(self, state: _State, starting: bool = False, final: bool = False) -> None: | |
assert state not in self._states | |
assert not starting or self._starting_state is None | |
self._states.add(state) | |
if starting: | |
self._starting_state = state | |
if final: | |
self._final_states.add(state) | |
def is_starting_state(self, state: _State) -> bool: | |
assert state in self._states | |
return state == self._starting_state | |
def is_final_state(self, state: _State) -> bool: | |
assert state in self._states | |
return state in self._final_states | |
def add_transition(self, state: _State, symbol: _Symbol, next_state: _State) -> None: | |
assert state in self._states | |
assert symbol in self._symbols | |
assert next_state in self._states | |
assigned_state = self._transition_table.setdefault((state, symbol), next_state) | |
assert assigned_state == next_state | |
def transition(self, state: _State, symbol: _Symbol) -> _State | None: | |
return self._transition_table.get((state, symbol)) | |
def all_transitions(self) -> T.Iterator[tuple[tuple[_State, _Symbol], _State]]: | |
return iter(self._transition_table.items()) | |
def to_graph(self) -> graphviz.Digraph: | |
graph = graphviz.Digraph() | |
add_visual_attrs_to_graph(graph) | |
graph.attr("graph", rankdir="LR") | |
graph.node("START", shape="plaintext") | |
graph.attr("node", shape="circle", margin="0.08,0.04", width="0", height="0") | |
for state in sorted(self._states): | |
graph.node(str(state), shape="doublecircle" if state in self.final_states else None) | |
if self.starting_state is not None: | |
graph.edge("START", str(self.starting_state)) | |
grouped_transitions = dict[tuple[_State, _State], list[_Symbol]]() | |
for (state, symbol), next_state in self._transition_table.items(): | |
grouped_transitions.setdefault((state, next_state), []).append(symbol) | |
for (state, next_state), symbols in sorted(grouped_transitions.items()): | |
label = "<" + ",".join(map(format_symbol, map(str, sorted(symbols)))) + ">" | |
graph.edge(str(state), str(next_state), label) | |
return graph | |
class RE2NFA: | |
def __init__(self, atom: RegexAtom) -> None: | |
self.states = list[NFAState]() | |
self.transitions = list[NFATransition]() | |
self.current_state_id = 0 | |
self.current_chain_group = 0 | |
self.start_state = self.add_state(self.current_chain_group) | |
self.final_state = self.add_state(self.current_chain_group) | |
self.name_state(self.start_state) | |
self.convert(atom, self.start_state, self.final_state, self.current_chain_group) | |
self.name_state(self.final_state) | |
def convert( | |
self, atom: RegexAtom, link_start: NFAState, link_end: NFAState, chain_group: int | |
) -> None: | |
if isinstance(atom, RegexSymbol): | |
self.add_transition(link_start, link_end, atom.character, RegexSymbol) | |
elif isinstance(atom, RegexSequence): | |
if len(atom.atoms) == 0: | |
self.add_transition(link_start, link_end, EPSILON, RegexSequence) | |
else: | |
link_prev = link_start | |
for subatom in atom.atoms[:-1]: | |
link_next = self.add_state(chain_group) | |
self.convert(subatom, link_prev, link_next, chain_group) | |
self.name_state(link_next) | |
link_prev = link_next | |
self.convert(atom.atoms[-1], link_prev, link_end, chain_group) | |
elif isinstance(atom, RegexUnion): | |
for subatom in atom.atoms: | |
self.current_chain_group += 1 | |
sub_chain_group = self.current_chain_group | |
sub_link_start = self.add_state(sub_chain_group) | |
sub_link_end = self.add_state(sub_chain_group) | |
self.add_transition(link_start, sub_link_start, EPSILON, RegexUnion) | |
self.add_transition(sub_link_end, link_end, EPSILON, RegexUnion) | |
self.name_state(sub_link_start) | |
self.convert(subatom, sub_link_start, sub_link_end, sub_chain_group) | |
self.name_state(sub_link_end) | |
elif isinstance(atom, RegexRepetition): | |
subatom = atom.atom | |
if atom.zero: | |
sub_link_start = self.add_state(chain_group) | |
sub_link_end = self.add_state(chain_group) | |
self.add_transition(link_start, sub_link_start, EPSILON, RegexSequence) | |
self.add_transition(sub_link_end, link_end, EPSILON, RegexSequence) | |
self.add_transition(link_start, link_end, EPSILON, RegexRepetition) | |
self.add_transition(sub_link_end, sub_link_start, EPSILON, RegexRepetition) | |
self.name_state(sub_link_start) | |
self.convert(subatom, sub_link_start, sub_link_end, chain_group) | |
self.name_state(sub_link_end) | |
else: | |
self.add_transition(link_end, link_start, EPSILON, RegexRepetition) | |
self.convert(subatom, link_start, link_end, chain_group) | |
else: | |
raise Exception(f"Unexpected atom type: {type(atom).__name__}") | |
def add_state(self, chain_group: int) -> NFAState: | |
state = NFAState(-1, chain_group) | |
self.states.append(state) | |
return state | |
def add_transition( | |
self, from_state: NFAState, to_state: NFAState, symbol: str, kind: type[RegexAtom] | |
) -> None: | |
self.transitions.append(NFATransition(from_state, to_state, symbol, kind)) | |
def name_state(self, state: NFAState) -> None: | |
state.name = self.current_state_id | |
self.current_state_id += 1 | |
def find_state(self, name: int) -> NFAState | None: | |
for state in self.states: | |
if state.name == name: | |
return state | |
return None | |
def to_graph(self, hide_state_labels: bool = False) -> graphviz.Digraph: | |
graph = graphviz.Digraph() | |
add_visual_attrs_to_graph(graph, font_size=20) | |
graph.attr("graph", rankdir="LR") | |
if hide_state_labels: | |
graph.node("START", shape="none", label="", width="0", height="0") | |
else: | |
graph.node("START", shape="plaintext") | |
graph.attr("node", shape="circle", margin="0.04,0.02", width="0", height="0") | |
if hide_state_labels: | |
graph.attr("node", label=" ") | |
for state in self.states: | |
graph.node( | |
str(state.name), | |
shape="doublecircle" if state == self.final_state else None, | |
group=str(state.chain_group), | |
) | |
for transition in self.transitions: | |
graph.edge( | |
str(transition.from_state.name), | |
str(transition.to_state.name), | |
"<" + format_symbol(transition.symbol) + ">", | |
constraint="false" if transition.atom_type is RegexRepetition else None, | |
# weight="0" if transition.atom_type is RegexUnion else None, | |
) | |
graph.edge("START", str(self.start_state.name)) | |
return graph | |
class NFA2DFA: | |
def __init__(self, nfa: RE2NFA) -> None: | |
self.nfa_start_state: int = nfa.start_state.name | |
self.nfa_final_state: int = nfa.final_state.name | |
self.nfa_transition_table = dict[tuple[int, str], list[int]]() | |
for transition in nfa.transitions: | |
key = (transition.from_state.name, transition.symbol) | |
self.nfa_transition_table.setdefault(key, []).append(transition.to_state.name) | |
self.subsets_to_names = dict[frozenset[int], str]() | |
self.subset_names_generator = iter("ABCDEFGHIJKLMNOPQRSTUVWXYZ") | |
self.all_symbols: list[str] = sorted(set(tr.symbol for tr in nfa.transitions) - set(EPSILON)) | |
self.dfa = DFA[str, str]() | |
self.dfa.add_symbols(self.all_symbols) | |
self.convert() | |
def convert(self) -> None: | |
# <https://tex.stackexchange.com/a/67540> | |
# <https://tex.stackexchange.com/a/1960> | |
# <https://stackoverflow.com/a/65431499/12005228> | |
# print(r"{") | |
# print(r"\makeatletter") | |
# print(r"\def\old@comma{,}") | |
# print(r"\catcode`\,=13") | |
# print(r"\def,{%") | |
# print(r" \ifmmode%") | |
# print(r" \old@comma\discretionary{}{}{}%") | |
# print(r" \else%") | |
# print(r" \old@comma%") | |
# print(r" \fi%") | |
# print(r"}") | |
# print(r"\makeatother") | |
# print() | |
def subset_to_string(subset: frozenset[int]) -> str: | |
if len(subset) > 0: | |
return r"\{" + ",".join(map(str, subset)) + r"\}" | |
else: | |
return r"\varnothing" | |
print( | |
r"We start out by finding and naming the subset reachable from the " | |
r"initial state of the NFA:" | |
) | |
starting_subset = frozenset([self.nfa_start_state]) | |
print(r"\[ \epscl(%s) = " % subset_to_string(starting_subset), end="") | |
starting_subset = self.epsilon_closure(starting_subset) | |
_, starting_subset_name = self.name_subset(starting_subset) | |
print(r"%s = %s \]" % (subset_to_string(starting_subset), starting_subset_name)) | |
print( | |
r"Next, we will try finding other reachable subsets by moving from the " | |
r"known subsets via transitions on all available symbols, until we cannot " | |
"assign any more new names." | |
) | |
queue = [(starting_subset_name, starting_subset)] | |
while len(queue) > 0: | |
subset_name, subset = queue.pop() | |
for symbol in self.all_symbols: | |
print(r"\[ \epscl(\move(%s, %s)) = " % (subset_name, symbol), end="") | |
new_subset = self.move_function(subset, symbol) | |
print(r"\epscl(%s) = " % (subset_to_string(new_subset)), end="") | |
new_subset = self.epsilon_closure(new_subset) | |
print(subset_to_string(new_subset), end="") | |
if len(new_subset) == 0: | |
print(r" \]") | |
continue | |
was_unnamed, new_subset_name = self.name_subset(new_subset) | |
print(r" = %s \]" % new_subset_name) | |
self.dfa.add_transition(subset_name, symbol, new_subset_name) | |
if was_unnamed: | |
print(r"\( %s \) is a new subset." % new_subset_name) | |
if self.nfa_final_state in new_subset: | |
print(r"It also contains the final state.") | |
queue.append((new_subset_name, new_subset)) | |
# print(r"}") | |
print() | |
print(r"At this point we have discovered and named all unique reachable subsets.") | |
# self.dfa.add_state("Z") | |
# for symbol in self.all_symbols: | |
# self.dfa.add_transition("Z", symbol, "Z") | |
# for state in self.dfa.states: | |
# for symbol in self.all_symbols: | |
# if self.dfa.transition(state, symbol) is None: | |
# print(state, symbol, file=sys.stderr) | |
# self.dfa.add_transition(state, symbol, "Z") | |
def move_function(self, starting_subset: frozenset[int], symbol: str) -> frozenset[int]: | |
reachable = set[int]() | |
for state in starting_subset: | |
transitions = self.nfa_transition_table.get((state, symbol), []) | |
reachable.update(transitions) | |
return frozenset(reachable) | |
def epsilon_closure(self, starting_subset: frozenset[int]) -> frozenset[int]: | |
queue = list(starting_subset) | |
visited = set(starting_subset) | |
while len(queue) > 0: | |
state = queue.pop() | |
transitions = self.nfa_transition_table.get((state, EPSILON), []) | |
for next_state in transitions: | |
if next_state not in visited: | |
visited.add(next_state) | |
queue.append(next_state) | |
return frozenset(visited) | |
def name_subset(self, subset: frozenset[int]) -> tuple[bool, str]: | |
assert len(subset) > 0 | |
name = self.subsets_to_names.get(subset) | |
if name is None: | |
name = next(self.subset_names_generator) | |
self.subsets_to_names[subset] = name | |
self.dfa.add_state( | |
name, starting=self.nfa_start_state in subset, final=self.nfa_final_state in subset | |
) | |
return True, name | |
else: | |
return False, name | |
def minimize_dfa(dfa: DFA[_State, _Symbol]) -> DFA[_State, _Symbol]: | |
marked_pairs = set[tuple[_State, _State]]() | |
unmarked_pairs = set[tuple[_State, _State]]() | |
dfa_states = sorted(dfa.states) | |
dfa_symbols = sorted(dfa.symbols) | |
def is_pair_marked(state1: _State, state2: _State) -> bool: | |
return (state1, state2) in marked_pairs or (state2, state1) in marked_pairs | |
def print_distinguishable_table() -> None: | |
print(r"\begin{tabular}{|r" + "|c" * len(dfa_states) + "|}") | |
print(r"\hline") | |
print("& " + " & ".join(map(str, dfa_states)) + " \\\\") | |
for i, state1 in enumerate(dfa_states): | |
print(r"\hline") | |
marks = list[str]() | |
for j, state2 in enumerate(dfa_states): | |
if i == j: | |
marks.append(r"\cellcolor{lightgray}{--}") | |
elif is_pair_marked(state1, state2): | |
marks.append(r"$\times$") | |
else: | |
marks.append(" ") | |
print(str(state1) + " & " + " & ".join(marks) + " \\\\") | |
print(r"\hline") | |
print(r"\end{tabular}") | |
print( | |
r"We draw a table for pairs of all states, and we will be marking every pair " | |
r"where it is known that the two states are definitely distinguishable. We start " | |
r"out by marking every pair that can be distinguished by the fact of only one of " | |
r"the states being a final state:" | |
) | |
for state1 in dfa_states: | |
for state2 in dfa_states: | |
if state1 != state2: | |
distinguishable = dfa.is_final_state(state1) != dfa.is_final_state(state2) | |
pair = (state1, state2) if state1 < state2 else (state2, state1) | |
(marked_pairs if distinguishable else unmarked_pairs).add(pair) | |
print(r"\begin{table}[H]") | |
print(r"\centering") | |
print_distinguishable_table() | |
print(r"\caption{Distinguishable states.}") | |
print(r"\end{table}") | |
print() | |
print( | |
r"Next, we will repeatedly check all pairs that have remained unmarked. A pair " | |
r"can be marked if a transition on any symbol leads to a pair that is already " | |
r"known to be distinguishable (therefore this property is transitive)." | |
) | |
print(r"\begin{enumerate}") | |
while True: | |
marked_any = False | |
for pair in sorted(unmarked_pairs): | |
state1, state2 = pair | |
for symbol in dfa_symbols: | |
next_state1 = dfa.transition(state1, symbol) | |
next_state2 = dfa.transition(state2, symbol) | |
pair1_text = (state1, state2) | |
pair2_text = (next_state1 or r"\text{--}", next_state2 or r"\text{--}") | |
print( | |
r"\item \( (%s, %s) \xrightarrow{%s} (%s, %s) \)" % (*pair1_text, symbol, *pair2_text) | |
) | |
distinguishable = False | |
if next_state1 is not None and next_state2 is not None: | |
if is_pair_marked(next_state1, next_state2): | |
distinguishable = True | |
print( | |
r" --- distinguishable because the pair \( (%s, %s) \) has been marked" % pair2_text | |
) | |
elif (next_state1 is None) != (next_state2 is None): | |
distinguishable = True | |
print(r" --- distinguishable because of missing transitions") | |
if distinguishable: | |
unmarked_pairs.remove(pair) | |
marked_pairs.add(pair) | |
marked_any = True | |
break | |
else: | |
print(r" --- cannot say anything about this pair yet") | |
if not marked_any: | |
break | |
print(r"\end{enumerate}") | |
print() | |
print(r"The final table of distinguishable pairs of states looks like:") | |
print() | |
print(r"\begin{table}[H]") | |
print(r"\centering") | |
print_distinguishable_table() | |
print(r"\caption{Distinguishable states in our DFA.}") | |
print(r"\end{table}") | |
print() | |
if len(unmarked_pairs) > 1: | |
print( | |
r"Some pairs have remained unmarked, which means that some states can be " | |
r"merged with equivalent ones and collapsed into one state." | |
) | |
else: | |
print( | |
r"All pairs have been marked, which means that there are no equivalent states. " | |
r"Our DFA cannot be minimized further." | |
) | |
print() | |
equivalence_classes = dict[_State, set[_State]]() | |
for state1, state2 in unmarked_pairs: | |
new_class = set[_State]() | |
class1 = equivalence_classes.setdefault(state1, new_class) | |
class2 = equivalence_classes.setdefault(state2, new_class) | |
if class1 is not class2: | |
class1.update(class2) | |
class2 = class1 | |
equivalence_classes[state2] = class2 | |
class1.add(state1) | |
class2.add(state2) | |
for state in dfa_states: | |
equivalence_classes.setdefault(state, {state}) | |
new_dfa_states_mapping: dict[_State, _State] = { | |
state: min(equiv_class) for state, equiv_class in equivalence_classes.items() | |
} | |
equivalence_classes_with_merges: list[_State] = [ | |
s for s in set(new_dfa_states_mapping.values()) if len(equivalence_classes[s]) > 1 | |
] | |
if len(equivalence_classes_with_merges) > 0: | |
print("Equivalence classes are:") | |
print(r"\begin{enumerate}") | |
for renamed_state in equivalence_classes_with_merges: | |
print( | |
r"\item \( (%s) \rightarrow %s \)" | |
% (", ".join(map(str, sorted(equivalence_classes[renamed_state]))), renamed_state) | |
) | |
print(r"\end{enumerate}") | |
print() | |
new_dfa = DFA[_State, _Symbol]() | |
new_dfa.add_symbols(dfa.symbols) | |
for renamed_state in set(new_dfa_states_mapping.values()): | |
new_dfa.add_state( | |
renamed_state, | |
starting=dfa.starting_state in equivalence_classes[renamed_state], | |
final=not dfa.final_states.isdisjoint(equivalence_classes[renamed_state]), | |
) | |
for (from_state, symbol), to_state in dfa.all_transitions(): | |
from_state = new_dfa_states_mapping[from_state] | |
to_state = new_dfa_states_mapping[to_state] | |
new_dfa.add_transition(from_state, symbol, to_state) | |
return new_dfa | |
def re2tree(atom: RegexAtom, atom_numbers: dict[int, int]) -> graphviz.Digraph: | |
graph = graphviz.Digraph() | |
add_visual_attrs_to_graph(graph) | |
graph.attr("graph", rankdir="TB") | |
graph.attr("node", shape="box", margin="0.08,0.06", width="0", height="0") | |
next_atom_number = 0 | |
def atom_node_id(atom: RegexAtom) -> str: | |
nonlocal next_atom_number | |
number = atom_numbers.get(id(atom)) | |
if number is None: | |
next_atom_number += 1 | |
number = next_atom_number | |
atom_numbers[id(atom)] = number | |
return str(number) | |
def atom_label(atom: RegexAtom) -> str: | |
return f"<I>R</I><SUB>{atom_node_id(atom)}</SUB>" | |
def atom_refer(atom: RegexAtom) -> str: | |
return f"({HAIR_SPACE}{atom_label(atom)}{HAIR_SPACE})" if atom.grouped else atom_label(atom) | |
def recurse(atom: RegexAtom) -> None: | |
node_id = atom_node_id(atom) | |
atom_def: str | |
subatoms: T.Iterable[RegexAtom] | |
if isinstance(atom, RegexSymbol): | |
subatoms = [] | |
atom_def = format_symbol(atom.character) | |
elif isinstance(atom, RegexRepetition): | |
subatoms = [atom.atom] | |
atom_def = atom_refer(atom.atom) + THIN_SPACE + (CENTERED_ASTERISK if atom.zero else "+") | |
elif isinstance(atom, RegexUnion): | |
subatoms = atom.atoms | |
joiner = " + " if PLUS_IS_UNION else " | " | |
atom_def = joiner.join(map(atom_refer, atom.atoms)) if atom.atoms else format_symbol(EPSILON) | |
elif isinstance(atom, RegexSequence): | |
subatoms = atom.atoms | |
atom_def = " ".join(map(atom_refer, atom.atoms)) if atom.atoms else format_symbol(EPSILON) | |
else: | |
raise Exception(f"Unexpected atom type: {type(atom).__name__}") | |
graph.node(node_id, f"<{atom_label(atom)} = {atom_def}>") | |
for subatom in subatoms: | |
recurse(subatom) | |
for subatom in subatoms: | |
graph.edge(node_id, atom_node_id(subatom)) | |
recurse(atom) | |
return graph | |
def regex_construction_steps( | |
regex_text: str, root_atom: RegexAtom, atom_numbers: dict[int, int] | |
) -> None: | |
steps = OrderedDict[str, list[RegexAtom]]() | |
def recurse(atom: RegexAtom) -> None: | |
subatoms: T.Iterable[RegexAtom] | |
if isinstance(atom, RegexSymbol): | |
subatoms = [] | |
elif isinstance(atom, RegexRepetition): | |
subatoms = [atom.atom] | |
elif isinstance(atom, RegexSequence): | |
subatoms = atom.atoms | |
elif isinstance(atom, RegexUnion): | |
subatoms = atom.atoms | |
else: | |
raise Exception(f"Unexpected atom type: {type(atom).__name__}") | |
for subatom in subatoms: | |
recurse(subatom) | |
atom_text = regex_text[atom.start_pos : atom.end_pos] | |
steps.setdefault(atom_text, []).append(atom) | |
recurse(root_atom) | |
for i, (atom_text, similar_atoms) in enumerate(steps.items()): | |
step = i + 1 | |
graph = RE2NFA(similar_atoms[0]).to_graph(hide_state_labels=True) | |
print(r"\begin{figure}[H]") | |
print(r"\centering") | |
print(r"\digraph[width=\maxwidth]{step%s}{" % step) | |
print("".join(graph.body), end="") | |
print(r"}") | |
atom_references = " = ".join("R_{%s}" % atom_numbers[id(atom)] for atom in similar_atoms) | |
print( | |
r"\caption{NFA construction step %s, \( %s = \text{``\texttt{%s}''} \).}" | |
% (step, atom_references, atom_text) | |
) | |
print(r"\end{figure}") | |
print() | |
def random_regex_example_string(atom: RegexAtom, shortest: bool = False) -> str: | |
def recurse(atom: RegexAtom) -> str: | |
if isinstance(atom, RegexSymbol): | |
return atom.character | |
elif isinstance(atom, RegexSequence): | |
return "".join(recurse(a) for a in atom.atoms) | |
elif isinstance(atom, RegexRepetition): | |
result = "" if atom.zero else recurse(atom.atom) | |
if not shortest: | |
while bool(random.randint(0, 1)): | |
result += recurse(atom.atom) | |
return result | |
elif isinstance(atom, RegexUnion): | |
if not shortest: | |
return recurse(random.choice(atom.atoms)) | |
else: | |
return min(map(recurse, atom.atoms), key=len) | |
else: | |
raise Exception(f"Unexpected atom type: {type(atom).__name__}") | |
return recurse(atom) | |
def print_dfa_transitions(dfa: DFA[str, str]) -> None: | |
symbols = sorted(dfa.symbols) | |
print(r"\begin{tabular}{|r" + "|c" * len(symbols) + "|}") | |
print(r"\hline") | |
print("& " + " & ".join(f"${s}$" for s in symbols) + " \\\\") | |
for state in sorted(dfa.states): | |
transitions = (dfa.transition(state, symbol) or "--" for symbol in symbols) | |
marker = "" | |
if dfa.is_starting_state(state): | |
marker += r"$\rightarrow$ " | |
if dfa.is_final_state(state): | |
marker += r"$*$ " | |
print(r"\hline") | |
print(marker + state + " & " + " & ".join(transitions) + " \\\\") | |
print(r"\hline") | |
print(r"\end{tabular}") | |
def check_dfa(dfa: DFA[str, str], example: str) -> str: | |
current_state = dfa.starting_state | |
assert current_state is not None | |
path: list[str] = [current_state] | |
for c in example: | |
current_state = dfa.transition(current_state, c) | |
path.append(r"\xrightarrow{%s}" % c) | |
if current_state is not None: | |
path.append(current_state) | |
else: | |
path.append(r"\varnothing") | |
break | |
accepted = current_state is not None and dfa.is_final_state(current_state) | |
status = "accepted" if accepted else "rejected" | |
return r"``\texttt{%s}'': \( %s \Rightarrow \text{%s} \)" % (example, " ".join(path), status) | |
def main() -> None: | |
[_, author, regex_text, *argv_examples] = sys.argv | |
print(r"\documentclass[%spt,a4paper,titlepage,fleqn]{article}" % FONT_SIZE) | |
print(r"\usepackage[margin=3cm]{geometry}") | |
print(r"\usepackage[utf8]{inputenc}") | |
print(r"\usepackage[english]{babel}") | |
print(r"\usepackage{parskip}") | |
print(r"\usepackage{graphicx}") | |
print(r"\usepackage[table]{xcolor}") | |
print(r"\usepackage[colorlinks=true,allcolors=blue]{hyperref}") | |
print(r"\usepackage{amsmath}") | |
print(r"\usepackage{amssymb}") | |
print(r"\usepackage{mathtools}") | |
# print(r"\usepackage[euler]{textgreek}") | |
print(r"\usepackage[Euler]{upgreek}") | |
print(r"\usepackage{float}") | |
print(r"\usepackage{shellesc}") | |
print(r"\usepackage[pdf,singlefile]{graphviz}") | |
print(r"\usepackage{xpatch}") | |
print() | |
# print(r"\usepackage{tikz}") | |
# print(r"\usetikzlibrary{automata,positioning,graphs,graphdrawing}") | |
# print(r"\usegdlibrary{force}") | |
# print(r"\newcommand{\acceptingdoubledistance}{2pt}") | |
# # <https://github.com/pgf-tikz/pgf/blob/3.1.10/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryautomata.code.tex#L35> | |
# print( | |
# r"\tikzset{accepting by double/.style={double distance=\acceptingdoubledistance,outer sep=.5\pgflinewidth+.5\acceptingdoubledistance}}" | |
# ) | |
# Taken from <https://www.overleaf.com/latex/templates/graphviz-drawing-example/xhrpxynbfxvw> | |
print(r"\makeatletter") | |
print(r"\newcommand*{\addGraphFileDependency}[1]{%") | |
print(r" \typeout{(#1)} \@addtofilelist{#1} \IfFileExists{#1}{}{\typeout{No file #1.}} }") | |
print(r"\makeatother") | |
print(r"\xpretocmd{\digraph}{\addGraphFileDependency{#2.dot}}{}{}") | |
print() | |
# <https://davidcarlisle.github.io/uk-tex-faq/FAQ-grmaxwidth.html> | |
# <https://tex.stackexchange.com/a/86355> | |
print(r"\makeatletter") | |
print(r"\newcommand{\maxwidth}{%") | |
print(r" \ifdim \Gin@nat@width>\linewidth \linewidth \else \Gin@nat@width \fi}") | |
print(r"\makeatother") | |
print() | |
print(r"\DeclareMathOperator{\move}{move}") | |
print(r"\DeclareMathOperator{\epscl}{\upvarepsilon-closure}") | |
print() | |
print(r"\title{ELAC Project 1 \\ ~ \\ Regular Expression ``\texttt{%s}''}" % regex_text) | |
print(r"\author{%s}" % author) | |
print(r"\date{\today}") | |
print() | |
print(r"\begin{document}") | |
print() | |
print(r"\maketitle") | |
print() | |
parser = RegexParser(regex_text) | |
regex = parser.parse() | |
print(r"\section{Example strings} \label{sec:examples}") | |
print() | |
examples = set[str](argv_examples) | |
attempt = 0 | |
while len(examples) < 5 and attempt < 1000: | |
examples.add(random_regex_example_string(regex)) | |
attempt += 1 | |
examples_sorted = sorted(examples, key=lambda s: (len(s), s)) | |
print(r"Here are some examples of strings accepted by our regular expression:") | |
print(r"\begin{enumerate}") | |
for s in examples_sorted: | |
print(r"\item ``\texttt{%s}''" % s) | |
print(r"\end{enumerate}") | |
print() | |
print(r"\section{Construction of an NFA}") | |
print() | |
atom_numbers = dict[int, int]() | |
tree = re2tree(regex, atom_numbers) | |
if RENDER_GRAPHS: | |
tree.render("elac_project_1_regex.gv", format="png", cleanup=True) | |
print(r"\begin{figure}[H]") | |
print(r"\centering") | |
print(r"\digraph[width=\maxwidth]{regex}{") | |
print("".join(tree.body), end="") | |
print(r"}") | |
print( | |
r"\caption{Decomposition of our regular expression into elementary " | |
r"sub-expressions $R_{i}$, presented as a tree in order to make the " | |
r"relations between the operators and sub-expressions easier to follow.}" | |
) | |
print(r"\end{figure}") | |
print() | |
print( | |
r"To build an NFA with $\upvarepsilon$-transitions equivalent to our regular " | |
r"expression, we will apply \textbf{Thompson's construction algorithm}. " | |
r"Construction steps of similar sub-expressions $R_{i}$ have been combined together." | |
) | |
print() | |
regex_construction_steps(regex_text, regex, atom_numbers) | |
nfa = RE2NFA(regex) | |
if RENDER_GRAPHS: | |
nfa.to_graph().render("elac_project_2_nfa.gv", format="png", cleanup=True) | |
print(r"Finally, to complete the NFA we assign numbers to all states as their names.") | |
print() | |
print(r"\begin{figure}[H]") | |
print(r"\centering") | |
print(r"\digraph[width=\maxwidth]{nfa}{") | |
print("".join(nfa.to_graph().body), end="") | |
print(r"}") | |
print(r"\caption{Diagram of the completed NFA.}") | |
print(r"\end{figure}") | |
print() | |
print(r"\section{Transformation of the NFA into a DFA}") | |
print() | |
print( | |
r"The $\upvarepsilon$-NFA is transformed into an equivalent DFA using the " | |
r"\textbf{subset construction algorithm}." | |
) | |
dfa = NFA2DFA(nfa).dfa | |
if RENDER_GRAPHS: | |
dfa.to_graph().render("elac_project_3_dfa.gv", format="png", cleanup=True) | |
print( | |
r"We can use the equivalences established by " | |
r"\( \epscl(\move(subset, symbol)) = next \: subset \) " | |
r"to build a transition table for a DFA, whose states will correspond one-to-one " | |
r"to the reachable subsets of the $\upvarepsilon$-NFA and be named accordingly. " | |
r"The transitions which result in an empty set $\varnothing$ will be filled simply " | |
"with blanks in the table." | |
) | |
print() | |
print(r"\begin{table}[H]") | |
print(r"\centering") | |
print_dfa_transitions(dfa) | |
print( | |
r"\caption{Transition table of our DFA. The starting state is marked with " | |
r"$\rightarrow$, and the final state%s %s marked with $*$.}" | |
% ("s" if len(dfa.final_states) != 1 else "", "are" if len(dfa.final_states) != 1 else "is") | |
) | |
print(r"\label{tab:dfa}") | |
print(r"\end{table}") | |
print() | |
print(r"\begin{figure}[H]") | |
print(r"\centering") | |
print(r"\digraph[width=\maxwidth]{dfa}{") | |
print("".join(dfa.to_graph().body), end="") | |
print(r"}") | |
print(r"\caption{Diagram of the completed DFA, constructed from table~\ref{tab:dfa}.}") | |
print(r"\end{figure}") | |
print() | |
print(r"Let us try checking our DFA with the example strings from section~\ref{sec:examples}:") | |
print(r"\begin{enumerate}") | |
for example in examples_sorted: | |
print(r"\item %s" % check_dfa(dfa, example)) | |
print(r"\end{enumerate}") | |
print() | |
print(r"\section{Construction of a minimized DFA}") | |
print() | |
print(r"DFA minimization is performed using the \textbf{table-filling algorithm}.") | |
mindfa = minimize_dfa(dfa) | |
if RENDER_GRAPHS: | |
mindfa.to_graph().render("elac_project_4_mindfa.gv", format="png", cleanup=True) | |
print(r"\begin{table}[H]") | |
print(r"\centering") | |
print_dfa_transitions(mindfa) | |
print( | |
r"\caption{Transition table of our minimized DFA. The starting state is " | |
r"marked with $\rightarrow$, and the final state%s %s marked with $*$.}" | |
% ( | |
"s" if len(mindfa.final_states) != 1 else "", | |
"are" if len(mindfa.final_states) != 1 else "is", | |
) | |
) | |
print(r"\label{tab:mindfa}") | |
print(r"\end{table}") | |
print() | |
print(r"\begin{figure}[H]") | |
print(r"\centering") | |
print(r"\digraph[width=\maxwidth]{mindfa}{") | |
print("".join(mindfa.to_graph().body), end="") | |
print(r"}") | |
print(r"\caption{Diagram of the minimized DFA, constructed from table~\ref{tab:mindfa}.}") | |
print(r"\end{figure}") | |
print() | |
print( | |
r"Let us try checking the minimized DFA with example strings from " | |
r"section~\ref{sec:examples} as well:" | |
) | |
print(r"\begin{enumerate}") | |
for example in examples_sorted: | |
print(r"\item %s" % check_dfa(mindfa, example)) | |
print(r"\end{enumerate}") | |
print() | |
print(r"\end{document}") | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment