Skip to content

Instantly share code, notes, and snippets.

@dmitmel
Last active November 15, 2024 15:29
Show Gist options
  • Save dmitmel/a3be2af47a5d4e7ea7ee2087a4ba016f to your computer and use it in GitHub Desktop.
Save dmitmel/a3be2af47a5d4e7ea7ee2087a4ba016f to your computer and use it in GitHub Desktop.
RE => NFA => DFA => min-DFA
from collections import OrderedDict
from dataclasses import dataclass, field
from enum import Enum
import random
import sys
import typing as T
import graphviz
EPSILON = "\u03b5"
EMPTY_SET = "\u2205"
CENTERED_ASTERISK = "\u2217"
THIN_SPACE = "\u2009"
HAIR_SPACE = "\u200a"
FONT_SIZE = 12
PLUS_IS_UNION = True
RENDER_GRAPHS = False
def add_visual_attrs_to_graph(
graph: graphviz.Digraph | graphviz.Graph, font_size: int = FONT_SIZE
) -> None:
# <https://tex.stackexchange.com/a/642722>
# <https://www.overleaf.com/learn/latex/Questions%2FWhich_OTF_or_TTF_fonts_are_supported_via_fontspec%3F#Serif_fonts>
font_attrs = {"fontsize": f"{font_size}pt", "fontname": "CMU Serif"}
graph.attr("graph", font_attrs, margin="0")
graph.attr("node", font_attrs)
graph.attr("edge", font_attrs)
def format_symbol(text: str):
return f"<I>{text}</I>" if not text.isnumeric() else text
@dataclass
class RegexAtom:
start_pos: int
end_pos: int
grouped: bool = field(default=False, init=False)
@dataclass
class RegexSequence(RegexAtom):
atoms: list[RegexAtom]
@dataclass
class RegexUnion(RegexAtom):
atoms: list[RegexAtom]
@dataclass
class RegexRepetition(RegexAtom):
atom: RegexAtom
zero: bool
@dataclass
class RegexSymbol(RegexAtom):
character: str
class RegexParser:
# expression ::= union ;
# union ::= sequence ( "|" sequence )* ;
# sequence ::= ( qualified-atom )* ;
# qualified-atom ::= atom ( "*" | "+" )? ;
# atom ::= symbol | group ;
# group ::= "(" expression ")" ;
class Token(Enum):
GROUP_START = 1
GROUP_END = 2
UNION = 3
ONE_OR_MORE = 4
ZERO_OR_MORE = 5
LITERAL = 6
METACHARACTER_TOKENS: dict[str, Token] = {
"(": Token.GROUP_START,
")": Token.GROUP_END,
"|": Token.UNION if not PLUS_IS_UNION else Token.LITERAL,
"+": Token.ONE_OR_MORE if not PLUS_IS_UNION else Token.UNION,
"*": Token.ZERO_OR_MORE,
}
def __init__(self, text: str) -> None:
self.text = text
self.pos = 0
def parse(self) -> RegexAtom:
atom = self.parse_union()
if self.pos != len(self.text):
raise Exception("Unexpected input")
return atom
def parse_union(self) -> RegexUnion | RegexAtom:
start_pos = self.pos
atoms: list[RegexAtom] = [self.parse_sequence()]
while self.match(self.Token.UNION):
atoms.append(self.parse_sequence())
return RegexUnion(start_pos, self.pos, atoms) if len(atoms) > 1 else atoms[0]
def parse_sequence(self) -> RegexSequence | RegexAtom:
start_pos = self.pos
atoms: list[RegexAtom] = []
while atom := self.parse_quantified_atom():
atoms.append(atom)
return RegexSequence(start_pos, self.pos, atoms) if len(atoms) != 1 else atoms[0]
def parse_quantified_atom(self) -> RegexRepetition | RegexAtom | None:
start_pos = self.pos
atom = self.parse_atom()
if char := self.match(self.Token.ZERO_OR_MORE, self.Token.ONE_OR_MORE):
if atom is None:
raise Exception("Nothing to repeat")
return RegexRepetition(start_pos, self.pos, atom, zero=(char == "*"))
else:
return atom
def parse_atom(self) -> RegexSymbol | RegexAtom | None:
start_pos = self.pos
if matched_char := self.match(self.Token.LITERAL):
return RegexSymbol(start_pos, self.pos, matched_char)
elif self.match(self.Token.GROUP_START):
atom = self.parse_union()
atom.grouped = True
if not self.match(self.Token.GROUP_END):
raise Exception("Expected ')'")
return atom
else:
return None
def match(self, *expected_tokens: Token) -> str | None:
if 0 <= self.pos < len(self.text):
char = self.text[self.pos]
token = self.METACHARACTER_TOKENS.get(char, self.Token.LITERAL)
if token in expected_tokens:
self.pos += 1
return char
return None
@dataclass
class NFAState:
name: int
chain_group: int
@dataclass
class NFATransition:
from_state: NFAState
to_state: NFAState
symbol: str
atom_type: type[RegexAtom]
class _Sortable(T.Protocol):
def __lt__(self, other: T.Any, /) -> bool: ...
_State = T.TypeVar("_State", bound=_Sortable)
_Symbol = T.TypeVar("_Symbol", bound=_Sortable)
class DFA(T.Generic[_State, _Symbol]):
__slots__ = "_transition_table", "_symbols", "_states", "_starting_state", "_final_states"
def __init__(self) -> None:
self._transition_table = dict[tuple[_State, _Symbol], _State]()
self._symbols = set[_Symbol]()
self._states = set[_State]()
self._starting_state: _State | None = None
self._final_states = set[_State]()
@property
def symbols(self) -> T.AbstractSet[_Symbol]:
return self._symbols
@property
def states(self) -> T.AbstractSet[_State]:
return self._states
@property
def starting_state(self) -> _State | None:
return self._starting_state
@property
def final_states(self) -> T.AbstractSet[_State]:
return self._final_states
def add_symbol(self, symbol: _Symbol) -> None:
assert symbol not in self._symbols
self._symbols.add(symbol)
def add_symbols(self, symbols: T.Iterable[_Symbol]) -> None:
assert self._symbols.isdisjoint(symbols)
self._symbols.update(symbols)
def add_state(self, state: _State, starting: bool = False, final: bool = False) -> None:
assert state not in self._states
assert not starting or self._starting_state is None
self._states.add(state)
if starting:
self._starting_state = state
if final:
self._final_states.add(state)
def is_starting_state(self, state: _State) -> bool:
assert state in self._states
return state == self._starting_state
def is_final_state(self, state: _State) -> bool:
assert state in self._states
return state in self._final_states
def add_transition(self, state: _State, symbol: _Symbol, next_state: _State) -> None:
assert state in self._states
assert symbol in self._symbols
assert next_state in self._states
assigned_state = self._transition_table.setdefault((state, symbol), next_state)
assert assigned_state == next_state
def transition(self, state: _State, symbol: _Symbol) -> _State | None:
return self._transition_table.get((state, symbol))
def all_transitions(self) -> T.Iterator[tuple[tuple[_State, _Symbol], _State]]:
return iter(self._transition_table.items())
def to_graph(self) -> graphviz.Digraph:
graph = graphviz.Digraph()
add_visual_attrs_to_graph(graph)
graph.attr("graph", rankdir="LR")
graph.node("START", shape="plaintext")
graph.attr("node", shape="circle", margin="0.08,0.04", width="0", height="0")
for state in sorted(self._states):
graph.node(str(state), shape="doublecircle" if state in self.final_states else None)
if self.starting_state is not None:
graph.edge("START", str(self.starting_state))
grouped_transitions = dict[tuple[_State, _State], list[_Symbol]]()
for (state, symbol), next_state in self._transition_table.items():
grouped_transitions.setdefault((state, next_state), []).append(symbol)
for (state, next_state), symbols in sorted(grouped_transitions.items()):
label = "<" + ",".join(map(format_symbol, map(str, sorted(symbols)))) + ">"
graph.edge(str(state), str(next_state), label)
return graph
class RE2NFA:
def __init__(self, atom: RegexAtom) -> None:
self.states = list[NFAState]()
self.transitions = list[NFATransition]()
self.current_state_id = 0
self.current_chain_group = 0
self.start_state = self.add_state(self.current_chain_group)
self.final_state = self.add_state(self.current_chain_group)
self.name_state(self.start_state)
self.convert(atom, self.start_state, self.final_state, self.current_chain_group)
self.name_state(self.final_state)
def convert(
self, atom: RegexAtom, link_start: NFAState, link_end: NFAState, chain_group: int
) -> None:
if isinstance(atom, RegexSymbol):
self.add_transition(link_start, link_end, atom.character, RegexSymbol)
elif isinstance(atom, RegexSequence):
if len(atom.atoms) == 0:
self.add_transition(link_start, link_end, EPSILON, RegexSequence)
else:
link_prev = link_start
for subatom in atom.atoms[:-1]:
link_next = self.add_state(chain_group)
self.convert(subatom, link_prev, link_next, chain_group)
self.name_state(link_next)
link_prev = link_next
self.convert(atom.atoms[-1], link_prev, link_end, chain_group)
elif isinstance(atom, RegexUnion):
for subatom in atom.atoms:
self.current_chain_group += 1
sub_chain_group = self.current_chain_group
sub_link_start = self.add_state(sub_chain_group)
sub_link_end = self.add_state(sub_chain_group)
self.add_transition(link_start, sub_link_start, EPSILON, RegexUnion)
self.add_transition(sub_link_end, link_end, EPSILON, RegexUnion)
self.name_state(sub_link_start)
self.convert(subatom, sub_link_start, sub_link_end, sub_chain_group)
self.name_state(sub_link_end)
elif isinstance(atom, RegexRepetition):
subatom = atom.atom
if atom.zero:
sub_link_start = self.add_state(chain_group)
sub_link_end = self.add_state(chain_group)
self.add_transition(link_start, sub_link_start, EPSILON, RegexSequence)
self.add_transition(sub_link_end, link_end, EPSILON, RegexSequence)
self.add_transition(link_start, link_end, EPSILON, RegexRepetition)
self.add_transition(sub_link_end, sub_link_start, EPSILON, RegexRepetition)
self.name_state(sub_link_start)
self.convert(subatom, sub_link_start, sub_link_end, chain_group)
self.name_state(sub_link_end)
else:
self.add_transition(link_end, link_start, EPSILON, RegexRepetition)
self.convert(subatom, link_start, link_end, chain_group)
else:
raise Exception(f"Unexpected atom type: {type(atom).__name__}")
def add_state(self, chain_group: int) -> NFAState:
state = NFAState(-1, chain_group)
self.states.append(state)
return state
def add_transition(
self, from_state: NFAState, to_state: NFAState, symbol: str, kind: type[RegexAtom]
) -> None:
self.transitions.append(NFATransition(from_state, to_state, symbol, kind))
def name_state(self, state: NFAState) -> None:
state.name = self.current_state_id
self.current_state_id += 1
def find_state(self, name: int) -> NFAState | None:
for state in self.states:
if state.name == name:
return state
return None
def to_graph(self, hide_state_labels: bool = False) -> graphviz.Digraph:
graph = graphviz.Digraph()
add_visual_attrs_to_graph(graph, font_size=20)
graph.attr("graph", rankdir="LR")
if hide_state_labels:
graph.node("START", shape="none", label="", width="0", height="0")
else:
graph.node("START", shape="plaintext")
graph.attr("node", shape="circle", margin="0.04,0.02", width="0", height="0")
if hide_state_labels:
graph.attr("node", label=" ")
for state in self.states:
graph.node(
str(state.name),
shape="doublecircle" if state == self.final_state else None,
group=str(state.chain_group),
)
for transition in self.transitions:
graph.edge(
str(transition.from_state.name),
str(transition.to_state.name),
"<" + format_symbol(transition.symbol) + ">",
constraint="false" if transition.atom_type is RegexRepetition else None,
# weight="0" if transition.atom_type is RegexUnion else None,
)
graph.edge("START", str(self.start_state.name))
return graph
class NFA2DFA:
def __init__(self, nfa: RE2NFA) -> None:
self.nfa_start_state: int = nfa.start_state.name
self.nfa_final_state: int = nfa.final_state.name
self.nfa_transition_table = dict[tuple[int, str], list[int]]()
for transition in nfa.transitions:
key = (transition.from_state.name, transition.symbol)
self.nfa_transition_table.setdefault(key, []).append(transition.to_state.name)
self.subsets_to_names = dict[frozenset[int], str]()
self.subset_names_generator = iter("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
self.all_symbols: list[str] = sorted(set(tr.symbol for tr in nfa.transitions) - set(EPSILON))
self.dfa = DFA[str, str]()
self.dfa.add_symbols(self.all_symbols)
self.convert()
def convert(self) -> None:
# <https://tex.stackexchange.com/a/67540>
# <https://tex.stackexchange.com/a/1960>
# <https://stackoverflow.com/a/65431499/12005228>
# print(r"{")
# print(r"\makeatletter")
# print(r"\def\old@comma{,}")
# print(r"\catcode`\,=13")
# print(r"\def,{%")
# print(r" \ifmmode%")
# print(r" \old@comma\discretionary{}{}{}%")
# print(r" \else%")
# print(r" \old@comma%")
# print(r" \fi%")
# print(r"}")
# print(r"\makeatother")
# print()
def subset_to_string(subset: frozenset[int]) -> str:
if len(subset) > 0:
return r"\{" + ",".join(map(str, subset)) + r"\}"
else:
return r"\varnothing"
print(
r"We start out by finding and naming the subset reachable from the "
r"initial state of the NFA:"
)
starting_subset = frozenset([self.nfa_start_state])
print(r"\[ \epscl(%s) = " % subset_to_string(starting_subset), end="")
starting_subset = self.epsilon_closure(starting_subset)
_, starting_subset_name = self.name_subset(starting_subset)
print(r"%s = %s \]" % (subset_to_string(starting_subset), starting_subset_name))
print(
r"Next, we will try finding other reachable subsets by moving from the "
r"known subsets via transitions on all available symbols, until we cannot "
"assign any more new names."
)
queue = [(starting_subset_name, starting_subset)]
while len(queue) > 0:
subset_name, subset = queue.pop()
for symbol in self.all_symbols:
print(r"\[ \epscl(\move(%s, %s)) = " % (subset_name, symbol), end="")
new_subset = self.move_function(subset, symbol)
print(r"\epscl(%s) = " % (subset_to_string(new_subset)), end="")
new_subset = self.epsilon_closure(new_subset)
print(subset_to_string(new_subset), end="")
if len(new_subset) == 0:
print(r" \]")
continue
was_unnamed, new_subset_name = self.name_subset(new_subset)
print(r" = %s \]" % new_subset_name)
self.dfa.add_transition(subset_name, symbol, new_subset_name)
if was_unnamed:
print(r"\( %s \) is a new subset." % new_subset_name)
if self.nfa_final_state in new_subset:
print(r"It also contains the final state.")
queue.append((new_subset_name, new_subset))
# print(r"}")
print()
print(r"At this point we have discovered and named all unique reachable subsets.")
# self.dfa.add_state("Z")
# for symbol in self.all_symbols:
# self.dfa.add_transition("Z", symbol, "Z")
# for state in self.dfa.states:
# for symbol in self.all_symbols:
# if self.dfa.transition(state, symbol) is None:
# print(state, symbol, file=sys.stderr)
# self.dfa.add_transition(state, symbol, "Z")
def move_function(self, starting_subset: frozenset[int], symbol: str) -> frozenset[int]:
reachable = set[int]()
for state in starting_subset:
transitions = self.nfa_transition_table.get((state, symbol), [])
reachable.update(transitions)
return frozenset(reachable)
def epsilon_closure(self, starting_subset: frozenset[int]) -> frozenset[int]:
queue = list(starting_subset)
visited = set(starting_subset)
while len(queue) > 0:
state = queue.pop()
transitions = self.nfa_transition_table.get((state, EPSILON), [])
for next_state in transitions:
if next_state not in visited:
visited.add(next_state)
queue.append(next_state)
return frozenset(visited)
def name_subset(self, subset: frozenset[int]) -> tuple[bool, str]:
assert len(subset) > 0
name = self.subsets_to_names.get(subset)
if name is None:
name = next(self.subset_names_generator)
self.subsets_to_names[subset] = name
self.dfa.add_state(
name, starting=self.nfa_start_state in subset, final=self.nfa_final_state in subset
)
return True, name
else:
return False, name
def minimize_dfa(dfa: DFA[_State, _Symbol]) -> DFA[_State, _Symbol]:
marked_pairs = set[tuple[_State, _State]]()
unmarked_pairs = set[tuple[_State, _State]]()
dfa_states = sorted(dfa.states)
dfa_symbols = sorted(dfa.symbols)
def is_pair_marked(state1: _State, state2: _State) -> bool:
return (state1, state2) in marked_pairs or (state2, state1) in marked_pairs
def print_distinguishable_table() -> None:
print(r"\begin{tabular}{|r" + "|c" * len(dfa_states) + "|}")
print(r"\hline")
print("& " + " & ".join(map(str, dfa_states)) + " \\\\")
for i, state1 in enumerate(dfa_states):
print(r"\hline")
marks = list[str]()
for j, state2 in enumerate(dfa_states):
if i == j:
marks.append(r"\cellcolor{lightgray}{--}")
elif is_pair_marked(state1, state2):
marks.append(r"$\times$")
else:
marks.append(" ")
print(str(state1) + " & " + " & ".join(marks) + " \\\\")
print(r"\hline")
print(r"\end{tabular}")
print(
r"We draw a table for pairs of all states, and we will be marking every pair "
r"where it is known that the two states are definitely distinguishable. We start "
r"out by marking every pair that can be distinguished by the fact of only one of "
r"the states being a final state:"
)
for state1 in dfa_states:
for state2 in dfa_states:
if state1 != state2:
distinguishable = dfa.is_final_state(state1) != dfa.is_final_state(state2)
pair = (state1, state2) if state1 < state2 else (state2, state1)
(marked_pairs if distinguishable else unmarked_pairs).add(pair)
print(r"\begin{table}[H]")
print(r"\centering")
print_distinguishable_table()
print(r"\caption{Distinguishable states.}")
print(r"\end{table}")
print()
print(
r"Next, we will repeatedly check all pairs that have remained unmarked. A pair "
r"can be marked if a transition on any symbol leads to a pair that is already "
r"known to be distinguishable (therefore this property is transitive)."
)
print(r"\begin{enumerate}")
while True:
marked_any = False
for pair in sorted(unmarked_pairs):
state1, state2 = pair
for symbol in dfa_symbols:
next_state1 = dfa.transition(state1, symbol)
next_state2 = dfa.transition(state2, symbol)
pair1_text = (state1, state2)
pair2_text = (next_state1 or r"\text{--}", next_state2 or r"\text{--}")
print(
r"\item \( (%s, %s) \xrightarrow{%s} (%s, %s) \)" % (*pair1_text, symbol, *pair2_text)
)
distinguishable = False
if next_state1 is not None and next_state2 is not None:
if is_pair_marked(next_state1, next_state2):
distinguishable = True
print(
r" --- distinguishable because the pair \( (%s, %s) \) has been marked" % pair2_text
)
elif (next_state1 is None) != (next_state2 is None):
distinguishable = True
print(r" --- distinguishable because of missing transitions")
if distinguishable:
unmarked_pairs.remove(pair)
marked_pairs.add(pair)
marked_any = True
break
else:
print(r" --- cannot say anything about this pair yet")
if not marked_any:
break
print(r"\end{enumerate}")
print()
print(r"The final table of distinguishable pairs of states looks like:")
print()
print(r"\begin{table}[H]")
print(r"\centering")
print_distinguishable_table()
print(r"\caption{Distinguishable states in our DFA.}")
print(r"\end{table}")
print()
if len(unmarked_pairs) > 1:
print(
r"Some pairs have remained unmarked, which means that some states can be "
r"merged with equivalent ones and collapsed into one state."
)
else:
print(
r"All pairs have been marked, which means that there are no equivalent states. "
r"Our DFA cannot be minimized further."
)
print()
equivalence_classes = dict[_State, set[_State]]()
for state1, state2 in unmarked_pairs:
new_class = set[_State]()
class1 = equivalence_classes.setdefault(state1, new_class)
class2 = equivalence_classes.setdefault(state2, new_class)
if class1 is not class2:
class1.update(class2)
class2 = class1
equivalence_classes[state2] = class2
class1.add(state1)
class2.add(state2)
for state in dfa_states:
equivalence_classes.setdefault(state, {state})
new_dfa_states_mapping: dict[_State, _State] = {
state: min(equiv_class) for state, equiv_class in equivalence_classes.items()
}
equivalence_classes_with_merges: list[_State] = [
s for s in set(new_dfa_states_mapping.values()) if len(equivalence_classes[s]) > 1
]
if len(equivalence_classes_with_merges) > 0:
print("Equivalence classes are:")
print(r"\begin{enumerate}")
for renamed_state in equivalence_classes_with_merges:
print(
r"\item \( (%s) \rightarrow %s \)"
% (", ".join(map(str, sorted(equivalence_classes[renamed_state]))), renamed_state)
)
print(r"\end{enumerate}")
print()
new_dfa = DFA[_State, _Symbol]()
new_dfa.add_symbols(dfa.symbols)
for renamed_state in set(new_dfa_states_mapping.values()):
new_dfa.add_state(
renamed_state,
starting=dfa.starting_state in equivalence_classes[renamed_state],
final=not dfa.final_states.isdisjoint(equivalence_classes[renamed_state]),
)
for (from_state, symbol), to_state in dfa.all_transitions():
from_state = new_dfa_states_mapping[from_state]
to_state = new_dfa_states_mapping[to_state]
new_dfa.add_transition(from_state, symbol, to_state)
return new_dfa
def re2tree(atom: RegexAtom, atom_numbers: dict[int, int]) -> graphviz.Digraph:
graph = graphviz.Digraph()
add_visual_attrs_to_graph(graph)
graph.attr("graph", rankdir="TB")
graph.attr("node", shape="box", margin="0.08,0.06", width="0", height="0")
next_atom_number = 0
def atom_node_id(atom: RegexAtom) -> str:
nonlocal next_atom_number
number = atom_numbers.get(id(atom))
if number is None:
next_atom_number += 1
number = next_atom_number
atom_numbers[id(atom)] = number
return str(number)
def atom_label(atom: RegexAtom) -> str:
return f"<I>R</I><SUB>{atom_node_id(atom)}</SUB>"
def atom_refer(atom: RegexAtom) -> str:
return f"({HAIR_SPACE}{atom_label(atom)}{HAIR_SPACE})" if atom.grouped else atom_label(atom)
def recurse(atom: RegexAtom) -> None:
node_id = atom_node_id(atom)
atom_def: str
subatoms: T.Iterable[RegexAtom]
if isinstance(atom, RegexSymbol):
subatoms = []
atom_def = format_symbol(atom.character)
elif isinstance(atom, RegexRepetition):
subatoms = [atom.atom]
atom_def = atom_refer(atom.atom) + THIN_SPACE + (CENTERED_ASTERISK if atom.zero else "+")
elif isinstance(atom, RegexUnion):
subatoms = atom.atoms
joiner = " + " if PLUS_IS_UNION else " | "
atom_def = joiner.join(map(atom_refer, atom.atoms)) if atom.atoms else format_symbol(EPSILON)
elif isinstance(atom, RegexSequence):
subatoms = atom.atoms
atom_def = " ".join(map(atom_refer, atom.atoms)) if atom.atoms else format_symbol(EPSILON)
else:
raise Exception(f"Unexpected atom type: {type(atom).__name__}")
graph.node(node_id, f"<{atom_label(atom)} = {atom_def}>")
for subatom in subatoms:
recurse(subatom)
for subatom in subatoms:
graph.edge(node_id, atom_node_id(subatom))
recurse(atom)
return graph
def regex_construction_steps(
regex_text: str, root_atom: RegexAtom, atom_numbers: dict[int, int]
) -> None:
steps = OrderedDict[str, list[RegexAtom]]()
def recurse(atom: RegexAtom) -> None:
subatoms: T.Iterable[RegexAtom]
if isinstance(atom, RegexSymbol):
subatoms = []
elif isinstance(atom, RegexRepetition):
subatoms = [atom.atom]
elif isinstance(atom, RegexSequence):
subatoms = atom.atoms
elif isinstance(atom, RegexUnion):
subatoms = atom.atoms
else:
raise Exception(f"Unexpected atom type: {type(atom).__name__}")
for subatom in subatoms:
recurse(subatom)
atom_text = regex_text[atom.start_pos : atom.end_pos]
steps.setdefault(atom_text, []).append(atom)
recurse(root_atom)
for i, (atom_text, similar_atoms) in enumerate(steps.items()):
step = i + 1
graph = RE2NFA(similar_atoms[0]).to_graph(hide_state_labels=True)
print(r"\begin{figure}[H]")
print(r"\centering")
print(r"\digraph[width=\maxwidth]{step%s}{" % step)
print("".join(graph.body), end="")
print(r"}")
atom_references = " = ".join("R_{%s}" % atom_numbers[id(atom)] for atom in similar_atoms)
print(
r"\caption{NFA construction step %s, \( %s = \text{``\texttt{%s}''} \).}"
% (step, atom_references, atom_text)
)
print(r"\end{figure}")
print()
def random_regex_example_string(atom: RegexAtom, shortest: bool = False) -> str:
def recurse(atom: RegexAtom) -> str:
if isinstance(atom, RegexSymbol):
return atom.character
elif isinstance(atom, RegexSequence):
return "".join(recurse(a) for a in atom.atoms)
elif isinstance(atom, RegexRepetition):
result = "" if atom.zero else recurse(atom.atom)
if not shortest:
while bool(random.randint(0, 1)):
result += recurse(atom.atom)
return result
elif isinstance(atom, RegexUnion):
if not shortest:
return recurse(random.choice(atom.atoms))
else:
return min(map(recurse, atom.atoms), key=len)
else:
raise Exception(f"Unexpected atom type: {type(atom).__name__}")
return recurse(atom)
def print_dfa_transitions(dfa: DFA[str, str]) -> None:
symbols = sorted(dfa.symbols)
print(r"\begin{tabular}{|r" + "|c" * len(symbols) + "|}")
print(r"\hline")
print("& " + " & ".join(f"${s}$" for s in symbols) + " \\\\")
for state in sorted(dfa.states):
transitions = (dfa.transition(state, symbol) or "--" for symbol in symbols)
marker = ""
if dfa.is_starting_state(state):
marker += r"$\rightarrow$ "
if dfa.is_final_state(state):
marker += r"$*$ "
print(r"\hline")
print(marker + state + " & " + " & ".join(transitions) + " \\\\")
print(r"\hline")
print(r"\end{tabular}")
def check_dfa(dfa: DFA[str, str], example: str) -> str:
current_state = dfa.starting_state
assert current_state is not None
path: list[str] = [current_state]
for c in example:
current_state = dfa.transition(current_state, c)
path.append(r"\xrightarrow{%s}" % c)
if current_state is not None:
path.append(current_state)
else:
path.append(r"\varnothing")
break
accepted = current_state is not None and dfa.is_final_state(current_state)
status = "accepted" if accepted else "rejected"
return r"``\texttt{%s}'': \( %s \Rightarrow \text{%s} \)" % (example, " ".join(path), status)
def main() -> None:
[_, author, regex_text, *argv_examples] = sys.argv
print(r"\documentclass[%spt,a4paper,titlepage,fleqn]{article}" % FONT_SIZE)
print(r"\usepackage[margin=3cm]{geometry}")
print(r"\usepackage[utf8]{inputenc}")
print(r"\usepackage[english]{babel}")
print(r"\usepackage{parskip}")
print(r"\usepackage{graphicx}")
print(r"\usepackage[table]{xcolor}")
print(r"\usepackage[colorlinks=true,allcolors=blue]{hyperref}")
print(r"\usepackage{amsmath}")
print(r"\usepackage{amssymb}")
print(r"\usepackage{mathtools}")
# print(r"\usepackage[euler]{textgreek}")
print(r"\usepackage[Euler]{upgreek}")
print(r"\usepackage{float}")
print(r"\usepackage{shellesc}")
print(r"\usepackage[pdf,singlefile]{graphviz}")
print(r"\usepackage{xpatch}")
print()
# print(r"\usepackage{tikz}")
# print(r"\usetikzlibrary{automata,positioning,graphs,graphdrawing}")
# print(r"\usegdlibrary{force}")
# print(r"\newcommand{\acceptingdoubledistance}{2pt}")
# # <https://github.com/pgf-tikz/pgf/blob/3.1.10/tex/generic/pgf/frontendlayer/tikz/libraries/tikzlibraryautomata.code.tex#L35>
# print(
# r"\tikzset{accepting by double/.style={double distance=\acceptingdoubledistance,outer sep=.5\pgflinewidth+.5\acceptingdoubledistance}}"
# )
# Taken from <https://www.overleaf.com/latex/templates/graphviz-drawing-example/xhrpxynbfxvw>
print(r"\makeatletter")
print(r"\newcommand*{\addGraphFileDependency}[1]{%")
print(r" \typeout{(#1)} \@addtofilelist{#1} \IfFileExists{#1}{}{\typeout{No file #1.}} }")
print(r"\makeatother")
print(r"\xpretocmd{\digraph}{\addGraphFileDependency{#2.dot}}{}{}")
print()
# <https://davidcarlisle.github.io/uk-tex-faq/FAQ-grmaxwidth.html>
# <https://tex.stackexchange.com/a/86355>
print(r"\makeatletter")
print(r"\newcommand{\maxwidth}{%")
print(r" \ifdim \Gin@nat@width>\linewidth \linewidth \else \Gin@nat@width \fi}")
print(r"\makeatother")
print()
print(r"\DeclareMathOperator{\move}{move}")
print(r"\DeclareMathOperator{\epscl}{\upvarepsilon-closure}")
print()
print(r"\title{ELAC Project 1 \\ ~ \\ Regular Expression ``\texttt{%s}''}" % regex_text)
print(r"\author{%s}" % author)
print(r"\date{\today}")
print()
print(r"\begin{document}")
print()
print(r"\maketitle")
print()
parser = RegexParser(regex_text)
regex = parser.parse()
print(r"\section{Example strings} \label{sec:examples}")
print()
examples = set[str](argv_examples)
attempt = 0
while len(examples) < 5 and attempt < 1000:
examples.add(random_regex_example_string(regex))
attempt += 1
examples_sorted = sorted(examples, key=lambda s: (len(s), s))
print(r"Here are some examples of strings accepted by our regular expression:")
print(r"\begin{enumerate}")
for s in examples_sorted:
print(r"\item ``\texttt{%s}''" % s)
print(r"\end{enumerate}")
print()
print(r"\section{Construction of an NFA}")
print()
atom_numbers = dict[int, int]()
tree = re2tree(regex, atom_numbers)
if RENDER_GRAPHS:
tree.render("elac_project_1_regex.gv", format="png", cleanup=True)
print(r"\begin{figure}[H]")
print(r"\centering")
print(r"\digraph[width=\maxwidth]{regex}{")
print("".join(tree.body), end="")
print(r"}")
print(
r"\caption{Decomposition of our regular expression into elementary "
r"sub-expressions $R_{i}$, presented as a tree in order to make the "
r"relations between the operators and sub-expressions easier to follow.}"
)
print(r"\end{figure}")
print()
print(
r"To build an NFA with $\upvarepsilon$-transitions equivalent to our regular "
r"expression, we will apply \textbf{Thompson's construction algorithm}. "
r"Construction steps of similar sub-expressions $R_{i}$ have been combined together."
)
print()
regex_construction_steps(regex_text, regex, atom_numbers)
nfa = RE2NFA(regex)
if RENDER_GRAPHS:
nfa.to_graph().render("elac_project_2_nfa.gv", format="png", cleanup=True)
print(r"Finally, to complete the NFA we assign numbers to all states as their names.")
print()
print(r"\begin{figure}[H]")
print(r"\centering")
print(r"\digraph[width=\maxwidth]{nfa}{")
print("".join(nfa.to_graph().body), end="")
print(r"}")
print(r"\caption{Diagram of the completed NFA.}")
print(r"\end{figure}")
print()
print(r"\section{Transformation of the NFA into a DFA}")
print()
print(
r"The $\upvarepsilon$-NFA is transformed into an equivalent DFA using the "
r"\textbf{subset construction algorithm}."
)
dfa = NFA2DFA(nfa).dfa
if RENDER_GRAPHS:
dfa.to_graph().render("elac_project_3_dfa.gv", format="png", cleanup=True)
print(
r"We can use the equivalences established by "
r"\( \epscl(\move(subset, symbol)) = next \: subset \) "
r"to build a transition table for a DFA, whose states will correspond one-to-one "
r"to the reachable subsets of the $\upvarepsilon$-NFA and be named accordingly. "
r"The transitions which result in an empty set $\varnothing$ will be filled simply "
"with blanks in the table."
)
print()
print(r"\begin{table}[H]")
print(r"\centering")
print_dfa_transitions(dfa)
print(
r"\caption{Transition table of our DFA. The starting state is marked with "
r"$\rightarrow$, and the final state%s %s marked with $*$.}"
% ("s" if len(dfa.final_states) != 1 else "", "are" if len(dfa.final_states) != 1 else "is")
)
print(r"\label{tab:dfa}")
print(r"\end{table}")
print()
print(r"\begin{figure}[H]")
print(r"\centering")
print(r"\digraph[width=\maxwidth]{dfa}{")
print("".join(dfa.to_graph().body), end="")
print(r"}")
print(r"\caption{Diagram of the completed DFA, constructed from table~\ref{tab:dfa}.}")
print(r"\end{figure}")
print()
print(r"Let us try checking our DFA with the example strings from section~\ref{sec:examples}:")
print(r"\begin{enumerate}")
for example in examples_sorted:
print(r"\item %s" % check_dfa(dfa, example))
print(r"\end{enumerate}")
print()
print(r"\section{Construction of a minimized DFA}")
print()
print(r"DFA minimization is performed using the \textbf{table-filling algorithm}.")
mindfa = minimize_dfa(dfa)
if RENDER_GRAPHS:
mindfa.to_graph().render("elac_project_4_mindfa.gv", format="png", cleanup=True)
print(r"\begin{table}[H]")
print(r"\centering")
print_dfa_transitions(mindfa)
print(
r"\caption{Transition table of our minimized DFA. The starting state is "
r"marked with $\rightarrow$, and the final state%s %s marked with $*$.}"
% (
"s" if len(mindfa.final_states) != 1 else "",
"are" if len(mindfa.final_states) != 1 else "is",
)
)
print(r"\label{tab:mindfa}")
print(r"\end{table}")
print()
print(r"\begin{figure}[H]")
print(r"\centering")
print(r"\digraph[width=\maxwidth]{mindfa}{")
print("".join(mindfa.to_graph().body), end="")
print(r"}")
print(r"\caption{Diagram of the minimized DFA, constructed from table~\ref{tab:mindfa}.}")
print(r"\end{figure}")
print()
print(
r"Let us try checking the minimized DFA with example strings from "
r"section~\ref{sec:examples} as well:"
)
print(r"\begin{enumerate}")
for example in examples_sorted:
print(r"\item %s" % check_dfa(mindfa, example))
print(r"\end{enumerate}")
print()
print(r"\end{document}")
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment