Skip to content

Instantly share code, notes, and snippets.

@Julien00859
Last active November 12, 2018 00:36
Show Gist options
  • Save Julien00859/2ad2f6a0834d8d86a023d7166433a143 to your computer and use it in GitHub Desktop.
Save Julien00859/2ad2f6a0834d8d86a023d7166433a143 to your computer and use it in GitHub Desktop.
unamur_regexp
*.pyc
__pycache__
from sys import argv, exit
from finite_automaton import *
from parser import *
def main():
global auto
# re = "a*b(c|d)ef"
n0 = Node()
n1 = Node("f", n0)
n2 = Node("e", n1)
n3 = Node("e", n1)
n4 = Node("c", n3, "d", n2)
n5 = Node("b", n4)
n5.add("a", n5)
auto = Automaton(n5)
auto.show_tree()
match = auto.match(" ".join(argv[1:]))
if match:
print("match")
else:
print("no match")
auto = Automaton(groups(r"a.c"))
for re in [r"ab|cd|e", r"ab*c", r"a.c", r"a|,", r"a*|b*"]:
print()
print(re)
auto = groups(re)
auto.show_tree()
if __name__ == "__main__":
main()

Regexp

Just a toy repo to test stuff with regexp

from abc import abstractmethod
from collections import defaultdict
from itertools import chain, count
SIGMA = object()
class Node:
count = 0
def __init__(self, *pairs):
self.nexts = defaultdict(list)
self.prevs = defaultdict(list)
ipairs = iter(pairs)
for character, node in zip(ipairs, ipairs):
self.add(character, node)
self.id = Node.count
Node.count += 1
def add(self, *pairs):
ipairs = iter(pairs)
for character, node in zip(ipairs, ipairs):
self.nexts[character].append(node)
node.prevs[character].append(self)
def rem(self, *pairs):
ipairs = iter(pairs)
for character, node in zip(ipairs, ipairs):
self.nexts[character].remove(node)
node.prevs[character].remove(self)
def replace(self, node, character=None):
if character is None:
keys = list(self.prevs.keys())
if len(keys) != 1:
raise ValueError("Cannot determine previous transition to replace")
character = keys[0]
for prev_node in self.prevs[character]:
prev_node.rem(character, self)
prev_node.add(character, node)
def read(self, character):
return self.nexts.get(character, self.nexts.get(SIGMA, []))
@property
def is_terminal(self):
return len(self.nexts) == 0
def __repr__(self):
return "<Node #{}>".format(self.id)
def __str__(self):
lines = []
if not self.nexts:
return " ({:2d})→".format(self.id)
if not self.prevs or all(nodes == [self] for nodes in self.prevs.values()):
lines.append("→({:2d})".format(self.id))
for char, nodes in self.nexts.items():
for node in nodes:
lines.append(" ({:2d}) {} ({:2d})".format(
self.id,
(char or "ε") if char is not SIGMA else "Σ",
node.id))
return "\n".join(lines)
class Automaton:
def __init__(self, initial_node):
self.initial_node = initial_node
def match(self, string):
nodes = [self.initial_node]
for char in string:
next_nodes = []
for node in nodes:
next_nodes.extend(node.read(char))
if not next_nodes:
return False
nodes = next_nodes
for node in nodes:
if node.is_terminal:
return True
return False
def show_tree(self):
seen = set()
show = [self.initial_node]
while show:
next_nodes = set()
for node in show:
print(node)
seen.add(node)
next_nodes = next_nodes.union(*node.nexts.values())
show = next_nodes - seen
@classmethod
def determine(class_, initial_node):
raise NotImplementedError()
r"""
Minimal regexp language:
. : any character
| : or
(): group
* : any time
, : null
\ : espape
Exemple:
aa*b*(c|,)(d|e|f) = a+b*c?[def]
,|0|(1(01*0)1)*
"""
from finite_automaton import Node, Automaton, SIGMA as SIGMA_
ESCAPE = "\\"
OPEN = "("
CLOSE = ")"
OR = "|"
KLEENE = "*"
EPSILON = ","
SIGMA = "."
def groups(string):
entry_node = Node()
node_stack = [entry_node, Node()]
group_stack = [[]]
escape = False
for letter in string:
if escape:
escape = False
group_stack[-1].append(letter)
continue
if letter == ESCAPE:
escape = True
elif letter == OPEN:
new_in = Node()
new_out = Node()
node_stack[-2].add("", new_in)
new_out.add("", node_stack[-1])
node_stack.extend([new_in, new_out])
group_stack.append([])
elif letter == CLOSE:
substring = "".join(group_stack.pop())
out = node_stack.pop()
in_ = node_stack.pop()
choices(substring, in_, out)
else:
group_stack[-1].append(letter)
substring = "".join(group_stack.pop())
out = node_stack.pop()
in_ = node_stack.pop()
choices(substring, in_, out)
return Automaton(entry_node)
def choices(string, in_, out):
"""
┌─── a→ ───┐
a|b|c = (0)─ → ─(1)── b→ ──(2)─ → ─(3)
└─── c→ ───┘
"""
choice_stack = [[]]
escape = False
for letter in string:
if escape:
escape = False
choice_stack[-1].append(letter)
continue
if letter == ESCAPE:
escape = True
elif letter == "|":
choice_stack.append([])
else:
choice_stack[-1].append(letter)
choice_stack = map("".join, choice_stack)
for choice in choice_stack:
parse(choice, in_, out)
def parse(string, in_, out):
"""
abc = (0)── a→ ──(1)── b→ ──(2)── c→ ──(3)
"""
escape = False
nodes = [in_]
for letter in string:
if escape:
node = Node()
nodes[-1].add((letter, node))
nodes.append(node)
escape = False
continue
if letter == ESCAPE:
escape = True
elif letter == KLEENE:
prev_out = nodes.pop()
prev_in = nodes.pop()
nodes.extend(kleene(prev_in, prev_out))
else:
if letter == SIGMA:
letter = SIGMA_
elif letter == EPSILON:
letter = ""
node = Node()
nodes[-1].add(letter, node)
nodes.append(node)
nodes[-1].replace(out, "")
def kleene(in_, out):
"""
┌── a→ ─┐
a* = (0)── → ──(1)─ ← ─(2)── → ──(3)
└──────────── → ────────────┘
"""
new_out = Node()
new_in = Node("", new_out)
in_.replace(new_in)
new_in.add("", in_)
out.add("", in_, "", new_out)
return new_in, new_out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment