Just a toy repo to test stuff with regexp
Last active
November 12, 2018 00:36
-
-
Save Julien00859/2ad2f6a0834d8d86a023d7166433a143 to your computer and use it in GitHub Desktop.
unamur_regexp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.pyc | |
__pycache__ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sys import argv, exit | |
from finite_automaton import * | |
from parser import * | |
def main(): | |
global auto | |
# re = "a*b(c|d)ef" | |
n0 = Node() | |
n1 = Node("f", n0) | |
n2 = Node("e", n1) | |
n3 = Node("e", n1) | |
n4 = Node("c", n3, "d", n2) | |
n5 = Node("b", n4) | |
n5.add("a", n5) | |
auto = Automaton(n5) | |
auto.show_tree() | |
match = auto.match(" ".join(argv[1:])) | |
if match: | |
print("match") | |
else: | |
print("no match") | |
auto = Automaton(groups(r"a.c")) | |
for re in [r"ab|cd|e", r"ab*c", r"a.c", r"a|,", r"a*|b*"]: | |
print() | |
print(re) | |
auto = groups(re) | |
auto.show_tree() | |
if __name__ == "__main__": | |
main() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from abc import abstractmethod | |
from collections import defaultdict | |
from itertools import chain, count | |
SIGMA = object() | |
class Node: | |
count = 0 | |
def __init__(self, *pairs): | |
self.nexts = defaultdict(list) | |
self.prevs = defaultdict(list) | |
ipairs = iter(pairs) | |
for character, node in zip(ipairs, ipairs): | |
self.add(character, node) | |
self.id = Node.count | |
Node.count += 1 | |
def add(self, *pairs): | |
ipairs = iter(pairs) | |
for character, node in zip(ipairs, ipairs): | |
self.nexts[character].append(node) | |
node.prevs[character].append(self) | |
def rem(self, *pairs): | |
ipairs = iter(pairs) | |
for character, node in zip(ipairs, ipairs): | |
self.nexts[character].remove(node) | |
node.prevs[character].remove(self) | |
def replace(self, node, character=None): | |
if character is None: | |
keys = list(self.prevs.keys()) | |
if len(keys) != 1: | |
raise ValueError("Cannot determine previous transition to replace") | |
character = keys[0] | |
for prev_node in self.prevs[character]: | |
prev_node.rem(character, self) | |
prev_node.add(character, node) | |
def read(self, character): | |
return self.nexts.get(character, self.nexts.get(SIGMA, [])) | |
@property | |
def is_terminal(self): | |
return len(self.nexts) == 0 | |
def __repr__(self): | |
return "<Node #{}>".format(self.id) | |
def __str__(self): | |
lines = [] | |
if not self.nexts: | |
return " ({:2d})→".format(self.id) | |
if not self.prevs or all(nodes == [self] for nodes in self.prevs.values()): | |
lines.append("→({:2d})".format(self.id)) | |
for char, nodes in self.nexts.items(): | |
for node in nodes: | |
lines.append(" ({:2d}) {} ({:2d})".format( | |
self.id, | |
(char or "ε") if char is not SIGMA else "Σ", | |
node.id)) | |
return "\n".join(lines) | |
class Automaton: | |
def __init__(self, initial_node): | |
self.initial_node = initial_node | |
def match(self, string): | |
nodes = [self.initial_node] | |
for char in string: | |
next_nodes = [] | |
for node in nodes: | |
next_nodes.extend(node.read(char)) | |
if not next_nodes: | |
return False | |
nodes = next_nodes | |
for node in nodes: | |
if node.is_terminal: | |
return True | |
return False | |
def show_tree(self): | |
seen = set() | |
show = [self.initial_node] | |
while show: | |
next_nodes = set() | |
for node in show: | |
print(node) | |
seen.add(node) | |
next_nodes = next_nodes.union(*node.nexts.values()) | |
show = next_nodes - seen | |
@classmethod | |
def determine(class_, initial_node): | |
raise NotImplementedError() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
r""" | |
Minimal regexp language: | |
. : any character | |
| : or | |
(): group | |
* : any time | |
, : null | |
\ : espape | |
Exemple: | |
aa*b*(c|,)(d|e|f) = a+b*c?[def] | |
,|0|(1(01*0)1)* | |
""" | |
from finite_automaton import Node, Automaton, SIGMA as SIGMA_ | |
ESCAPE = "\\" | |
OPEN = "(" | |
CLOSE = ")" | |
OR = "|" | |
KLEENE = "*" | |
EPSILON = "," | |
SIGMA = "." | |
def groups(string): | |
entry_node = Node() | |
node_stack = [entry_node, Node()] | |
group_stack = [[]] | |
escape = False | |
for letter in string: | |
if escape: | |
escape = False | |
group_stack[-1].append(letter) | |
continue | |
if letter == ESCAPE: | |
escape = True | |
elif letter == OPEN: | |
new_in = Node() | |
new_out = Node() | |
node_stack[-2].add("", new_in) | |
new_out.add("", node_stack[-1]) | |
node_stack.extend([new_in, new_out]) | |
group_stack.append([]) | |
elif letter == CLOSE: | |
substring = "".join(group_stack.pop()) | |
out = node_stack.pop() | |
in_ = node_stack.pop() | |
choices(substring, in_, out) | |
else: | |
group_stack[-1].append(letter) | |
substring = "".join(group_stack.pop()) | |
out = node_stack.pop() | |
in_ = node_stack.pop() | |
choices(substring, in_, out) | |
return Automaton(entry_node) | |
def choices(string, in_, out): | |
""" | |
┌─── a→ ───┐ | |
a|b|c = (0)─ → ─(1)── b→ ──(2)─ → ─(3) | |
└─── c→ ───┘ | |
""" | |
choice_stack = [[]] | |
escape = False | |
for letter in string: | |
if escape: | |
escape = False | |
choice_stack[-1].append(letter) | |
continue | |
if letter == ESCAPE: | |
escape = True | |
elif letter == "|": | |
choice_stack.append([]) | |
else: | |
choice_stack[-1].append(letter) | |
choice_stack = map("".join, choice_stack) | |
for choice in choice_stack: | |
parse(choice, in_, out) | |
def parse(string, in_, out): | |
""" | |
abc = (0)── a→ ──(1)── b→ ──(2)── c→ ──(3) | |
""" | |
escape = False | |
nodes = [in_] | |
for letter in string: | |
if escape: | |
node = Node() | |
nodes[-1].add((letter, node)) | |
nodes.append(node) | |
escape = False | |
continue | |
if letter == ESCAPE: | |
escape = True | |
elif letter == KLEENE: | |
prev_out = nodes.pop() | |
prev_in = nodes.pop() | |
nodes.extend(kleene(prev_in, prev_out)) | |
else: | |
if letter == SIGMA: | |
letter = SIGMA_ | |
elif letter == EPSILON: | |
letter = "" | |
node = Node() | |
nodes[-1].add(letter, node) | |
nodes.append(node) | |
nodes[-1].replace(out, "") | |
def kleene(in_, out): | |
""" | |
┌── a→ ─┐ | |
a* = (0)── → ──(1)─ ← ─(2)── → ──(3) | |
└──────────── → ────────────┘ | |
""" | |
new_out = Node() | |
new_in = Node("", new_out) | |
in_.replace(new_in) | |
new_in.add("", in_) | |
out.add("", in_, "", new_out) | |
return new_in, new_out |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment