Simple tag expression matcher
# matcher.py -- lexer, recursive-descent parser, and matcher combinators for
# boolean tag expressions. Operators: '+' or 'and', ',' or 'or', '~' or 'not',
# plus parentheses for grouping; any other word is a tag identifier.

# Token kinds.
LPAR = 0
RPAR = 1
AND = 2
OR = 3
NOT = 4
IDENT = 5
EOF = 6

token_names = ('LPAR', 'RPAR', 'AND', 'OR', 'NOT', 'IDENT', 'EOF')
class Lexer(object):

    # Single-character operators.
    symbol_map = {
        '(': LPAR,
        ')': RPAR,
        '+': AND,
        ',': OR,
        '~': NOT,
    }

    # Word operators (matched case-insensitively).
    keyword_map = {
        'and': AND,
        'or': OR,
        'not': NOT,
    }

    def __init__(self, input):
        self.input = input
        self.curpos = -1
        self.nextpos = 0
        self.token = (EOF, '')

    def next(self):
        n = len(self.input)
        i = self.nextpos
        # Skip whitespace before the next token.
        while i < n and self.input[i].isspace():
            i += 1
        self.curpos = i
        if i == n:
            self.nextpos = i
            self.token = (EOF, '')
            return self.token
        c = self.input[i]
        if c in self.symbol_map:
            self.nextpos = i + 1
            self.token = (self.symbol_map[c], c)
            return self.token
        # Consume an identifier or keyword up to the next space or symbol.
        j = i + 1
        while j < n and not (self.input[j].isspace() or
                             self.input[j] in self.symbol_map):
            j += 1
        self.nextpos = j
        tok = self.input[i:j]
        self.token = (self.keyword_map.get(tok.lower(), IDENT), tok)
        return self.token

    def generate(self):
        if self.token[0] == EOF:
            self.next()
        while self.token[0] != EOF:
            yield self.token
            self.next()
class Node(object):

    def __init__(self, kind, *children):
        self.kind = kind
        self.children = tuple(children)

    def __eq__(self, rhs):
        return (isinstance(rhs, Node) and self.kind == rhs.kind and
                self.children == rhs.children)

    def __repr__(self):
        return 'Node(kind={!r}, {})'.format(
            self.kind, ', '.join(repr(c) for c in self.children))


class ParserException(Exception):
    pass
class Parser(object):
    # Grammar (recursive descent):
    #   expr := term (OR term)*
    #   term := atom (AND atom)*
    #   atom := IDENT | NOT atom | '(' expr ')'

    def __init__(self, lexer):
        self.lexer = lexer
        self.stash = None

    def parse(self):
        ret = self.parse_expr()
        tok = self._next_token()
        if tok[0] != EOF:
            raise ParserException('expected EOF, got {} at pos {}'.format(
                token_names[tok[0]], self.lexer.curpos + 1))
        return ret

    def parse_expr(self):
        return self._parse_nary(OR, self.parse_term)

    def parse_term(self):
        return self._parse_nary(AND, self.parse_atom)

    def parse_atom(self):
        tok = self._next_token()
        if tok[0] == IDENT:
            return tok[1]
        elif tok[0] == NOT:
            return Node(NOT, self.parse_atom())
        elif tok[0] == LPAR:
            ret = self.parse_expr()
            tok = self._next_token()
            if tok[0] != RPAR:
                raise ParserException('expected RPAR, got {} at pos {}'.format(
                    token_names[tok[0]], self.lexer.curpos + 1))
            return ret
        else:
            raise ParserException(
                'expected IDENT, NOT, or LPAR, got {} at pos {}'.format(
                    token_names[tok[0]], self.lexer.curpos + 1))

    def _next_token(self):
        # Return the token stashed by a previous _putback, if any.
        if self.stash is not None:
            ret = self.stash
            self.stash = None
            return ret
        else:
            return self.lexer.next()

    def _putback(self, token):
        assert self.stash is None
        self.stash = token

    def _parse_nary(self, kind, parse_child):
        # Parse one or more children separated by `kind` tokens; a single
        # child is returned as-is rather than wrapped in a Node.
        nodes = [parse_child()]
        tok = self._next_token()
        while tok[0] == kind:
            nodes.append(parse_child())
            tok = self._next_token()
        self._putback(tok)
        return nodes[0] if len(nodes) == 1 else Node(kind, *nodes)
class And(object):

    def __init__(self, *matchers):
        self.matchers = matchers

    def __call__(self, arg):
        return all(m(arg) for m in self.matchers)

    def __repr__(self):
        return 'And({})'.format(', '.join(repr(m) for m in self.matchers))


class Or(object):

    def __init__(self, *matchers):
        self.matchers = matchers

    def __call__(self, arg):
        return any(m(arg) for m in self.matchers)

    def __repr__(self):
        return 'Or({})'.format(', '.join(repr(m) for m in self.matchers))


class Not(object):

    def __init__(self, matcher):
        self.matcher = matcher

    def __call__(self, arg):
        return not self.matcher(arg)

    def __repr__(self):
        return 'Not({!r})'.format(self.matcher)


class Contains(object):

    def __init__(self, expected):
        self.expected = expected

    def __call__(self, arg):
        return self.expected in arg

    def __repr__(self):
        return 'Contains({!r})'.format(self.expected)
def get_matcher(expr):
    return _get_matcher(Parser(Lexer(expr)).parse())


def _get_matcher(ast):
    # Translate the AST into nested matcher callables; leaf nodes are plain
    # strings and become membership tests via Contains.
    if isinstance(ast, Node):
        if ast.kind == AND:
            return And(*[_get_matcher(c) for c in ast.children])
        elif ast.kind == OR:
            return Or(*[_get_matcher(c) for c in ast.children])
        elif ast.kind == NOT:
            return Not(_get_matcher(ast.children[0]))
        else:
            raise RuntimeError('unknown AST type: {!r}'.format(ast.kind))
    else:
        return Contains(ast)
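A minimal usage sketch of the get_matcher API (the expression and tag lists below are illustrative, not from the original gist; they mirror the semantics exercised by the tests):

from matcher import get_matcher

# Compile a tag expression into a callable matcher.
match = get_matcher('(python, parsing) + ~draft')

# The matcher accepts any container of tags and tests membership.
print(match(['python', 'tutorial']))   # True: has 'python', no 'draft'
print(match(['parsing', 'draft']))     # False: 'draft' is present
print(match(['haskell']))              # False: neither 'python' nor 'parsing'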
# Tests for matcher.py; run with pytest.
import pytest

from matcher import (AND, IDENT, LPAR, NOT, OR, RPAR, Lexer, Node, Parser,
                     ParserException, get_matcher)


@pytest.mark.parametrize('input, output', [
    ('a + b', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
    ('((anderson and oreo) or not notton)', [
        (LPAR, '('),
        (LPAR, '('),
        (IDENT, 'anderson'),
        (AND, 'and'),
        (IDENT, 'oreo'),
        (RPAR, ')'),
        (OR, 'or'),
        (NOT, 'not'),
        (IDENT, 'notton'),
        (RPAR, ')'),
    ]),
    ('Dumb And Dumber', [(IDENT, 'Dumb'), (AND, 'And'), (IDENT, 'Dumber')]),
    (' a\n +\n b \n', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
    ('a+b', [(IDENT, 'a'), (AND, '+'), (IDENT, 'b')]),
])
def test_lexer(input, output):
    assert list(Lexer(input).generate()) == output
@pytest.mark.parametrize('input, output', [
    ('a + b', Node(AND, 'a', 'b')),
    ('(a and b)', Node(AND, 'a', 'b')),
    ('((a and ((b))))', Node(AND, 'a', 'b')),
    ('((anderson and oreo) or not notton)',
     Node(OR,
          Node(AND, 'anderson', 'oreo'),
          Node(NOT, 'notton'))),
    ('Dumb And Dumber', Node(AND, 'Dumb', 'Dumber')),
    ('a and b or c', Node(OR, Node(AND, 'a', 'b'), 'c')),
    ('a and (b or c)', Node(AND, 'a', Node(OR, 'b', 'c'))),
    ('a or b and c', Node(OR, 'a', Node(AND, 'b', 'c'))),
    ('a and b and c', Node(AND, 'a', 'b', 'c')),
    ('a or b or c', Node(OR, 'a', 'b', 'c')),
    ('not a or b', Node(OR, Node(NOT, 'a'), 'b')),
    ('a or not b', Node(OR, 'a', Node(NOT, 'b'))),
    ('not (a or b)', Node(NOT, Node(OR, 'a', 'b'))),
    ('~a+b,c', Node(OR, Node(AND, Node(NOT, 'a'), 'b'), 'c')),
    ('a,b,c,d', Node(OR, 'a', 'b', 'c', 'd')),
    ('a+b+c+d', Node(AND, 'a', 'b', 'c', 'd')),
    ('~a+b+~c+d', Node(AND, Node(NOT, 'a'), 'b', Node(NOT, 'c'), 'd')),
    ('~a+b+c,~d',
     Node(OR,
          Node(AND, Node(NOT, 'a'), 'b', 'c'),
          Node(NOT, 'd'))),
    ('(a,b)+c', Node(AND, Node(OR, 'a', 'b'), 'c')),
])
def test_parser(input, output):
    assert Parser(Lexer(input)).parse() == output
@pytest.mark.parametrize('input, error', [
    ('', "expected IDENT, NOT, or LPAR, got EOF at pos 1"),
    ('a +', "expected IDENT, NOT, or LPAR, got EOF at pos 4"),
    ('a a', "expected EOF, got IDENT at pos 3"),
    ('a (and) b', "expected EOF, got LPAR at pos 3"),
    ('and and and', "expected IDENT, NOT, or LPAR, got AND at pos 1"),
    ('(())', "expected IDENT, NOT, or LPAR, got RPAR at pos 3"),
])
def test_parser_error(input, error):
    parser = Parser(Lexer(input))
    try:
        parser.parse()
    except ParserException as e:
        # str(e) works on both Python 2 and 3; exceptions have no .message
        # attribute on Python 3.
        assert str(e) == error
    else:
        assert False, 'exception should have been thrown'
@pytest.mark.parametrize('input, arg, result', [
    ('a', ['a'], True),
    ('a or b', ['a'], True),
    ('a or b', ['b'], True),
    ('a or b', ['c'], False),
    ('a and b or c', ['a'], False),
    ('a and b or c', ['b'], False),
    ('a and b or c', ['a', 'b'], True),
    ('a and b or c', ['c'], True),
    ('not a', ['a'], False),
    ('not a', ['b'], True),
    ('(not a or not b) and (c or d)', ['a'], False),
    ('(not a or not b) and (c or d)', ['a', 'b'], False),
    ('(not a or not b) and (c or d)', ['a', 'c'], True),
    ('(not a or not b) and (c or d)', ['a', 'b', 'c'], False),
    ('(not a or not b) and (c or d)', ['e'], False),
    ('(not a or not b) and (c or d)', ['c'], True),
    ('(not a or not b) and (c or d)', ['d'], True),
    ('(not a or not b) and (c or d)', ['c', 'e'], True),
])
def test_get_matcher(input, arg, result):
    assert get_matcher(input)(arg) == result
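A quick way to run the suite from Python (assuming the two files are saved as matcher.py and, following pytest's naming convention, test_matcher.py; the test file name is an assumption):

import pytest

# Collect and run the test module verbosely; returns a pytest exit code.
# 'test_matcher.py' is a hypothetical file name for the tests above.
pytest.main(['-v', 'test_matcher.py'])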