# Gist by @SmileyChris, created December 6, 2009
# https://gist.github.com/SmileyChris/250128

import re

re_float = re.compile(r'[-+]?(\d+\.\d*|\.\d+)$')
re_decimal = re.compile(r'[-+]?\d+$')


class CompileError(Exception):
    pass


class Tokenizer:
    """
    A generic lexer that converts a text stream into a list of tokens which
    can then be parsed.
    """

    def __init__(self):
        """
        Initialize the symbol library.
        """
        self._library = {}

    def register(self, symbol_class, weight, *definitions):
        """
        Register a symbol class in the tokenizer library, along with a
        weight and one or more definitions.
        """
        if not definitions:
            raise SyntaxError('No symbol definitions provided.')
        if not issubclass(symbol_class, Symbol):
            raise SyntaxError('Tried to register a non-Symbol.')
        for definition in definitions:
            if definition in self._library:
                raise SyntaxError('The "%s" definition has already been '
                                  'registered.' % definition)
            self._library[definition] = (symbol_class, weight)
        # Clear the compiled definitions regular expression (if it exists).
        if hasattr(self, '_re_definitions'):
            del self._re_definitions

    def scanner(self, text):
        if not hasattr(self, '_re_definitions'):
            definitions = []
            # Add all definitions, longest first, so that longer symbols
            # match before their prefixes in the regular expression lookup.
            for definition in sorted(self._library, key=len, reverse=True):
                if re.match(r'\w+$', definition):
                    # Word-like definitions only match on word boundaries.
                    definitions.append(r'\b%s\b' % re.escape(definition))
                else:
                    definitions.append(re.escape(definition))
            self._re_definitions = re.compile(r'\s+|(%s)' %
                                              '|'.join(definitions))
        return [bit for bit in self._re_definitions.split(text) if bit]

    def tokenize(self, text):
        tokens = []
        for bit in self.scanner(text):
            if bit in self._library:
                symbol_class, weight = self._library[bit]
                tokens.append(symbol_class(bit, weight, tokenizer=self))
            else:
                tokens.append(self.unknown_bit(bit))
        return tokens

    def lookup(self, symbol_class):
        """
        Look up a symbol class in the library, returning a token for the
        first matching definition or ``None`` if the class is not
        registered.
        """
        for definition, (cls, weight) in self._library.items():
            if issubclass(cls, symbol_class):
                return cls(definition, weight, tokenizer=self)

    def unknown_bit(self, bit):
        return LiteralSymbol(bit, weight=0, tokenizer=self)
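

# Illustrative sketch: what the Tokenizer produces before any parsing
# happens. ``Plus`` is a hypothetical symbol defined only for this doctest;
# it is not part of the module itself.
def tokenizer_example():
    """
    >>> class Plus(OperatorSymbol):
    ...     pass
    >>> lexer = Tokenizer()
    >>> lexer.register(Plus, 10, '+')
    >>> lexer.tokenize('1 + 2')
    [LiteralSymbol(value='1', weight=0), Plus(value='+', weight=10), LiteralSymbol(value='2', weight=0)]
    """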


class Parser:
    """
    A parser which compiles token symbols from the generic Tokenizer lexer.
    """

    def __init__(self, tokens):
        """
        Initialize the parser, setting the token position to 0.
        """
        self._tokens = tokens
        self._pos = 0

    def next(self, peek=False):
        """
        Return the next token.

        If ``peek`` is True, the token position will not be advanced.
        """
        if self.end:
            raise CompileError('No more symbols.')
        symbol = self._tokens[self._pos]
        if not peek:
            self._pos += 1
        return symbol

    @property
    def end(self):
        """
        Calculate whether the end of the token stream has been reached.
        """
        return self._pos >= len(self._tokens)

    @property
    def next_weight(self):
        """
        Return the next token's weight, or -1 if there are no more tokens.
        """
        return -1 if self.end else self.next(peek=True).weight

    def compile(self, max_weight=-1, until=None):
        """
        Compile the tokens into a single expression, consuming tokens while
        the next symbol's weight exceeds ``max_weight`` or while the current
        symbol matches the class provided in ``until``.
        """
        token = self.next()
        symbol = token.compile_first(parser=self)
        symbol.compiled = True
        while (until and isinstance(symbol, until)
               or self.next_weight > max_weight):
            next_token = self.next()
            symbol = next_token.compile_next(symbol, parser=self)
            symbol.compiled = True
        return symbol
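

# Illustrative walkthrough of the precedence-climbing loop above: drive the
# Parser by hand with a hypothetical ``Add`` symbol (defined only for this
# doctest) and watch a single compiled expression come out.
def parser_example():
    """
    >>> class Add(OperatorSymbol):
    ...     def render(self, **kwargs):
    ...         return self.first.render(**kwargs) + self.second.render(**kwargs)
    >>> lexer = Tokenizer()
    >>> lexer.register(Add, 10, '+')
    >>> parser = Parser(lexer.tokenize('1 + 2'))
    >>> expression = parser.compile()
    >>> expression
    (+ 1 2)
    >>> expression.render()
    3
    """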


def compile(tokenizer, text):
    """
    A shortcut function to tokenize and parse text in one go.

    Note that this shadows the ``compile`` built-in within this module.
    """
    tokens = tokenizer.tokenize(text)
    return Parser(tokens=tokens).compile()


class Symbol:
    """
    A base class for all parser symbols which can be used by the Tokenizer.

    Subclasses should implement one or both of the ``compile_first`` and
    ``compile_next`` methods.
    """

    def __init__(self, value, weight, tokenizer):
        """
        Initialize the symbol (as an uncompiled token) and keep a reference
        to the tokenizer instance which was used (some symbols may wish to
        use the token library in their compile methods).
        """
        self.value = value
        self.weight = weight
        self.compiled = False
        self.tokenizer = tokenizer

    def compile_first(self, parser):
        """
        Compile this token when it appears at the beginning of a language
        construct.

        For example, when parsing ``[var1, add, var2]``, the ``var1`` token
        would be compiled using ``compile_first``, then ``add`` would be
        compiled with ``compile_next``.
        """
        raise CompileError('Invalid use of "%s"' % self.value)

    def compile_next(self, previous, parser):
        """
        Compile this token when it appears inside the language construct.

        See also: the ``compile_first`` method.
        """
        raise CompileError('Invalid use of "%s"' % self.value)

    def __repr__(self):
        if not self.compiled:
            return '%s(value=%r, weight=%s)' % (self.__class__.__name__,
                                                self.value, self.weight)
        return self.compiled_repr()

    def compiled_repr(self):
        return '(%s)' % self.__class__.__name__.lower()


class LiteralSymbol(Symbol):

    def compile_first(self, parser):
        return self

    def render(self, **kwargs):
        # Literals render as ints or floats when they look numeric,
        # otherwise as the raw string value.
        if re_decimal.match(self.value):
            return int(self.value)
        if re_float.match(self.value):
            return float(self.value)
        return self.value

    def compiled_repr(self):
        return repr(self.render())


class OperatorSymbol(Symbol):
    """
    A binary infix operator: compile the right-hand side (binding anything
    with a higher weight) and keep both operands.
    """

    def compile_next(self, previous, parser):
        self.first = previous
        self.second = parser.compile(self.weight)
        return self

    def compiled_repr(self):
        return '(%s %s %s)' % (self.value, self.first, self.second)
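

# A sketch of how new operators plug in: subclass OperatorSymbol, implement
# ``render`` and register a definition. ``Subtract`` is hypothetical and
# exists only for this doctest.
def subtract_example():
    """
    >>> class Subtract(OperatorSymbol):
    ...     def render(self, **kwargs):
    ...         return self.first.render(**kwargs) - self.second.render(**kwargs)
    >>> lexer = Tokenizer()
    >>> lexer.register(Subtract, 10, '-')
    >>> expression = compile(lexer, '5 - 2')
    >>> expression
    (- 5 2)
    >>> expression.render()
    3
    """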


def arithmetic_tester():
    """
    Return a function which can be used to test the lexer against basic
    addition / multiplication precedence rules.

    >>> test = arithmetic_tester()
    >>> test('1 + 2')
    (+ 1 2)
    3
    >>> test('1 + 2 * 3')
    (+ 1 (* 2 3))
    7
    >>> test('1 + 2 * 3 + 4')
    (+ (+ 1 (* 2 3)) 4)
    11
    """
    class Add(OperatorSymbol):

        def compile_first(self, parser):
            return parser.compile()

        def render(self, **kwargs):
            return self.first.render(**kwargs) + self.second.render(**kwargs)

    class Multiply(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) * self.second.render(**kwargs)

    test_lexer = Tokenizer()
    test_lexer.register(Add, 10, '+')
    test_lexer.register(Multiply, 20, '*')

    def test(text):
        expression = compile(test_lexer, text)
        print(expression)
        print(expression.render())

    return test
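

# A sketch of getting right-associativity out of the same machinery: compile
# the right-hand side with ``self.weight - 1`` so an operator of equal
# weight binds to the right. ``Power`` is hypothetical, defined only here.
def power_example():
    """
    >>> class Power(OperatorSymbol):
    ...     def compile_next(self, previous, parser):
    ...         self.first = previous
    ...         self.second = parser.compile(self.weight - 1)
    ...         return self
    ...     def render(self, **kwargs):
    ...         return self.first.render(**kwargs) ** self.second.render(**kwargs)
    >>> lexer = Tokenizer()
    >>> lexer.register(Power, 30, '**')
    >>> expression = compile(lexer, '2 ** 3 ** 2')
    >>> expression
    (** 2 (** 3 2))
    >>> expression.render()
    512
    """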


def boolean_tester():
    """
    Return a function which can be used to test the lexer against most
    boolean logic operators.

    >>> test = boolean_tester()
    >>> test('0 == 1')
    (== 0 1)
    False
    >>> test('1 == 1')
    (== 1 1)
    True
    >>> test('0 > 1')
    (> 0 1)
    False
    >>> test('1 > 0')
    (> 1 0)
    True
    >>> test('0 and 1')
    (and 0 1)
    0
    >>> test('0 or 1')
    (or 0 1)
    1
    >>> test('a in abc')
    (in 'a' 'abc')
    True
    >>> test('1 is 2')
    (is 1 2)
    False
    >>> test('! 0')
    (! 0)
    True
    >>> test('1 is not 2')
    (not (is 1 2))
    True
    >>> test('a not in bcd')
    (not (in 'a' 'bcd'))
    True
    >>> test('0 or 0 and 1')
    (or 0 (and 0 1))
    0
    >>> test('0 and 0 or 1')
    (or (and 0 0) 1)
    1
    >>> test('0 and ( 0 or 1 )')
    (and 0 (or 0 1))
    0
    >>> test('not 0 and not 0')
    (and (not 0) (not 0))
    True
    >>> test('1 < 2 and 1 < 3')
    (and (< 1 2) (< 1 3))
    True
    """
    class Not(Symbol):

        def compile_first(self, parser):
            self.next = parser.compile(self.weight)
            return self

        def compile_next(self, previous, parser):
            # ``not`` used as an infix operator must be followed by ``in``
            # (as in ``a not in b``).
            next_token = parser.next()
            if not isinstance(next_token, In):
                token = self.tokenizer.lookup(In)
                in_definition = token.value if token else 'in'
                raise CompileError('%r expected to be followed by %r' %
                                   (self.value, in_definition))
            self.next = next_token.compile_next(previous, parser)
            self.next.compiled = True
            return self

        def render(self, **kwargs):
            return not self.next.render(**kwargs)

        def compiled_repr(self):
            return '(%s %s)' % (self.value, self.next)

    class And(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) and self.second.render(**kwargs)

    class Or(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) or self.second.render(**kwargs)

    class Greater(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) > self.second.render(**kwargs)

    class GreaterEqual(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) >= self.second.render(**kwargs)

    class Less(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) < self.second.render(**kwargs)

    class LessEqual(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) <= self.second.render(**kwargs)

    class Equals(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) == self.second.render(**kwargs)

    class GroupStart(Symbol):

        def compile_first(self, parser):
            return parser.compile(until=GroupEnd)

    class GroupEnd(Symbol):

        def compile_next(self, previous, parser):
            return previous

    class In(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) in self.second.render(**kwargs)

    class Is(OperatorSymbol):

        def render(self, **kwargs):
            return self.first.render(**kwargs) is self.second.render(**kwargs)

        def compile_next(self, previous, parser):
            # Handle ``is not`` by consuming a following Not token and
            # wrapping the compiled comparison in it.
            inverse = isinstance(parser.next(peek=True), Not)
            if inverse:
                not_token = parser.next()
            expression = super().compile_next(previous, parser)
            if inverse:
                expression.compiled = True
                not_token.next = expression
                return not_token
            return expression

    test_lexer = Tokenizer()
    test_lexer.register(Greater, 90, '>')
    test_lexer.register(GreaterEqual, 90, '>=')
    test_lexer.register(Less, 90, '<')
    test_lexer.register(LessEqual, 90, '<=')
    test_lexer.register(Equals, 90, '=', '==')
    test_lexer.register(Or, 50, 'or')
    test_lexer.register(And, 60, 'and')
    test_lexer.register(Not, 80, 'not', '!')
    test_lexer.register(GroupStart, 100, '(')
    test_lexer.register(GroupEnd, 100, ')')
    test_lexer.register(Is, 80, 'is')
    test_lexer.register(In, 80, 'in')

    def test(text):
        expression = compile(test_lexer, text)
        print(expression)
        print(expression.render())

    return test
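

# A sketch of one way the ``**kwargs`` threaded through every ``render``
# call could be used: a hypothetical literal subclass that resolves names
# against a context dict instead of returning the raw string. Both classes
# here exist only for this doctest.
def context_example():
    """
    >>> class ContextLiteral(LiteralSymbol):
    ...     def render(self, **kwargs):
    ...         return kwargs.get(self.value, super().render(**kwargs))
    >>> class Add(OperatorSymbol):
    ...     def render(self, **kwargs):
    ...         return self.first.render(**kwargs) + self.second.render(**kwargs)
    >>> lexer = Tokenizer()
    >>> lexer.unknown_bit = lambda bit: ContextLiteral(bit, weight=0, tokenizer=lexer)
    >>> lexer.register(Add, 10, '+')
    >>> compile(lexer, 'x + 2').render(x=40)
    42
    """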


if __name__ == '__main__':
    from doctest import testmod
    testmod()