Created
December 6, 2009 08:14
-
-
Save SmileyChris/250128 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# Matches a float literal such as "1.", "1.5" or ".5", with an optional sign.
# Raw strings avoid invalid-escape warnings for "\d" on modern Pythons.
re_float = re.compile(r'[-+]?(\d+\.\d*|\.\d+)$')
# Matches a plain (optionally signed) integer literal.
re_decimal = re.compile(r'[-+]?\d+$')
class CompileError(Exception):
    """Raised when a token stream cannot be compiled into an expression."""
class Tokenizer(object):
    """
    A generic lexer that converts a text stream into a list of tokens which can
    then be parsed.
    """

    def __init__(self):
        """
        Initialize the symbol library.
        """
        # Maps each definition string to a (symbol_class, weight) pair.
        self._library = {}

    def register(self, symbol_class, weight, *definitions):
        """
        Register a symbol class to the tokenizer library, along with a weight
        and one or more definitions.

        Raises ``SyntaxError`` if no definitions are given, if
        ``symbol_class`` is not a ``Symbol`` subclass, or if a definition has
        already been registered.
        """
        if not definitions:
            raise SyntaxError('No symbol definitions provided.')
        if not issubclass(symbol_class, Symbol):
            raise SyntaxError('Tried to register a non Symbol.')
        for definition in definitions:
            if definition in self._library:
                raise SyntaxError('The "%s" definition has already been '
                                  'registered.' % definition)
            self._library[definition] = (symbol_class, weight)
        # Clear the compiled definitions regular expression (if it exists) so
        # it is lazily rebuilt to include the new definitions.
        if hasattr(self, '_re_definitions'):
            del self._re_definitions

    def scanner(self, text):
        """
        Split ``text`` into a list of bits: registered definitions and any
        remaining literal text, with whitespace discarded.
        """
        if not hasattr(self, '_re_definitions'):
            definitions = []
            # Add all definitions (ordered by longest first so that they match
            # first in the regular expression lookup).  Python 3 fix:
            # ``key=len, reverse=True`` replaces the removed ``cmp=`` argument
            # and ``dict.iterkeys()``; both sorts are stable on ties.
            for definition in sorted(self._library, key=len, reverse=True):
                if re.match(r'\w+$', definition):
                    # Word-like definitions must only match whole words.
                    definitions.append(r'\b%s\b' % re.escape(definition))
                else:
                    definitions.append(re.escape(definition))
            self._re_definitions = re.compile(r'\s+|(%s)' %
                                              '|'.join(definitions))
        return [bit for bit in self._re_definitions.split(text)
                if bit]

    def tokenize(self, text):
        """
        Convert ``text`` into a list of ``Symbol`` token instances, using
        ``unknown_bit`` for bits with no registered definition.
        """
        tokens = []
        for bit in self.scanner(text):
            if bit in self._library:
                symbol_class, weight = self._library[bit]
                tokens.append(symbol_class(bit, weight, tokenizer=self))
            else:
                tokens.append(self.unknown_bit(bit))
        return tokens

    def lookup(self, symbol_class):
        """
        Look up a symbol class in the library, returning the first matching
        symbol token or ``None`` if the class is not registered.
        """
        # BUG FIX: the library values are (class, weight) tuples, not token
        # instances, so the original ``isinstance`` test could never match and
        # this method always returned None.  Build a token from the first
        # definition registered for a subclass of ``symbol_class`` instead.
        for definition, (registered_class, weight) in self._library.items():
            if issubclass(registered_class, symbol_class):
                return registered_class(definition, weight, tokenizer=self)

    def unknown_bit(self, bit):
        """
        Build a token for a bit which has no registered definition.
        """
        return LiteralSymbol(bit, weight=0, tokenizer=self)
class Parser(object):
    """
    A parser which compiles token symbols from the generic Tokenizer lexer.
    """

    def __init__(self, tokens):
        """
        Initialize the parser, setting the token position to 0.
        """
        self._tokens = tokens
        self._pos = 0

    def next(self, peek=False):
        """
        Return the next token.

        If ``peek`` is True, the token position will not be advanced.
        Raises ``CompileError`` when no tokens remain.
        """
        if self.end:
            raise CompileError('No more symbols.')
        token = self._tokens[self._pos]
        self._pos += 0 if peek else 1
        return token

    @property
    def end(self):
        """
        Whether the end of the token stream has been reached.
        """
        return self._pos >= len(self._tokens)

    @property
    def next_weight(self):
        """
        Return the next token's weight, or -1 if there are no more tokens.
        """
        return -1 if self.end else self.next(peek=True).weight

    def compile(self, max_weight=-1, until=None):
        """
        Compile the tokens into a single expression, stopping when the next
        symbol has a higher weight than ``max_weight`` or the current symbol
        matches the class provided in ``until``.
        """
        symbol = self.next().compile_first(parser=self)
        symbol.compiled = True
        while True:
            keep_going = (until and isinstance(symbol, until)
                          or self.next_weight > max_weight)
            if not keep_going:
                break
            symbol = self.next().compile_next(symbol, parser=self)
            symbol.compiled = True
        return symbol
def compile(tokenizer, text):
    """
    A shortcut method to tokenize and parse text in one go.
    """
    return Parser(tokens=tokenizer.tokenize(text)).compile()
class Symbol(object):
    """
    A base class for all parser symbols which can be used by the Tokenizer.

    Subclasses should implement one or both of the ``compile_first`` or
    ``compile_next`` methods.
    """

    def __init__(self, value, weight, tokenizer):
        """
        Initialize the symbol (as an uncompiled token) and keep a reference to
        the tokenizer instance which was used (some symbols may wish to use the
        token library in their ``compile`` method).
        """
        self.value = value
        self.weight = weight
        self.compiled = False
        self.tokenizer = tokenizer

    def compile_first(self, parser):
        """
        Compile this token when it appears at the beginning of a language
        construct.

        For example, when parsing ``[var1, add, var2]``, the ``var1`` token
        would be compiled using ``compile_first``, then ``add`` would be
        compiled with ``compile_next``.

        The base implementation always raises ``CompileError``.
        """
        raise CompileError('Invalid use of "%s"' % self.value)

    def compile_next(self, previous, parser):
        """
        Compile this token when it appears inside the language construct.

        See also: the ``compile_first`` method.

        The base implementation always raises ``CompileError``.
        """
        raise CompileError('Invalid use of "%s"' % self.value)

    def __repr__(self):
        # Compiled symbols delegate to the subclass-specific representation.
        if self.compiled:
            return self.compiled_repr()
        return '%s(value=%r, weight=%s)' % (self.__class__.__name__,
                                            self.value, self.weight)

    def compiled_repr(self):
        """Representation used once the symbol has been compiled."""
        return '(%s)' % self.__class__.__name__.lower()
class LiteralSymbol(Symbol):
    """
    A literal value which renders as an int, a float, or the raw string.
    """

    def compile_first(self, parser):
        # A literal stands alone: it is already a complete expression.
        return self

    def render(self, **kwargs):
        value = self.value
        if re_decimal.match(value):
            return int(value)
        if re_float.match(value):
            return float(value)
        return value

    def compiled_repr(self):
        return repr(self.render())
class OperatorSymbol(Symbol):
    """
    A binary infix operator which compiles a left and a right operand.
    """

    def compile_next(self, previous, parser):
        # The already-compiled expression becomes the left operand; anything
        # binding more tightly than this operator forms the right one.
        self.first = previous
        self.second = parser.compile(self.weight)
        return self

    def compiled_repr(self):
        parts = (self.value, self.first, self.second)
        return '(%s %s %s)' % parts
def arithmetic_tester():
    """
    Return a method which can be used to test the lexer against basic addition
    / multiplication precedence rules.

    >>> test = arithmetic_tester()
    >>> test('1 + 2')
    (+ 1 2)
    3
    >>> test('1 + 2 * 3')
    (+ 1 (* 2 3))
    7
    >>> test('1 + 2 * 3 + 4')
    (+ (+ 1 (* 2 3)) 4)
    11
    """
    class Add(OperatorSymbol):
        def compile_first(self, parser):
            # A leading "+" is a no-op prefix: just compile what follows.
            return parser.compile()

        def render(self, **kwargs):
            return self.first.render(**kwargs) + self.second.render(**kwargs)

    class Multiply(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) * self.second.render(**kwargs)

    test_lexer = Tokenizer()
    test_lexer.register(Add, 10, '+')
    test_lexer.register(Multiply, 20, '*')

    def test(text):
        expression = compile(test_lexer, text)
        # print() works as a statement on Python 2 and a call on Python 3;
        # the original bare print statement is a SyntaxError on Python 3.
        print(expression)
        print(expression.render())
    return test
def boolean_tester():
    """
    Return a method which can be used to test the lexer against most boolean
    logic operators.

    >>> test = boolean_tester()
    >>> test('0 == 1')
    (== 0 1)
    False
    >>> test('1 == 1')
    (== 1 1)
    True
    >>> test('0 > 1')
    (> 0 1)
    False
    >>> test('1 > 0')
    (> 1 0)
    True
    >>> test('0 and 1')
    (and 0 1)
    0
    >>> test('0 or 1')
    (or 0 1)
    1
    >>> test('a in abc')
    (in 'a' 'abc')
    True
    >>> test('1 is 2')
    (is 1 2)
    False
    >>> test('! 0')
    (! 0)
    True
    >>> test('1 is not 2')
    (not (is 1 2))
    True
    >>> test('a not in bcd')
    (not (in 'a' 'bcd'))
    True
    >>> test('0 or 0 and 1')
    (or 0 (and 0 1))
    0
    >>> test('0 and 0 or 1')
    (or (and 0 0) 1)
    1
    >>> test('0 and ( 0 or 1 )')
    (and 0 (or 0 1))
    0
    >>> test('not 0 and not 0')
    (and (not 0) (not 0))
    True
    >>> test('1 < 2 and 1 < 3')
    (and (< 1 2) (< 1 3))
    True
    """
    class Not(Symbol):
        # Prefix "not"/"!" negates the following expression; used as an infix
        # token it is only valid as part of "x not in y".
        def compile_first(self, parser):
            self.next = parser.compile(self.weight)
            return self

        def compile_next(self, previous, parser):
            # Local renamed from "next" so it no longer shadows the builtin.
            next_token = parser.next()
            if not isinstance(next_token, In):
                token = self.tokenizer.lookup(In)
                in_definition = token and token.value or 'in'
                raise CompileError('%r expected to be followed by %r' %
                                   (self.value, in_definition))
            self.next = next_token.compile_next(previous, parser)
            self.next.compiled = True
            return self

        def render(self, **kwargs):
            return not self.next.render(**kwargs)

        def compiled_repr(self):
            return '(%s %s)' % (self.value, self.next)

    class And(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) and self.second.render(**kwargs)

    class Or(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) or self.second.render(**kwargs)

    class Greater(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) > self.second.render(**kwargs)

    class GreaterEqual(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) >= self.second.render(**kwargs)

    class Less(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) < self.second.render(**kwargs)

    class LessEqual(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) <= self.second.render(**kwargs)

    class Equals(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) == self.second.render(**kwargs)

    class GroupStart(Symbol):
        def compile_first(self, parser):
            # Compile everything up to the matching close paren.
            return parser.compile(until=GroupEnd)

    class GroupEnd(Symbol):
        def compile_next(self, previous, parser):
            return previous

    class In(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) in self.second.render(**kwargs)

    class Is(OperatorSymbol):
        def render(self, **kwargs):
            return self.first.render(**kwargs) is self.second.render(**kwargs)

        def compile_next(self, previous, parser):
            # Support "is not" by wrapping the compiled "is" in a Not token.
            inverse = isinstance(parser.next(peek=True), Not)
            if inverse:
                not_token = parser.next()
            expression = super(Is, self).compile_next(previous, parser)
            if inverse:
                expression.compiled = True
                not_token.next = expression
                return not_token
            return expression

    test_lexer = Tokenizer()
    # NOTE(review): Greater's weight of 10 is inconsistent with the other
    # comparisons (90); possibly a typo, kept as-is to preserve behaviour.
    test_lexer.register(Greater, 10, '>')
    test_lexer.register(GreaterEqual, 90, '>=')
    test_lexer.register(Less, 90, '<')
    test_lexer.register(LessEqual, 90, '<=')
    test_lexer.register(Equals, 90, '=', '==')
    test_lexer.register(Or, 50, 'or')
    test_lexer.register(And, 60, 'and')
    test_lexer.register(Not, 80, 'not', '!')
    test_lexer.register(GroupStart, 100, '(')
    test_lexer.register(GroupEnd, 100, ')')
    test_lexer.register(Is, 80, 'is')
    test_lexer.register(In, 80, 'in')

    def test(text):
        expression = compile(test_lexer, text)
        # print() works as a statement on Python 2 and a call on Python 3;
        # the original bare print statement is a SyntaxError on Python 3.
        print(expression)
        print(expression.render())
    return test
if __name__ == '__main__':
    # Run this module's doctests when executed directly.
    import doctest
    doctest.testmod()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment