Skip to content

Instantly share code, notes, and snippets.

Forked from blinks/
Created January 13, 2011 12:32
Show Gist options
  • Save nigma/777794 to your computer and use it in GitHub Desktop.
Save nigma/777794 to your computer and use it in GitHub Desktop.
Scanner: match text to generate tokens.
Adam Blinkinsop <[email protected]>
First, construct a scanner with the tokens you'd like to match described as
keyword arguments, using Python-syntax regular expressions.
WARNING: Group syntax in these expressions has an undefined effect.
>>> simple = Scan(ID=r'\w+')
You can now use this object to generate tokens by calling it with one or more
strings (multiple strings are concatenated before scanning):
>>> tokens = list(simple('hello'))
>>> len(tokens), tokens[0]
(1, Token("ID -> 'hello'"))
>>> print simple
ID -> \\w+
Characters that don't match will raise an exception with the location of the
error (see UnrecognizedCharacter); a failed assignment leaves `tokens` unchanged:
>>> tokens = list(simple('hello world'))
Traceback (most recent call last):
UnrecognizedCharacter: 1.5: couldn't match ' '
>>> len(tokens), tokens[0]
(1, Token("ID -> 'hello'"))
>>> simple = Scan(ID=r'\w+', SPACE=r'\s+')
>>> list(simple('hello world'))
[Token("ID -> 'hello'"), Token("SPACE -> ' '"), Token("ID -> 'world'")]
You can also ignore tokens to keep them from being generated.
>>> simple.ignore('SPACE')
>>> list(simple('hello world'))
[Token("ID -> 'hello'"), Token("ID -> 'world'")]
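While this scanner doesn't keep track of lines as it matches, error locations
are reported as `line.column`, computed from the newlines in the scanned text.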
import re
class Error(Exception):
    """The generic error class for this module.

    Every exception defined by this module derives from this class, so
    callers can catch ``Error`` to handle any scanner failure.
    """
class UnrecognizedCharacter(Error):
    """Raised when the text at a position doesn't match any tokens.

    Attributes:
        value: The ``(string, start, stop)`` triple passed to the constructor.
        char: The offending slice of the text, ``string[start:stop]``.
        span: Printable location of that slice, as returned by ``span_of``.
    """

    def __init__(self, string, start, stop=None):
        """Record the unmatched slice ``string[start:stop]`` and its location.

        Args:
            string: The full text that was being scanned.
            start: Index of the first character that could not be matched.
            stop: Index one past the end of the unmatched slice, or None to
                slice through the end of ``string``.
        """
        # Forward the arguments to the base class so that ``args`` is
        # populated; without this the exception cannot be pickled or copied.
        super(UnrecognizedCharacter, self).__init__(string, start, stop)
        self.value = (string, start, stop)
        self.char = string[start:stop]
        self.span = span_of(string, start, stop)

    def __str__(self):
        """Return ``"<span>: couldn't match <repr of unmatched text>"``."""
        return "%s: couldn't match %r" % (self.span, self.char)
class Token(object):
    """A token matched in a string.

    Attributes:
        value: The matched text.
        start: Index in ``string`` where the match begins.
        end: Index in ``string`` one past the end of the match.
        span: The ``(start, end)`` pair.
        pos: The position at which the regex was asked to start matching.
        token: Name of the last named group that matched (the token kind).
        string: The full text the match was made against.
    """

    def __init__(self, m):
        """Build a token from a regular-expression match object ``m``."""
        # The whole match is the token's text (the module's examples show
        # e.g. ``ID -> 'hello'``).
        self.value = m.group()
        self.start, self.end = m.span()
        self.span = (self.start, self.end)
        self.pos = m.pos
        self.token = m.lastgroup
        self.string = m.string

    def __repr__(self):
        """Return ``Token(<repr of str(self)>)``."""
        return 'Token(%r)' % str(self)

    def __str__(self):
        """Return ``"<token name> -> <repr of matched text>"``."""
        return '%s -> %r' % (self.token, self.value)
def span_of(string, start, stop):
    """Return a string representing the position of the slice [start:stop].

    Lines are numbered from 1 and columns from 0, so index 5 of a one-line
    string is reported as ``1.5``.

    Args:
        string: The text the slice refers to.
        start: Index of the first character of the slice.
        stop: Index one past the end of the slice, or None when only the
            start position should be reported.

    Returns:
        ``"line.col"`` when the slice covers at most one character (or
        ``stop`` is None), ``"line.col-col"`` when it covers several
        characters without a newline, and ``"line.col-line.col"`` when it
        contains at least one newline.
    """
    def column_of(p):
        # rfind returns -1 when no newline precedes p, so adding 1 yields the
        # index of the first character of p's line in both cases. This keeps
        # columns 0-based on every line, not just the first one.
        return p - (string.rfind('\n', 0, p) + 1)

    stline = string.count('\n', 0, start) + 1
    loc = '%i.%i' % (stline, column_of(start))

    # A missing stop or a single-character slice is just a position.
    if stop is None or stop <= start + 1:
        return loc

    newlines = string.count('\n', start, stop)
    if newlines:
        return loc + '-%i.%i' % (stline + newlines, column_of(stop))
    return loc + '-%i' % column_of(stop)
class Scan(object):
    """A scanner for a particular set of tokens (defined as keyword args).

    Each keyword argument maps a token name to a Python regular expression.
    The expressions are combined into one alternation of named groups, so at
    each position the alternatives are tried in the iteration order of the
    token dict.

    Attributes:
        tokens: Dict mapping token names to their regular-expression source.
        ignores: Set of token names that are matched but not yielded.
        regex: The compiled alternation built from ``tokens``.
    """

    def __init__(self, **tokens):
        """Create a scanner from ``name=regex`` keyword arguments."""
        self.tokens = tokens
        self.ignores = set()
        # Build the regex up front so the scanner is usable immediately.
        self.__compile()

    def __call__(self, *args, **opts):
        """Call on a string (or a list of strings) to generate tokens.

        The positional arguments are concatenated and scanned left to right.
        ``opts`` is accepted but not used.

        Yields:
            A ``Token`` for every match whose name is not in ``ignores``.

        Raises:
            UnrecognizedCharacter: When nothing matches at the current
                position, or when the match there is empty (an empty match
                cannot advance the scan and would otherwise loop forever).
        """
        # Start at the beginning of this text.
        text, pos = ''.join(args), 0
        while pos < len(text):
            # Match the text, looking for the next token.
            m = self.regex.match(text, pos)
            if m is None or m.end() == pos:
                # No usable token was found, raise an error.
                raise UnrecognizedCharacter(text, pos, pos + 1)
            pos = m.end()
            if m.lastgroup not in self.ignores:
                # Found a token that isn't ignored; hand it to the caller.
                yield Token(m)

    def __str__(self):
        """Return one ``name -> regex`` line per token."""
        return '\n'.join('%s -> %s' % (k, v) for (k, v) in self.tokens.items())

    def __compile(self):
        """Compile the dict of tokens into a regular expression."""
        self.regex = re.compile('|'.join(
            '(?P<%s>%s)' % (k, self.tokens[k]) for k in self.tokens))

    def update(self, **tokens):
        """Update the tokens matched with token=regex pairs."""
        self.tokens.update(tokens)
        # The combined pattern is stale once the token set changes.
        self.__compile()

    def ignore(self, key):
        """Ignore a particular token when it is found."""
        self.ignores.add(key)

    def unignore(self, key):
        """Remove a token from the ignores list.

        Removing a token that is not currently ignored is a no-op.
        """
        self.ignores.discard(key)
if __name__ == '__main__':
    # When run as a script, execute the usage examples in the module
    # docstring as tests.
    import doctest
    doctest.testmod()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment