@airekans
Created January 22, 2014 08:05
A simple parser based on the lexer mentioned in https://github.com/alanszlosek/mercussion/blob/master/lexer.py
#! /usr/bin/env python
#
# The grammar for the parsed language is very simple.
# It looks like a variation of JSON.
# Here's a simple valid expression:
#
#     {
#         a 1,
#         b 2,
#         c {
#             c_1 3,
#             c_2 4
#         }
#     }
#
# And the expression will be parsed into a dict in Python.
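#
# For the example above, the resulting dict (as produced by parse_set below)
# would be:
#
#     {'a': 1, 'b': 2, 'c': {'c_1': 3, 'c_2': 4}}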
#
import re


class UnknownTokenError(Exception):
    """ Raised when an unknown token is encountered in the token stream.
    It holds the line number and the offending token.
    """

    def __init__(self, token, lineno):
        self.token = token
        self.lineno = lineno

    def __str__(self):
        return "Line #%s, Found token: %s" % (self.lineno, self.token)


class _InputScanner(object):
    """ This class manages the scanning of a specific input. An instance
    of it is returned when scan() is called. It is built to be iterated
    over, and is mainly meant to be used by the Lexer, not directly.
    """

    def __init__(self, lexer, input):
        """ Put the lexer into this instance so the callbacks can
        reference it if needed.
        """
        self._position = 0
        self.lexer = lexer
        self.input = input

    def __iter__(self):
        """ All of the code for iteration is controlled by the class
        itself. This and next() (or __next__() in Python 3) are here so
        that syntax like `for token in Lexer(...):` is valid and works.
        """
        return self

    def next(self):
        """ Used for iteration. It returns token after token until there
        are no more tokens. (Rename to __next__(self) when using Python 3.)
        """
        if not self.done_scanning():
            token = self.scan_next()
            if token is not None:
                return token
        raise StopIteration

    def done_scanning(self):
        """ Return True if scanning is complete and False if it isn't. """
        return self._position >= len(self.input)

    def scan_next(self):
        """ Retrieve the next token from the input. If the flag
        `omit_whitespace` is set to True, skip over any whitespace
        characters present before the token.
        """
        if self.done_scanning():
            return None
        if self.lexer.omit_whitespace:
            match = self.lexer.ws_regexc.match(self.input, self._position)
            if match:
                self._position = match.end()
                # The input may end in whitespace, so re-check here.
                if self.done_scanning():
                    return None
        match = self.lexer.regexc.match(self.input, self._position)
        if match is None:
            lineno = self.input[:self._position].count("\n") + 1
            raise UnknownTokenError(self.input[self._position], lineno)
        self._position = match.end()
        value = match.group(match.lastgroup)
        if match.lastgroup in self.lexer._callbacks:
            value = self.lexer._callbacks[match.lastgroup](self, value)
        return match.lastgroup, value


class Lexer(object):
    """ A lexical scanner. It takes an input string and a set of rules
    based on regular expressions. It then scans the input and returns
    the tokens one-by-one. It is meant to be used by iterating over it.
    """

    def __init__(self, rules, case_sensitive=True, omit_whitespace=True):
        """ Set up the lexical scanner. Build and compile the regular
        expression and prepare the whitespace matcher.
        """
        self._callbacks = {}
        self.omit_whitespace = omit_whitespace
        self.case_sensitive = case_sensitive
        parts = []
        for name, rule in rules:
            # A rule is either a pattern string or a (pattern, callback) pair.
            if not isinstance(rule, str):
                rule, callback = rule
                self._callbacks[name] = callback
            parts.append("(?P<%s>%s)" % (name, rule))
        if self.case_sensitive:
            flags = re.M
        else:
            flags = re.M | re.I
        self.regexc = re.compile("|".join(parts), flags)
        self.ws_regexc = re.compile(r"\s*", re.MULTILINE)

    def scan(self, input):
        """ Return a scanner built for matching through the `input` field.
        The scanner that it returns is built for iterating.
        """
        return _InputScanner(self, input)
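

# A minimal usage sketch of the Lexer on its own (not part of the original
# gist): iterating over scan() yields (token_name, value) tuples.
#
#     lexer = Lexer([('word', r'[a-z]+'), ('num', r'\d+')])
#     for name, value in lexer.scan('foo 42 bar'):
#         print name, value    # word foo / num 42 / word bar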


class UnmatchTokenError(Exception):
    """ Raised when the parser sees a token other than the one expected. """

    def __init__(self, token):
        self.__token = token

    def __str__(self):
        return str(self.__token)


class LexStream(object):
    """ A thin wrapper around an _InputScanner that provides one-token
    lookahead: current() peeks at the next token, next() consumes it,
    comment tokens are skipped, and an ('eof', '') token is returned
    once the input is exhausted.
    """

    def __init__(self, scanner):
        self.__scanner = scanner
        self.__cur_tok = None

    def _next(self):
        try:
            tok = self.__scanner.next()
            # Comments are meaningless to the parser, so drop them here.
            while tok[0] == 'comment':
                tok = self.__scanner.next()
            return tok
        except StopIteration:
            return 'eof', ''

    def next(self):
        """ Consume and return the next token. """
        if self.__cur_tok is None:
            return self._next()
        else:
            tok = self.__cur_tok
            self.__cur_tok = None
            return tok

    def current(self):
        """ Return the next token without consuming it. """
        if self.__cur_tok is None:
            self.__cur_tok = self._next()
        return self.__cur_tok

    def match(self, expected_tok):
        """ Consume the next token and check its name against
        `expected_tok`, raising UnmatchTokenError on mismatch.
        """
        tok = self.next()
        if tok[0] != expected_tok:
            raise UnmatchTokenError(tok)
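

# The recursive-descent parser below follows this informal grammar
# (reconstructed from the parse_* functions; it is not stated explicitly
# in the original):
#
#     set   := '{' ( dict | seq ) '}'
#     dict  := ( var value [','] )*
#     seq   := value*
#     value := num | str | set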

def parse_set(iter_token):
    iter_token.match('lbrace')
    tok = iter_token.current()
    if tok[0] == 'var':
        res = parse_dict(iter_token)
    else:
        res = parse_seq(iter_token)
    iter_token.match('rbrace')
    return res


def parse_dict(iter_token):
    res = {}
    while iter_token.current()[0] != 'rbrace':
        tok = iter_token.current()
        iter_token.match('var')
        res[tok[1]] = parse_value(iter_token)
        if iter_token.current()[0] == 'comma':
            iter_token.match('comma')
    return res


def parse_seq(iter_token):
    res = []
    while iter_token.current()[0] != 'rbrace':
        res.append(parse_value(iter_token))
    return res


def parse_value(iter_token):
    tok = iter_token.current()
    if tok[0] == 'num':
        iter_token.match('num')
        return int(tok[1])
    elif tok[0] == 'str':
        iter_token.match('str')
        return tok[1]
    elif tok[0] == 'lbrace':
        return parse_set(iter_token)
    else:
        raise UnmatchTokenError(tok)
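

# Note on the dispatch in parse_set: the body is parsed as a dict only when
# the first token after '{' is a var; otherwise it is parsed as a sequence,
# whose elements are separated by whitespace only (no commas). For instance,
# "{1 2 {x 3}}" parses to [1, 2, {'x': 3}] (derived from the code; not one
# of the original examples).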


if __name__ == '__main__':
    import sys
    in_str = sys.argv[1]
    rules = [('var', r'[_a-z]\w*'), ('num', r'-?\d+'),
             ('lbrace', '{'), ('rbrace', '}'),
             ('str', r"'.*'"),
             ('comma', ','), ('comment', '--.*$')]
    lexer = Lexer(rules)
    print parse_set(LexStream(lexer.scan(in_str)))
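
# A usage sketch (assuming this file is saved as parser.py; the key order of
# the printed dict may differ, since Python 2 dicts are unordered):
#
#     $ python parser.py "{a 1, b 2, c {c_1 3, c_2 4}}"
#     {'a': 1, 'c': {'c_1': 3, 'c_2': 4}, 'b': 2}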