Created January 22, 2014 08:05
A simple parser based on the lexer mentioned in https://github.com/alanszlosek/mercussion/blob/master/lexer.py
#! /usr/bin/env python
#
# The grammar for the parsed language is very simple.
# It's like a JSON variation.
# Here's a simple valid expression:
# {
#     a 1,
#     b 2,
#     c {
#         c_1 3,
#         c_2 4
#     }
# }
#
# And the expression will be parsed into a dict in Python.
#
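# Roughly, as a grammar (my reading of the parse functions below, not
# stated in the original):
#
#   set   ::= '{' ( dict | seq ) '}'
#   dict  ::= ( var value [','] )*
#   seq   ::= value*
#   value ::= num | str | set
#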
import re


class UnknownTokenError(Exception):
    """ This exception is raised when an unknown token is encountered
    in the token stream. It holds the line number and the offending
    token.
    """
    def __init__(self, token, lineno):
        self.token = token
        self.lineno = lineno

    def __str__(self):
        return "Line #%s, Found token: %s" % (self.lineno, self.token)


class _InputScanner(object):
    """ This class manages the scanning of a specific input. An instance
    of it is returned when scan() is called. It is built to be great for
    iteration. This is mainly to be used by the Lexer and ideally not
    directly.
    """
    def __init__(self, lexer, input):
        """ Put the lexer into this instance so the callbacks can
        reference it if needed.
        """
        self._position = 0
        self.lexer = lexer
        self.input = input

    def __iter__(self):
        """ All of the code for iteration is controlled by the class
        itself. This and next() (or __next__() in Python 3) are defined
        so that syntax like `for token in lexer.scan(...):` is valid
        and works.
        """
        return self

    def next(self):
        """ Used for iteration. It returns token after token until there
        are no more tokens. (Rename this to __next__(self) if using
        Python 3.)
        """
        if not self.done_scanning():
            token = self.scan_next()
            # scan_next() returns None when only trailing whitespace was
            # left, so treat that as the end of iteration as well.
            if token is not None:
                return token
        raise StopIteration

    def done_scanning(self):
        """ A simple boolean function that returns True if scanning is
        complete and False if it isn't.
        """
        return self._position >= len(self.input)

    def scan_next(self):
        """ Retrieve the next token from the input. If the flag
        `omit_whitespace` is set to True, then it will skip over the
        whitespace characters present.
        """
        if self.done_scanning():
            return None
        if self.lexer.omit_whitespace:
            match = self.lexer.ws_regexc.match(self.input, self._position)
            if match:
                self._position = match.end()
                # Skipping whitespace may have consumed the rest of the
                # input, in which case there is no token left to return.
                if self.done_scanning():
                    return None
        match = self.lexer.regexc.match(self.input, self._position)
        if match is None:
            lineno = self.input[:self._position].count("\n") + 1
            raise UnknownTokenError(self.input[self._position], lineno)
        self._position = match.end()
        value = match.group(match.lastgroup)
        # If a callback was registered for this token type, let it
        # transform the matched text.
        if match.lastgroup in self.lexer._callbacks:
            value = self.lexer._callbacks[match.lastgroup](self, value)
        return match.lastgroup, value


class Lexer(object):
    """ A lexical scanner. It takes in an input and a set of rules based
    on regular expressions. It then scans the input and returns the
    tokens one-by-one. It is meant to be used through iterating.
    """
    def __init__(self, rules, case_sensitive=True, omit_whitespace=True):
        """ Set up the lexical scanner. Build and compile the combined
        regular expression and prepare the whitespace searcher.
        """
        self._callbacks = {}
        self.omit_whitespace = omit_whitespace
        self.case_sensitive = case_sensitive
        parts = []
        for name, rule in rules:
            # A rule is either a plain pattern string or a
            # (pattern, callback) pair.
            if not isinstance(rule, str):
                rule, callback = rule
                self._callbacks[name] = callback
            parts.append("(?P<%s>%s)" % (name, rule))
        if self.case_sensitive:
            flags = re.M
        else:
            flags = re.M | re.I
        self.regexc = re.compile("|".join(parts), flags)
        self.ws_regexc = re.compile(r"\s*", re.MULTILINE)

    def scan(self, input):
        """ Return a scanner built for matching through the `input`
        argument. The scanner that it returns is built for iterating.
        """
        return _InputScanner(self, input)
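

# A minimal usage sketch of the Lexer on its own (these token rules are
# hypothetical, not part of the grammar below). Note that a rule can be a
# (pattern, callback) pair, in which case the callback transforms the
# matched text:
#
#   lexer = Lexer([('word', r'[a-z]+'),
#                  ('num', (r'\d+', lambda scanner, value: int(value)))])
#   for name, value in lexer.scan("abc 42"):
#       print name, value
#   # -> word abc
#   # -> num 42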


class UnmatchTokenError(Exception):
    """ Raised by the parser when the current token is not the one the
    grammar expects.
    """
    def __init__(self, token):
        self.__token = token

    def __str__(self):
        return str(self.__token)


class LexStream(object):
    """ A one-token lookahead buffer over the scanner: current() peeks at
    the next token without consuming it, next() consumes it, and match()
    consumes it only if it has the expected name.
    """
    def __init__(self, scanner):
        self.__scanner = scanner
        self.__cur_tok = None

    def _next(self):
        try:
            # Comment tokens are skipped, and the end of the input is
            # normalized into an ('eof', '') token.
            tok = self.__scanner.next()
            while tok[0] == 'comment':
                tok = self.__scanner.next()
            return tok
        except StopIteration:
            return 'eof', ''

    def next(self):
        if self.__cur_tok is None:
            return self._next()
        else:
            tok = self.__cur_tok
            self.__cur_tok = None
            return tok

    def current(self):
        if self.__cur_tok is None:
            self.__cur_tok = self._next()
        return self.__cur_tok

    def match(self, expected_tok):
        tok = self.next()
        if tok[0] != expected_tok:
            raise UnmatchTokenError(tok)
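

# A sketch of the lookahead behaviour, assuming a lexer built with the
# rules from the __main__ block below:
#
#   stream = LexStream(lexer.scan("{ a 1 }"))
#   stream.current()        # -> ('lbrace', '{'), peeked but not consumed
#   stream.next()           # -> ('lbrace', '{'), now consumed
#   stream.match('var')     # consumes ('var', 'a') silently
#   stream.match('rbrace')  # raises UnmatchTokenError on ('num', '1')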


def parse_set(iter_token):
    """ Parse a braced expression. If the first token after '{' is a
    variable name it is parsed as a dict, otherwise as a sequence.
    """
    iter_token.match('lbrace')
    tok = iter_token.current()
    if tok[0] == 'var':
        res = parse_dict(iter_token)
    else:
        res = parse_seq(iter_token)
    iter_token.match('rbrace')
    return res
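
# For example, "{ a 1 }" parses to {'a': 1}, while "{ 1 2 }" parses to
# the list [1, 2].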


def parse_dict(iter_token):
    """ Parse `var value` pairs, optionally separated by commas, until
    the closing brace.
    """
    res = {}
    while iter_token.current()[0] != 'rbrace':
        tok = iter_token.current()
        iter_token.match('var')
        res[tok[1]] = parse_value(iter_token)
        if iter_token.current()[0] == 'comma':
            iter_token.match('comma')
    return res


def parse_seq(iter_token):
    """ Parse whitespace-separated values until the closing brace. """
    res = []
    while iter_token.current()[0] != 'rbrace':
        res.append(parse_value(iter_token))
    return res


def parse_value(iter_token):
    """ Parse a single value: a number, a string, or a nested braced
    expression.
    """
    tok = iter_token.current()
    if tok[0] == 'num':
        iter_token.match('num')
        return int(tok[1])
    elif tok[0] == 'str':
        iter_token.match('str')
        # Strip the surrounding single quotes from the matched text.
        return tok[1][1:-1]
    elif tok[0] == 'lbrace':
        return parse_set(iter_token)
    else:
        raise UnmatchTokenError(tok)


if __name__ == '__main__':
    import sys
    in_str = sys.argv[1]
    rules = [('var', r'[_a-z]\w*'), ('num', r'-?\d+'),
             ('lbrace', '{'), ('rbrace', '}'),
             ('str', r"'[^']*'"),
             ('comma', ','), ('comment', '--.*$')]
    lexer = Lexer(rules)
    print parse_set(LexStream(lexer.scan(in_str)))
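
# Example runs (a sketch, assuming this file is saved as parser.py; dict
# key order may vary under Python 2):
#
#   $ python parser.py "{ a 1, b { c 2, d 'hi' } }"
#   {'a': 1, 'b': {'c': 2, 'd': 'hi'}}
#
#   $ python parser.py "{ { 1 2 } { 3 4 } }"
#   [[1, 2], [3, 4]]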