Created January 22, 2014 08:05
A simple parser based on the lexer mentioned in https://github.com/alanszlosek/mercussion/blob/master/lexer.py
#! /usr/bin/env python
#
# The grammar for the parsed language is very simple.
# It's like a JSON variation.
# Here's a simple valid expression:
# {
#     a 1,
#     b 2,
#     c {
#         c_1 3,
#         c_2 4
#     }
# }
#
# And the expression will be parsed into a dict in Python.
#
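# Roughly, as a grammar (my reading of the parse functions below, not
# stated in the original):
#
#   set   ::= '{' ( dict | seq ) '}'
#   dict  ::= ( var value [','] )*
#   seq   ::= value*
#   value ::= num | str | set
#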
import re


class UnknownTokenError(Exception):
    """ This exception is raised when an unknown token is encountered
    in the token stream. It holds the line number and the offending
    token.
    """
    def __init__(self, token, lineno):
        self.token = token
        self.lineno = lineno

    def __str__(self):
        return "Line #%s, Found token: %s" % (self.lineno, self.token)


class _InputScanner(object):
    """ This class manages the scanning of a specific input. An instance
    of it is returned when scan() is called. It is built to be great for
    iteration. This is mainly to be used by the Lexer and ideally not
    directly.
    """
    def __init__(self, lexer, input):
        """ Put the lexer into this instance so the callbacks can
        reference it if needed.
        """
        self._position = 0
        self.lexer = lexer
        self.input = input

    def __iter__(self):
        """ All of the code for iteration is controlled by the class
        itself. This and next() (or __next__() in Python 3) are defined
        so that syntax like `for token in lexer.scan(...):` is valid
        and works.
        """
        return self

    def next(self):
        """ Used for iteration. It returns token after token until there
        are no more tokens. (Rename this to __next__(self) if using
        Python 3.)
        """
        if not self.done_scanning():
            token = self.scan_next()
            # scan_next() returns None when only trailing whitespace was
            # left, so treat that as the end of iteration as well.
            if token is not None:
                return token
        raise StopIteration

    def done_scanning(self):
        """ A simple boolean function that returns True if scanning is
        complete and False if it isn't.
        """
        return self._position >= len(self.input)

    def scan_next(self):
        """ Retrieve the next token from the input. If the flag
        `omit_whitespace` is set to True, then it will skip over the
        whitespace characters present.
        """
        if self.done_scanning():
            return None
        if self.lexer.omit_whitespace:
            match = self.lexer.ws_regexc.match(self.input, self._position)
            if match:
                self._position = match.end()
                # Skipping whitespace may have consumed the rest of the
                # input, in which case there is no token left to return.
                if self.done_scanning():
                    return None
        match = self.lexer.regexc.match(self.input, self._position)
        if match is None:
            lineno = self.input[:self._position].count("\n") + 1
            raise UnknownTokenError(self.input[self._position], lineno)
        self._position = match.end()
        value = match.group(match.lastgroup)
        # If a callback was registered for this token type, let it
        # transform the matched text.
        if match.lastgroup in self.lexer._callbacks:
            value = self.lexer._callbacks[match.lastgroup](self, value)
        return match.lastgroup, value


class Lexer(object):
    """ A lexical scanner. It takes in an input and a set of rules based
    on regular expressions. It then scans the input and returns the
    tokens one-by-one. It is meant to be used through iterating.
    """
    def __init__(self, rules, case_sensitive=True, omit_whitespace=True):
        """ Set up the lexical scanner. Build and compile the combined
        regular expression and prepare the whitespace searcher.
        """
        self._callbacks = {}
        self.omit_whitespace = omit_whitespace
        self.case_sensitive = case_sensitive
        parts = []
        for name, rule in rules:
            # A rule is either a plain pattern string or a
            # (pattern, callback) pair.
            if not isinstance(rule, str):
                rule, callback = rule
                self._callbacks[name] = callback
            parts.append("(?P<%s>%s)" % (name, rule))
        if self.case_sensitive:
            flags = re.M
        else:
            flags = re.M | re.I
        self.regexc = re.compile("|".join(parts), flags)
        self.ws_regexc = re.compile(r"\s*", re.MULTILINE)

    def scan(self, input):
        """ Return a scanner built for matching through the `input`
        argument. The scanner that it returns is built for iterating.
        """
        return _InputScanner(self, input)
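

# A minimal usage sketch of the Lexer on its own (these token rules are
# hypothetical, not part of the grammar below). Note that a rule can be a
# (pattern, callback) pair, in which case the callback transforms the
# matched text:
#
#   lexer = Lexer([('word', r'[a-z]+'),
#                  ('num', (r'\d+', lambda scanner, value: int(value)))])
#   for name, value in lexer.scan("abc 42"):
#       print name, value
#   # -> word abc
#   # -> num 42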


class UnmatchTokenError(Exception):
    """ Raised by the parser when the current token is not the one the
    grammar expects.
    """
    def __init__(self, token):
        self.__token = token

    def __str__(self):
        return str(self.__token)


class LexStream(object):
    """ A one-token lookahead buffer over the scanner: current() peeks at
    the next token without consuming it, next() consumes it, and match()
    consumes it only if it has the expected name.
    """
    def __init__(self, scanner):
        self.__scanner = scanner
        self.__cur_tok = None

    def _next(self):
        try:
            # Comment tokens are skipped, and the end of the input is
            # normalized into an ('eof', '') token.
            tok = self.__scanner.next()
            while tok[0] == 'comment':
                tok = self.__scanner.next()
            return tok
        except StopIteration:
            return 'eof', ''

    def next(self):
        if self.__cur_tok is None:
            return self._next()
        else:
            tok = self.__cur_tok
            self.__cur_tok = None
            return tok

    def current(self):
        if self.__cur_tok is None:
            self.__cur_tok = self._next()
        return self.__cur_tok

    def match(self, expected_tok):
        tok = self.next()
        if tok[0] != expected_tok:
            raise UnmatchTokenError(tok)
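

# A sketch of the lookahead behaviour, assuming a lexer built with the
# rules from the __main__ block below:
#
#   stream = LexStream(lexer.scan("{ a 1 }"))
#   stream.current()        # -> ('lbrace', '{'), peeked but not consumed
#   stream.next()           # -> ('lbrace', '{'), now consumed
#   stream.match('var')     # consumes ('var', 'a') silently
#   stream.match('rbrace')  # raises UnmatchTokenError on ('num', '1')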


def parse_set(iter_token):
    """ Parse a braced expression. If the first token after '{' is a
    variable name it is parsed as a dict, otherwise as a sequence.
    """
    iter_token.match('lbrace')
    tok = iter_token.current()
    if tok[0] == 'var':
        res = parse_dict(iter_token)
    else:
        res = parse_seq(iter_token)
    iter_token.match('rbrace')
    return res
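
# For example, "{ a 1 }" parses to {'a': 1}, while "{ 1 2 }" parses to
# the list [1, 2].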


def parse_dict(iter_token):
    """ Parse `var value` pairs, optionally separated by commas, until
    the closing brace.
    """
    res = {}
    while iter_token.current()[0] != 'rbrace':
        tok = iter_token.current()
        iter_token.match('var')
        res[tok[1]] = parse_value(iter_token)
        if iter_token.current()[0] == 'comma':
            iter_token.match('comma')
    return res


def parse_seq(iter_token):
    """ Parse whitespace-separated values until the closing brace. """
    res = []
    while iter_token.current()[0] != 'rbrace':
        res.append(parse_value(iter_token))
    return res


def parse_value(iter_token):
    """ Parse a single value: a number, a string, or a nested braced
    expression.
    """
    tok = iter_token.current()
    if tok[0] == 'num':
        iter_token.match('num')
        return int(tok[1])
    elif tok[0] == 'str':
        iter_token.match('str')
        # Strip the surrounding single quotes from the matched text.
        return tok[1][1:-1]
    elif tok[0] == 'lbrace':
        return parse_set(iter_token)
    else:
        raise UnmatchTokenError(tok)


if __name__ == '__main__':
    import sys
    in_str = sys.argv[1]
    rules = [('var', r'[_a-z]\w*'), ('num', r'-?\d+'),
             ('lbrace', '{'), ('rbrace', '}'),
             ('str', r"'[^']*'"),
             ('comma', ','), ('comment', '--.*$')]
    lexer = Lexer(rules)
    print parse_set(LexStream(lexer.scan(in_str)))
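
# Example runs (a sketch, assuming this file is saved as parser.py; dict
# key order may vary under Python 2):
#
#   $ python parser.py "{ a 1, b { c 2, d 'hi' } }"
#   {'a': 1, 'b': {'c': 2, 'd': 'hi'}}
#
#   $ python parser.py "{ { 1 2 } { 3 4 } }"
#   [[1, 2], [3, 4]]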