-
-
Save igauravsehrawat/0075658c39f2c3d743f95aee416f02e0 to your computer and use it in GitHub Desktop.
Writing Parsers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
# | |
# ParserInput | |
# | |
# This class represents the input data and the current | |
# position in the data. | |
# | |
# Brief note about 'max_position': | |
# | |
# When running a complex parser, the current position moves forward | |
# when a particular sub-parser matches and moves backward when | |
# it doesn't match. The 'max_position' field tracks the farthest position | |
# over the entire execution of a complex parser. | |
# When the complex parser fails, the farthest position is a good indicator | |
# of the location of the error. | |
# | |
# The purpose of the 'max_position' field is to report | |
# the error location when the main parser doesn't match. | |
# | |
class ParserInput(object) : | |
def __init__(self, data, position) : | |
self._data = data | |
self._position = position | |
self._max_position = position | |
def data(self) : | |
return self._data | |
def position(self) : | |
return self._position | |
def remaining(self) : | |
return len(self._data) - self._position | |
def read(self) : | |
i = self._position | |
return self._data[i:i+1] | |
def inc_position(self) : | |
self._position += 1 | |
self._max_position = max(self._max_position, self._position) | |
def set_position(self, position) : | |
self._position = position | |
self._max_position = max(self._max_position, self._position) | |
def max_position(self) : | |
return self._max_position | |
# | |
# Match, NoMatch | |
# | |
# A parser object returns either a Match or NoMatch object. | |
# The Match object contains a 'value' property that holds | |
# the result of parsing. | |
# | |
class Match(object) : | |
def __init__(self, value) : | |
self.value = value | |
def __repr__(self) : | |
return "Match(%s)" % (self.value,) | |
class NoMatch(object) : | |
def __repr__(self) : | |
return "NoMatch()" | |
def is_match(result) : | |
return isinstance(result, Match) | |
# | |
# Parser | |
# | |
# This is the base-class representing parser objects. | |
# The 'parse' method is the method that does the parsing. | |
# It accepts a ParserInput object and returns either a Match | |
# or NoMatch object. | |
# | |
# In the case of a match, the parsed result is returned | |
# in the 'value' property of the Match object and the | |
# parser input position is advanced by the length of the input | |
# that was parsed. | |
# | |
# In the case of no-match, the parser input position must | |
# remain unchanged from what it was when the 'parse' method | |
# was called. If the input position was advanced during | |
# intermediate stages, then it should be restored back to | |
# what it was when the 'parse' method was called. | |
# | |
# Note on the 'match' method and 'modifier' attribute: | |
# | |
# The 'modifier' attribute can be set to a function that | |
# modifies the 'value' property of the Match object in the event | |
# of a successful match. The 'match' method is a helper method | |
# that applies the 'modifier' if it is present. | |
# | |
class Parser(object) : | |
def match(self, value) : | |
modifier = getattr(self, "modifier", None) | |
if modifier is not None : | |
value = self.modifier(value) | |
return Match(value) | |
def parse(self, parser_input) : | |
raise NotImplementedError | |
# | |
# Char | |
# | |
# The Char parser parses a single character matching the character | |
# provided in the constructor if it is found at the current position | |
# and fails if it's not found or if there are no more characters left. | |
# | |
class Char(Parser) : | |
def __init__(self, char) : | |
self._char = char | |
def parse(self, parser_input) : | |
if parser_input.remaining() == 0 : | |
return NoMatch() | |
c = parser_input.read() | |
if c != self._char : | |
return NoMatch() | |
parser_input.inc_position() | |
return self.match(c) | |
# | |
# CharSet | |
# | |
# The CharSet parser parses a single character matching any one | |
# of the characters passed in the constructor. The input to the | |
# constructor can be a string, a list of characters, or a set of | |
# characters. | |
# | |
class CharSet(Parser) : | |
def __init__(self, chars) : | |
self._chars = chars | |
def parse(self, parser_input) : | |
if parser_input.remaining() == 0 : | |
return NoMatch() | |
c = parser_input.read() | |
if c not in self._chars : | |
return NoMatch() | |
parser_input.inc_position() | |
return self.match(c) | |
# | |
# ZeroOrMore | |
# | |
# ZeroOrMore takes another parser as input and applies it repeatedly | |
# until it doesn't match. All the results of the repeated application | |
# are collected in a list and returned as the result of parsing. | |
# | |
# This parser never fails to match because it simply returns an | |
# empty list as a match if the input parser doesn't match even once. | |
# | |
class ZeroOrMore(Parser) : | |
def __init__(self, parser) : | |
self._parser = parser | |
def parse(self, parser_input) : | |
values = [] | |
while True : | |
result = self._parser.parse(parser_input) | |
if not is_match(result) : | |
break | |
values.append(result.value) | |
return self.match(values) | |
# | |
# OneOrMore | |
# | |
# OneOrMore takes another parser as input and applies it repeatedly | |
# until it doesn't match. All the results of the repeated application | |
# are collected in a list and returned as the result of parsing. | |
# | |
# This parser matches only if the input parser matches atleast once. | |
# | |
class OneOrMore(Parser) : | |
def __init__(self, parser) : | |
self._parser = parser | |
def parse(self, parser_input) : | |
values = [] | |
while True : | |
result = self._parser.parse(parser_input) | |
if not is_match(result) : | |
break | |
values.append(result.value) | |
if len(values) == 0 : | |
return NoMatch() | |
return self.match(values) | |
# | |
# Sequence | |
# | |
# The Sequence parser takes N parsers as input and applies them | |
# one after another. Sequence succeeds in matching if all the input | |
# parsers succeed and doesn't match even if one of input parsers don't | |
# match. In the event of a successful match, the results of the input | |
# parsers are collected in a list and returned as the result of parsing. | |
# | |
class Sequence(Parser) : | |
def __init__(self, *parsers) : | |
self._parsers = parsers | |
def parse(self, parser_input) : | |
values = [] | |
saved = parser_input.position() | |
for parser in self._parsers : | |
result = parser.parse(parser_input) | |
if not is_match(result) : | |
parser_input.set_position(saved) | |
return NoMatch() | |
values.append(result.value) | |
return self.match(values) | |
# | |
# Choice | |
# | |
# The Choice parser takes N parsers as input. It applies the first one | |
# and if it succeeds then the result is returned as a successful match. | |
# If the first parser doesn't match, then the second parser is attempted. | |
# This continues until one of the parsers matches. The result of the | |
# first matching parser is returned as the match result of | |
# the Choice parser. The Choice parser doesn't match if all the | |
# input parsers fail to match. | |
# | |
class Choice(Parser) : | |
def __init__(self, *parsers) : | |
self._parsers = parsers | |
def parse(self, parser_input) : | |
for parser in self._parsers : | |
result = parser.parse(parser_input) | |
if is_match(result) : | |
return self.match(result.value) | |
return NoMatch() | |
# | |
# Optional | |
# | |
# The Optional parser takes a parser as input and always succeeds | |
# in matching. If the input parser succeeds in matching, then its | |
# result is returned as the result of a successful match. If the | |
# input parser doesn't match, then None is returned as a successful | |
# match. | |
# | |
class Optional(Parser) : | |
def __init__(self, parser) : | |
self._parser = parser | |
def parse(self, parser_input) : | |
result = self._parser.parse(parser_input) | |
if not is_match(result) : | |
return self.match(None) | |
return self.match(result.value) | |
# | |
# Wrapper | |
# | |
# Wrapper is a simple direct wrapper around another parser. | |
# The input parser is available in the 'parser' property. | |
# It can be leftout during construction and can be set directly | |
# set later. This allows for forward declaring a parser that needs | |
# to call itself recursively. | |
# | |
class Wrapper(Parser) : | |
def __init__(self, parser=None) : | |
self.parser = parser | |
def parse(self, parser_input) : | |
result = self.parser.parse(parser_input) | |
if not is_match(result) : | |
return result | |
return self.match(result.value) | |
# | |
# EOF | |
# | |
# The EOF parser succeeds in matching if the input posiition | |
# is at the end. It fails to match otherwise. | |
# | |
class EOF(Parser) : | |
def parse(self, parser_input) : | |
if parser_input.remaining() == 0 : | |
return self.match(None) | |
else : | |
return NoMatch() | |
# | |
# Example 1 | |
# | |
# Parse one or more digits | |
# | |
def example1() : | |
digit = CharSet("0123456789") | |
digits = OneOrMore(digit) | |
return digits | |
# | |
# Example 2 | |
# | |
# Parse one or more digits and convert to integer | |
# | |
def example2() : | |
# convert single digit to integer | |
def char_to_digit(c) : | |
return ord(c) - ord("0") | |
# convert list of digit values to integer | |
def digits_to_number(a) : | |
result = 0 | |
for x in a : | |
result = result*10 + x | |
return result | |
digit = CharSet("0123456789") | |
digit.modifier = char_to_digit | |
digits = OneOrMore(digit) | |
digits.modifier = digits_to_number | |
return digits | |
# | |
# Example 3 | |
# | |
# An expression parser. | |
# | |
def example3() : | |
# Helpers | |
def char_to_digit(c) : | |
return ord(c) - ord("0") | |
def digits_to_number(a) : | |
result = 0 | |
for x in a : | |
result = result*10 + x | |
return result | |
def bracket_inner(a) : | |
return a[1] | |
def apply_sign(a) : | |
sign = a[0] | |
if sign is not None : | |
return [sign, a[1]] | |
else : | |
return a[1] | |
def make_operation_tree(a) : | |
value = a[0] | |
for operator, value2 in a[1] : | |
value = [operator, value, value2] | |
return value | |
# parsers | |
whitespace = ZeroOrMore(Char(" ")) | |
def eat_space(parser) : | |
parser2 = Sequence(whitespace, parser) | |
parser2.modifier = lambda a : a[1] | |
return Wrapper(parser2) | |
digit = CharSet("0123456789") | |
digit.modifier = char_to_digit | |
number = eat_space(OneOrMore(digit)) | |
number.modifier = digits_to_number | |
expr = Wrapper() | |
left_parenthesis = eat_space(Char("(")) | |
right_parenthesis = eat_space(Char(")")) | |
bracketed = Sequence(left_parenthesis, expr, right_parenthesis) | |
bracketed.modifier = bracket_inner | |
sign = eat_space(CharSet("+-")) | |
atomic = Sequence(Optional(sign), Choice(number, bracketed)) | |
atomic.modifier = apply_sign | |
term_operator = eat_space(CharSet("*/")) | |
term_operation = Sequence(term_operator, atomic) | |
term = Sequence(atomic, ZeroOrMore(term_operation)) | |
term.modifier = make_operation_tree | |
expr_operator = eat_space(CharSet("+-")) | |
expr_operation = Sequence(expr_operator, term) | |
expr.parser = Sequence(term, ZeroOrMore(expr_operation)) | |
expr.modifier = make_operation_tree | |
return expr | |
# | |
# main | |
# | |
main_parser = None | |
# | |
# Uncomment one of the following lines. | |
# | |
# main_parser = example1() | |
# main_parser = example2() | |
# main_parser = example3() | |
def main() : | |
if main_parser is None : | |
print "Please uncomment one of the example initializer lines" | |
return | |
if len(sys.argv) != 2 : | |
print "%s <input>" % (sys.argv[0],) | |
return | |
s = sys.argv[1] | |
parser_input = ParserInput(s, 0) | |
result = main_parser.parse(parser_input) | |
if not is_match(result) : | |
print "ERROR: %s" % (s,) | |
i = len("ERROR: ") + parser_input.max_position() | |
print "%s^" % ("-"*i,) | |
else : | |
print "RESULT:", result.value | |
i = parser_input.position() | |
data = parser_input.data() | |
if i < len(data) : | |
print "UNPARSED LEFTOVER:", data[i:] | |
if __name__ == "__main__" : | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment