Last active
September 7, 2020 21:41
-
-
Save cjsmeele/4e8107e870519b2298821a3639d49d7d to your computer and use it in GitHub Desktop.
Parser combinators in Python3 (bad)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Parser combinators in Python3 - Chris Smeele, 2020. | |
# | |
# This code works. Just not very good, due to it mapping very badly to | |
# Python's execution model (it blows up with recursion depth). | |
# It's also incredibly ugly, since Python very much does not want you to write | |
# code in this style. | |
# Creating it was a nice exercise however. | |
# | |
# I hereby release this awful code into the public domain. | |
# Please leave it there and don't try to actually use it :-) | |
# | |
# As a test, this implements a sort-of XML parser (tag soup) in two steps: | |
# (1) text -> tokens (2) tokens -> elements | |
# | |
# Types for the stuff we'll be parsing {{{ | |
class Element(): | |
def __init__(self, name, body): | |
if type(body) is not list: | |
body = decode_entities(body) | |
self.name = name | |
self.body = body | |
self.text = body | |
def find(self, name): | |
for x in self.findall(name): | |
return x | |
def findall(self, name): | |
return list(self._findall(name)) | |
def _findall(self, name): | |
for el in self.body: | |
if el.name == name: | |
yield el | |
def __str__(self): | |
if type(self.body) is list: | |
return '<{}>{}</{}>'.format(self.name, ''.join(map(str, self.body)), self.name) | |
else: | |
return '<{}>{}</{}>'.format(self.name, encode_entities(self.body), self.name) | |
def __repr__(self): | |
return '{}({})'.format(self.name, repr(self.body)) | |
class Token(): | |
def __init__(self, s): | |
self.text = s | |
def __repr__(self): | |
return str(type(self).__name__) + '(' + self.text.decode('utf-8') + ')' | |
def __str__(self): | |
return repr(self) | |
class TokenElOpen(Token): pass | |
class TokenElClose(Token): pass | |
class TokenCData(Token): pass | |
# }}} | |
# A parser is a function s -> (a, s) | |
# Execute a parser. | |
def parse(p, s): | |
return p(s) | |
# Execute a parser, return None unless all input was succesfully consumed. | |
def parse_(p, s): | |
val, rest = parse(p, s) | |
if len(rest) != 0: | |
return None | |
else: | |
return val | |
# >>=, for the parser monad. | |
def pbind(p, f): | |
def g(s): | |
x, rest = parse(p, s) | |
if x is None: | |
return None, s | |
y, rest = f(x)(rest) | |
if y is None: | |
return None, s | |
return y, rest | |
return g | |
ppure = lambda x: lambda s: (x, s) | |
def pmap(g, p): | |
return pbind(p, lambda x: ppure(g(x))) | |
# Why does python's builtin `reduce` not specify if it folds left or right? grrr. | |
def foldl(f, init, xs): | |
if xs == []: | |
return init | |
return foldl(f, f(init, xs[0]), xs[1:]) | |
# Try to create new syntax within Python. That will end well. | |
pdo = lambda p, *fs: foldl(pbind, p, list(fs)) | |
# Some sexy mutual recursion. | |
pmany = lambda p: palt(psome(p), ppure([])) | |
psome = lambda p: pbind(p, lambda x: pmap(lambda xs: [x] + xs, pmany(p))) | |
# <|> | |
def palt(*ps): | |
def f(s): | |
for p in ps: | |
val, rest = p(s) | |
if val is not None: | |
return val, rest | |
return None, s | |
return f | |
# this might be pushing it a bit too far... | |
# pitem = lambda: lambda s: ((None, s) if len(s) == 0 else ((bytes([s[0]]) if type(s) is bytes else s[0]), s[1:])) | |
def pitem(): | |
def f(s): | |
if len(s) == 0: | |
return None, s | |
elif type(s) is bytes: | |
return bytes([s[0]]), s[1:] | |
else: | |
return s[0], s[1:] | |
return f | |
# Here be dragons. | |
psat = lambda f: pbind(pitem(), lambda x: ppure(x) if f(x) else ppure(None)) | |
pchar = lambda expect: psat(lambda x: x == expect) | |
pspace = lambda: psat(lambda x: x in b' \n\t\r') | |
pspace_before = lambda p: pbind(pmany(pspace()), lambda _: p) | |
pspace_after = lambda p: pbind(p, lambda x: pbind(pmany(pspace()), lambda _: ppure(x))) | |
pspace_around = lambda p: pspace_after(pspace_before(p)) | |
# Token parser {{{ | |
# element open. (<...>) | |
pTopen = lambda: pmap(TokenElOpen, | |
pspace_before(pdo( pchar(b'<'), | |
lambda _: psome(psat(lambda x: x not in b'/>')), | |
lambda tag: pbind(pchar(b'>'),\ | |
lambda _: ppure(b''.join(tag)))))) | |
# element close. (</...>) | |
pTclose = lambda: pmap(TokenElClose, | |
pspace_around(pdo( pchar(b'<'), | |
lambda _: pchar(b'/'), | |
lambda _: psome(psat(lambda x: x != b'>')), | |
lambda tag: pbind(pchar(b'>'), | |
lambda _: ppure(b''.join(tag)))))) | |
# textual element body (...) | |
pTcdata = lambda: pmap(TokenCData, | |
pbind(psome(psat(lambda x: x != b'<')), | |
lambda chars: ppure(b''.join(chars)))) | |
pT = lambda: palt(pTopen(), pTclose(), pTcdata()) | |
# }}} | |
# Element parser (token -> Element) {{{ | |
popen = lambda: pmap(lambda x: x.text.decode('utf-8'), psat(lambda x: type(x) == TokenElOpen)) | |
pclose = lambda s: psat(lambda x: type(x) == TokenElClose and x.text.decode('utf-8') == s) | |
pcdata = lambda: pmap(lambda x: x.text, psat(lambda x: type(x) == TokenCData)) | |
pelem_body = lambda: palt(pmap(lambda x: x.decode('utf-8'), pcdata()), pmany(pelem())) | |
pelem = lambda: pbind(popen(), | |
lambda name: pbind(pelem_body(), | |
lambda body: pbind(pclose(name), | |
lambda _: ppure(Element(name, body))))) | |
# }}} | |
pxml = pelem | |
entities = [('&', '&'), | |
('<', '<'), | |
('>', '>')] | |
def encode_entities(s): | |
for k, v in entities: | |
s = s.replace(k, v) | |
return s | |
def decode_entities(s): | |
rev = [x for x in entities] | |
rev.reverse() | |
for k, v in rev: | |
s = s.replace(v, k) | |
return s | |
def fromstring(s): | |
if type(s) is str: | |
s = s.encode('utf-8') | |
# What is tail recursion? | |
import sys | |
r = sys.getrecursionlimit() | |
# sys.setrecursionlimit(4000) | |
tokens = parse_(pmany(pT()), s) | |
#print(tokens) | |
if tokens is None: | |
return None | |
tags = parse_(pxml(), tokens) | |
sys.setrecursionlimit(r) | |
# print(str(tags)) | |
return tags | |
# Converts to an element tree, and back to a string for printing. | |
# | |
print(fromstring("<a>\n\ | |
<b>bloep</b>\n\ | |
<b>bloep</b>\n\ | |
<b>bloep</b>\n <b>bl&ep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b><b>bloep</b> \ | |
<c>blarp</c> </a>")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment