Skip to content

Instantly share code, notes, and snippets.

@indivisible
Created May 2, 2017 16:24
Show Gist options
  • Select an option

  • Save indivisible/2d4919637e20b235f4c7e9ccf91e595b to your computer and use it in GitHub Desktop.

Select an option

Save indivisible/2d4919637e20b235f4c7e9ccf91e595b to your computer and use it in GitHub Desktop.
html5lib.serializer.HTMLSerializer bug
#!/usr/bin/env python3
import html5lib
from html5lib.filters.base import Filter as BaseFilter
class LogFilter(BaseFilter):
    '''Pass-through filter that prints every token it forwards.

    Drop it into a filter chain to see what the stream contains at that
    point; the tokens themselves are yielded unchanged.
    '''

    def __iter__(self):
        upstream = super().__iter__()
        for tok in upstream:
            # Report the token before handing it on to the next consumer.
            print ('token: %r' % tok)
            yield tok
def test_parse_serialize(html, tree_type, filters=()):
    '''Parse *html* as a fragment and serialize it straight back to a string.

    html: HTML fragment source.
    tree_type: tree builder name, used for both the parser and the walker.
    filters: iterable of callables; each is called with the current token
        stream and its result replaces that stream, in the given order.

    Returns whatever HTMLSerializer.render() produces for the final stream.

    NOTE(review): the serializer is created with its default options.
    html5lib documents `omit_optional_tags` as defaulting to True, which
    would explain a trailing '</p>' being dropped - confirm against the
    installed html5lib version.
    '''
    walker = html5lib.getTreeWalker(tree_type)
    serializer = html5lib.serializer.HTMLSerializer()
    root = html5lib.parseFragment(html, namespaceHTMLElements=False, treebuilder=tree_type)
    stream = walker(root)
    # Wrap the stream in each filter; the last filter listed ends up outermost.
    for filter_ in filters:
        stream = filter_(stream)
    return serializer.render(stream)
def test_html(html, tree_types=('etree', 'lxml'), filters=()):
    '''Print the parse/serialize round trip of *html* for each tree builder.

    html: HTML fragment source.
    tree_types: tree builder names to try, in order.
    filters: iterable of filter callables, forwarded unchanged to
        test_parse_serialize.
    '''
    print ('\ntesting %r:' % html)
    # Iterate the caller-supplied tree_types; the argument used to be ignored
    # in favour of a hard-coded ('etree', 'lxml') tuple.
    for tt in tree_types:
        print (' with %r:\n %r' % (tt, test_parse_serialize(html, tt, filters)))
if __name__ == '__main__':
    samples = (
        '<a>one</a><b>two</b>',  # this works as expected
        '<a>one</a><p>two</p>',  # this does not
    )
    for sample in samples:
        test_html(sample)
    # Uncomment to show that the EndTag 'p' token does get emitted, but is
    # not serialized:
    #test_html('<a>one</a><p>two</p>', filters=[LogFilter])
testing '<a>one</a><b>two</b>':
with 'etree':
'<a>one</a><b>two</b>'
with 'lxml':
'<a>one</a><b>two</b>'
testing '<a>one</a><p>two</p>':
with 'etree':
'<a>one</a><p>two'
with 'lxml':
'<a>one</a><p>two'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment