tcg · September 5, 2018 15:18 · tcg · Sep 5, 2018
diff --git a/html_entity_decode.py b/html_entity_decode.py
 def html_entity_decoder(s):
     """
     Decodes HTML entities like `&amp;` and `&ldquo;` into their
     plain-text counterparts.

     Works on both Python 2 and Python 3: `html.unescape` is used when
     it is available, with a fallback to `HTMLParser.unescape` on
     Python 2.

     Args:
         s: string of HTML to filter. A byte string is assumed to be
             UTF-8 encoded and is decoded before the entities are
             resolved.

     Returns:
         The provided string, with decoded HTML entities where present,
         encoded as UTF-8 bytes (`str` on Python 2, `bytes` on Python 3).

     Raises:
         UnicodeDecodeError: if `s` is a byte string that is not valid
             UTF-8.
     """
     try:
         # Python 3.4+ exposes the decoder as a public function.
         from html import unescape
     except ImportError:
         # Python 2 only offers it as an "internal" method on an
         # HTMLParser instance.
         from HTMLParser import HTMLParser
         unescape = HTMLParser().unescape

     # Decode byte input up front. On Python 2, mixing a non-ASCII byte
     # string with the Unicode replacements triggers an implicit ASCII
     # decode that fails; on Python 3, `html.unescape` rejects bytes.
     if isinstance(s, bytes):
         s = s.decode("utf-8")

     # `unescape` yields Unicode text (entities become their Unicode
     # codepoints), so explicitly encode it as UTF-8 before returning it.
     return unescape(s).encode("utf-8")
	def html_entity_decoder(s):
		"""
		Decodes HTML entities like `&amp;` and `&ldquo;` into their
		plain-text counterparts.

		Works on both Python 2 and Python 3: `html.unescape` is used when
		it is available, with a fallback to `HTMLParser.unescape` on
		Python 2.

		Args:
			s: string of HTML to filter. A byte string is assumed to be
				UTF-8 encoded and is decoded before the entities are
				resolved.

		Returns:
			The provided string, with decoded HTML entities where present,
			encoded as UTF-8 bytes (`str` on Python 2, `bytes` on Python 3).

		Raises:
			UnicodeDecodeError: if `s` is a byte string that is not valid
				UTF-8.
		"""
		try:
			# Python 3.4+ exposes the decoder as a public function.
			from html import unescape
		except ImportError:
			# Python 2 only offers it as an "internal" method on an
			# HTMLParser instance.
			from HTMLParser import HTMLParser
			unescape = HTMLParser().unescape

		# Decode byte input up front. On Python 2, mixing a non-ASCII byte
		# string with the Unicode replacements triggers an implicit ASCII
		# decode that fails; on Python 3, `html.unescape` rejects bytes.
		if isinstance(s, bytes):
			s = s.decode("utf-8")

		# `unescape` yields Unicode text (entities become their Unicode
		# codepoints), so explicitly encode it as UTF-8 before returning it.
		return unescape(s).encode("utf-8")