Skip to content

Instantly share code, notes, and snippets.

@piskvorky
Created March 30, 2015 22:04
Show Gist options
  • Save piskvorky/dccb0d12153d4fa248e1 to your computer and use it in GitHub Desktop.
Save piskvorky/dccb0d12153d4fa248e1 to your computer and use it in GitHub Desktop.
def unescape(text):
    """Replace HTML character references in *text* with the characters they denote.

    Handles decimal (``&#65;``) and hexadecimal (``&#x41;`` / ``&#X41;``)
    numeric references as well as named entities (``&amp;``). Anything that
    cannot be resolved -- unknown names, code points outside the Unicode
    range, lone surrogates -- is left in the output exactly as written.

    Adapted from http://effbot.org/zone/re-sub.htm#unescape-html

    Args:
        text: Unicode text, or a UTF-8 encoded byte string.

    Returns:
        The unescaped text as a UTF-8 encoded byte string.

    Raises:
        UnicodeDecodeError: if a byte-string input is not valid UTF-8.
    """
    # Imported locally so the function is self-contained and runs unchanged
    # on both Python 2 and Python 3.
    import re
    try:
        from htmlentitydefs import name2codepoint  # Python 2
    except ImportError:
        from html.entities import name2codepoint  # Python 3
    try:
        text_type, to_char = unicode, unichr  # noqa: F821 -- Python 2
    except NameError:
        text_type, to_char = str, chr  # Python 3

    # Explicit ASCII classes keep matching identical on every interpreter:
    # a bare \w would also admit Unicode digits/underscores that int()
    # accepts on Python 3 only.
    reference = (
        r"&(?:#([0-9]+)"             # group 1: decimal reference
        r"|#[xX]([0-9A-Fa-f]+)"      # group 2: hexadecimal reference
        r"|([A-Za-z][A-Za-z0-9]*))"  # group 3: named entity
        r";"
    )

    def fixup(match):
        original = match.group(0)
        decimal, hexadecimal, name = match.groups()
        try:
            if name is not None:
                codepoint = name2codepoint[name]
            elif hexadecimal is not None:
                codepoint = int(hexadecimal, 16)
            else:
                codepoint = int(decimal)
            if 0xD800 <= codepoint <= 0xDFFF:
                # Lone surrogates cannot be represented in valid UTF-8.
                return original
            return to_char(codepoint)
        except (KeyError, ValueError, OverflowError):
            # Unknown entity name, or a number that is not a usable code
            # point (huge values raise OverflowError, not ValueError).
            return original  # leave as is

    if not isinstance(text, text_type):
        text = text_type(text, 'utf8')
    return re.sub(reference, fixup, text).encode('utf8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment