Skip to content

Instantly share code, notes, and snippets.

@johansten
Created January 14, 2014 06:53
Show Gist options
  • Save johansten/8414225 to your computer and use it in GitHub Desktop.
Save johansten/8414225 to your computer and use it in GitHub Desktop.
Unescaping HTML entities
#-------------------------------------------------------------------------------
# John J. Lee, http://www.velocityreviews.com/forums/t511850-how-do-you-htmlentities-in-python.html
#-------------------------------------------------------------------------------
try:
    import htmlentitydefs  # Python 2 module name
except ImportError:
    import html.entities as htmlentitydefs  # Python 3 home of the same tables
import re
def unescape_charref(ref):
    """Decode one numeric character reference into a one-character string.

    Args:
        ref: The complete reference text, including the leading ``&#`` and
            the trailing ``;`` -- for example ``"&#65;"`` or ``"&#x41;"``.

    Returns:
        The character whose code point the reference names.

    Raises:
        ValueError: If the digits are not a valid number in the selected
            base, or the code point is rejected by the character
            constructor.
    """
    # Strip the "&#" prefix and the ";" suffix, leaving only the number.
    digits = ref[2:-1]
    base = 10
    # HTML permits both "x" and "X" as the hexadecimal marker.
    if digits.startswith(("x", "X")):
        digits = digits[1:]
        base = 16
    # Python 2 spells the constructor ``unichr``; Python 3 only has ``chr``.
    try:
        to_char = unichr  # noqa: F821
    except NameError:
        to_char = chr
    return to_char(int(digits, base))
def replace_entities(match):
    """Return the replacement text for one matched entity or character reference.

    Intended as the callback argument of ``re.sub``.

    Args:
        match: A regex match object whose full match is an entity such as
            ``&amp;`` or a numeric reference such as ``&#38;``.

    Returns:
        The decoded character, or the original matched text unchanged when
        the entity name is unknown or the numeric reference cannot be
        decoded.
    """
    ent = match.group()
    if ent[1] == "#":
        try:
            return unescape_charref(ent)
        except (ValueError, OverflowError):
            # Malformed or out-of-range reference (e.g. "&#abc;"): leave the
            # text as it is instead of aborting the whole substitution.
            return ent
    codepoint = htmlentitydefs.name2codepoint.get(ent[1:-1])
    if codepoint is None:
        # Not a known entity name; keep the original text.
        return ent
    # Python 2 spells the constructor ``unichr``; Python 3 only has ``chr``.
    try:
        to_char = unichr  # noqa: F821
    except NameError:
        to_char = chr
    return to_char(codepoint)
def unescape(data):
    """Return *data* with HTML entities and character references substituted.

    Every substring shaped like ``&name;`` or ``&#digits;`` is handed to
    ``replace_entities``, and whatever that callback returns takes its place.
    """
    entity_pattern = re.compile(r"&#?[A-Za-z0-9]+?;")
    return entity_pattern.sub(replace_entities, data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment