Created
October 1, 2010 13:31
-
-
Save jone/606214 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

try:  # Python 2
    from htmlentitydefs import codepoint2name as cp2n
    from htmlentitydefs import name2codepoint as n2cp
except ImportError:  # Python 3 moved the entity tables to html.entities
    from html.entities import codepoint2name as cp2n
    from html.entities import name2codepoint as n2cp
def decode_htmlentities(string):
    """
    Decode named and numeric HTML/XML character references in *string*.

    Named references (``&amp;``) are looked up in ``n2cp``; decimal
    (``&#38;``) and hexadecimal (``&#x26;``) references are converted
    directly from their code point.  Anything that cannot be decoded --
    an unknown name, a malformed number, or a code point outside the
    range the interpreter supports -- is left in the text unchanged
    instead of raising.

    :param string: text that may contain character references.
    :returns: the text with every decodable reference replaced by the
        character it denotes.

    >>> decode_htmlentities('&quot;X&gt;Y&quot;') == u'"X>Y"'
    True
    >>> decode_htmlentities('m&amp;m') == u'm&m'
    True
    >>> decode_htmlentities('&#x41;&#66;') == u'AB'
    True
    >>> decode_htmlentities('&#abc; &bogus;') == u'&#abc; &bogus;'
    True
    """
    # ``unichr`` only exists on Python 2; on Python 3 ``chr`` does the same.
    # A distinct local name is used so the lookup does not shadow itself.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # Three mutually exclusive forms: decimal, hexadecimal, or named.
    # Splitting them in the pattern guarantees int() only ever sees digits.
    entity_re = re.compile(
        r"&(?:#(?P<dec>\d{1,8})"
        r"|#[xX](?P<hex>[0-9a-fA-F]{1,7})"
        r"|(?P<name>\w{1,8}));"
    )

    def substitute_entity(match):
        name = match.group("name")
        if name is not None:
            cp = n2cp.get(name)
        elif match.group("dec") is not None:
            cp = int(match.group("dec"))
        else:
            cp = int(match.group("hex"), 16)

        if cp is None:
            # Unknown entity name: keep the original text.
            return match.group()
        try:
            return to_char(cp)
        except (ValueError, OverflowError):
            # Code point not representable: keep the original text.
            return match.group()

    return entity_re.sub(substitute_entity, string)
def html2xmlentities(string):
    """
    Convert named HTML entities to decimal numeric character references.

    Every ``&name;`` whose name is a key of ``n2cp`` is rewritten as
    ``&#N;`` where ``N`` is the mapped code point.  Unknown names and
    references that are already numeric are left unchanged.

    :param string: text that may contain named entities.
    :returns: the text with known named entities in numeric form.

    >>> html2xmlentities('m&amp;m')
    'm&#38;m'
    >>> html2xmlentities('&bogus;')
    '&bogus;'
    """
    xpr = re.compile(r'&(\w{1,8});')

    def substitute_entity(match):
        # Single dict lookup instead of scanning the key list and then
        # indexing the dict a second time.
        cp = n2cp.get(match.group(1))
        if cp is None:
            return match.group(0)
        return '&#%i;' % cp

    return xpr.sub(substitute_entity, string)
def xml2htmlentities(string):
    """
    Convert numeric character references to named HTML entities.

    Decimal (``&#38;``) and hexadecimal (``&#x26;``) references whose
    code point is a key of ``cp2n`` are rewritten as ``&name;``.
    References to code points without an entry in ``cp2n`` are left
    unchanged.

    :param string: text that may contain numeric character references.
    :returns: the text with known code points in named form.

    >>> xml2htmlentities('m&#38;m')
    'm&amp;m'
    >>> xml2htmlentities('m&#x26;m')
    'm&amp;m'
    >>> xml2htmlentities('&#65;')
    '&#65;'
    """
    # Group 1 captures a decimal reference, group 2 a hexadecimal one.
    xpr = re.compile(r'&#(?:(\d{1,5})|[xX]([0-9a-fA-F]{1,6}));')

    def substitute_entity(match):
        dec_digits, hex_digits = match.groups()
        if dec_digits is not None:
            cp = int(dec_digits)
        else:
            cp = int(hex_digits, 16)
        # Single dict lookup instead of scanning the key list and then
        # indexing the dict a second time.
        name = cp2n.get(cp)
        if name is None:
            return match.group(0)
        return '&%s;' % name

    return xpr.sub(substitute_entity, string)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment