Skip to content

Instantly share code, notes, and snippets.

@inky
Created July 7, 2011 12:31
Show Gist options
  • Save inky/1069404 to your computer and use it in GitHub Desktop.
Save inky/1069404 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Reads a series of HTML entities from stdin, and outputs a table of the
corresponding Unicode characters.
"""
import sys

try:
    from htmlentitydefs import name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint  # Python 3
def codepoint(entity):
    """Return the Unicode code point named by an HTML entity, or None.

    ``entity`` is a complete reference, including the leading ``&`` and the
    trailing ``;``. Three forms are understood:

    * decimal numeric references such as ``&#8230;``
    * hexadecimal numeric references such as ``&#x2026;`` or ``&#X2026;``
    * named references such as ``&hellip;``, looked up in ``name2codepoint``

    Returns the code point as an int, or None when the name is unknown or
    the numeric part is missing, malformed or negative.
    """
    name = entity[1:-1]  # strip the surrounding '&' and ';'
    if not name.startswith('#'):
        return name2codepoint.get(name)

    # Slice instead of indexing so a bare '&#;' cannot raise IndexError.
    if name[1:2] in ('x', 'X'):
        digits, base = name[2:], 16
    else:
        digits, base = name[1:], 10

    try:
        value = int(digits, base)
    except ValueError:
        # Empty or non-numeric digits: report "no code point", exactly like
        # an unknown entity name, rather than crashing the caller.
        return None
    return value if value >= 0 else None
def tokens(input):
    """Yield the whitespace-separated words of ``input`` shaped like entities.

    A word qualifies when it begins with ``&`` and ends with ``;``. The
    text is split immediately; the filtering happens lazily as the returned
    generator is consumed.
    """
    def looks_like_entity(word):
        return word.startswith('&') and word.endswith(';')

    candidates = input.split()
    return (word for word in candidates if looks_like_entity(word))
def main():
    """Print a table of characters for the HTML entities read from stdin.

    All of stdin is read, split into entity tokens, and one line is printed
    per token: the corresponding character, a tab, and the token itself.
    Tokens that do not resolve to a character are shown as ``n/a``.
    """
    # ``unichr`` exists only on Python 2; ``chr`` is the Python 3 equivalent.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    for token in tokens(sys.stdin.read()):
        try:
            uc = to_char(codepoint(token))
        except (TypeError, ValueError, OverflowError):
            # TypeError: codepoint() returned None (no such entity).
            # ValueError / OverflowError: the numeric reference is malformed
            # or lies outside the range the interpreter can turn into a
            # character.
            uc = 'n/a'
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("%s\t%s" % (uc, token))
# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
$ python entity2unicode.py <<< '&#8230; &#x2026; &hellip; &troll; &#9731;'
… &#8230;
… &#x2026;
… &hellip;
n/a &troll;
☃ &#9731;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment