Created
July 29, 2017 02:48
-
-
Save jvanasco/d0e3ae4aee6a99cef6cb1f09327e3cf5 to your computer and use it in GitHub Desktop.
This standardizes Unicode codepoints to HTML entities when possible.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
This translates Unicode codepoints in the input into *NAMED* HTML entities.

A future version may handle the other entities supported by the HTML spec as well.

This does not escape unsafe HTML into entities: lots of libraries already do
that, and this function is likely to be used in a pipeline that does that too.
It simply standardizes Unicode codepoints into HTML entities.
"""
from six.moves.html_entities import codepoint2name | |
# Build the translation table in two steps: first a trimmed copy of the
# standard codepoint -> entity-name mapping, then the codepoint -> u'&name;'
# table that `translate()` consumes.
#
# Four codepoints are deliberately left untranslated:
#   38  &  (&amp;)   this would break everything, as & is the entity control
#                    character itself
#   34  "  (&quot;)  indeed unsafe in HTML, but this module is not sanitizing
#   60  <  (&lt;)    likewise
#   62  >  (&gt;)    likewise
_UNTRANSLATED_CODEPOINTS = frozenset((38, 34, 60, 62))

# Custom version of codepoint2name without the codepoints listed above.
# Filtering (rather than copy-then-`del`) cannot raise KeyError and leaves the
# imported mapping untouched.
codepoint2name_custom = {
    codepoint: name
    for codepoint, name in codepoint2name.items()
    if codepoint not in _UNTRANSLATED_CODEPOINTS
}

# Maps each remaining codepoint (int) to its named entity as text,
# e.g. 8226 -> u'&bull;'.
unicode_to_entity = {
    codepoint: u'&%s;' % name
    for codepoint, name in codepoint2name_custom.items()
}
def unicode_to_entity_transation(input):
    """
    Replace each character that has an entry in ``unicode_to_entity`` with
    its named HTML entity.

    Characters without an entry in the module-level ``unicode_to_entity``
    table are passed through unchanged.

    :param input: the text to convert. It must be a text string (``unicode``
        on Python 2, ``str`` on Python 3), because the table is keyed by
        integer codepoints.
    :returns: a new string with the mapped characters replaced by
        ``&name;`` entities.
    """
    # NOTE: `input` shadows the builtin, but the parameter name is kept so
    # that existing keyword callers keep working.
    return input.translate(unicode_to_entity)


# Correctly spelled alias; the original (misspelled) name above is kept for
# backward compatibility.
unicode_to_entity_translation = unicode_to_entity_transation
if __name__ == '__main__':
    # Self-check: entities already present and the HTML control characters
    # (& " < >) must pass through untouched, raw characters with a named
    # entity must be converted, and characters without one must be kept.
    #
    # NOTE(review): the literals in the published copy of this check were
    # garbled (entities decoded, non-ASCII characters mis-encoded), so they are
    # rebuilt here with escapes. The trailing pair (U+2603) stands in for the
    # original characters that have no named entity -- confirm against the
    # author's original if exact parity matters.
    sample_text = u"""& &amp; &bull; \u2022 \u2663 # ' " &quot; ' &lt; &gt; <> \u2603\u2603&"""
    expectedout = u"""& &amp; &bull; &bull; &clubs; # ' " &quot; ' &lt; &gt; <> \u2603\u2603&"""
    generated = unicode_to_entity_transation(sample_text)
    # Raise explicitly instead of using a bare `assert`, so the check still
    # runs under `python -O`.
    if expectedout != generated:
        raise AssertionError(
            u'expected %r, generated %r' % (expectedout, generated)
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment