Last active
September 22, 2022 18:30
-
-
Save treyhunner/3c517975af26ab1676ab9fb3a6c7c681 to your computer and use it in GitHub Desktop.
I implemented code to convert unicode code points to UTF-8, just for fun. Inspired by https://sethmlarson.dev/blog/utf-8
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Just some code that needlessly converts unicode codepoints to UTF-8.
Example:
$ python utf8ify.py U+2728
Bytes: 0xe2 0x9c 0xa8
Text: ✨
$ python utf8ify.py U+1F3F3 U+FE0F U+200D U+1F308
Bytes: 0xf0 0x9f 0x8f 0xb3 0xef 0xb8 0x8f 0xe2 0x80 0x8d 0xf0 0x9f 0x8c 0x88
Text: 🏳️‍🌈
""" | |
from collections import deque | |
import sys | |
# Bit masks and marker bits used when packing a codepoint into UTF-8 octets.
LAST_6_BITS = 0x3F  # 0b0011_1111: payload bits kept for each tail octet
LAST_7_BITS = 0x7F  # 0b0111_1111: low seven bits (not referenced in the code visible here)
TAIL_PREFIX = 0x80  # 0b1000_0000: OR'd onto every tail octet

# Marker bits OR'd onto the first octet, keyed by the total octet count.
HEADERS = {
    2: 0xC0,  # 0b1100_0000
    3: 0xE0,  # 0b1110_0000
    4: 0xF0,  # 0b1111_0000
}
def parse_codepoint_string(string):
    """
    Convert codepoint string to number (e.g. U+2728 to 0x2728).

    The ``U+`` prefix is optional and is matched case-insensitively, so
    ``"U+2728"``, ``"u+2728"`` and ``"2728"`` all give ``0x2728``.

    Raises ValueError when the remaining text is not a hexadecimal number.
    """
    # startswith() accepts a tuple, so both spellings are checked in one call.
    if string.startswith(("U+", "u+")):
        string = string[2:]
    return int(string, 16)
def codepoint_to_utf8(codepoint):
    r"""
    Convert codepoint number (e.g. 0x2728) to UTF-8 bytes.

    Values of up to 7 bits are returned as a single byte.  Larger values
    are split into 6-bit groups, lowest group last: each tail octet is
    ``TAIL_PREFIX`` plus one group, and the first octet is the
    ``HEADERS`` entry for the octet count plus the remaining high bits.

    Raises ValueError("Invalid codepoint") for anything UTF-8 cannot
    represent: negative numbers, values above U+10FFFF, and the
    surrogate range U+D800..U+DFFF.

    Example::

        >>> codepoint_to_utf8(0x2728)
        b'\xe2\x9c\xa8'
    """
    # Reject non-scalar values up front; encoding them would yield byte
    # sequences that a UTF-8 decoder refuses.
    if not 0 <= codepoint <= 0x10FFFF or 0xD800 <= codepoint <= 0xDFFF:
        raise ValueError("Invalid codepoint")

    bit_count = codepoint.bit_length()
    if bit_count <= 7:  # ASCII
        return bytes([codepoint])
    if bit_count <= 11:
        octet_count = 2
    elif bit_count <= 16:
        octet_count = 3
    else:  # at most 21 bits, guaranteed by the range check above
        octet_count = 4

    # Build from the last octet backwards, peeling six bits off each time.
    octets = deque()
    for _ in range(octet_count - 1):
        octets.appendleft(TAIL_PREFIX | (codepoint & LAST_6_BITS))
        codepoint >>= 6
    # Whatever bits are left belong in the first octet.
    octets.appendleft(HEADERS[octet_count] | codepoint)
    return bytes(octets)
def parse_codepoints(string):
    r"""
    Encode a whitespace-separated list of Unicode codepoints as UTF-8.

    Each item (e.g. U+2728) is parsed with ``parse_codepoint_string`` and
    encoded with ``codepoint_to_utf8``; the pieces are concatenated in
    input order.

    Example:

        >>> parse_codepoints("U+1F3F3 U+FE0F U+200D U+1F308")
        b'\xf0\x9f\x8f\xb3\xef\xb8\x8f\xe2\x80\x8d\xf0\x9f\x8c\x88'
    """
    chunks = []
    for token in string.split():
        number = parse_codepoint_string(token)
        chunks.append(codepoint_to_utf8(number))
    return b"".join(chunks)
def parse_codepoints_but_easy(string):
    """
    Same job as ``parse_codepoints``, but without re-implementing the wheel.

    Each whitespace-separated item (e.g. U+2728) is parsed with
    ``parse_codepoint_string``, turned into a character with ``chr`` and
    the resulting text is encoded once with the built-in UTF-8 codec.

    Raises ValueError for items that are not hexadecimal or that ``chr``
    rejects; lone surrogates fail in the encoder with UnicodeEncodeError
    (a ValueError subclass).
    """
    # bytes([n]) only works for n < 256 and is not UTF-8 above 127, so go
    # through str and let the codec do the encoding.
    text = "".join(
        chr(parse_codepoint_string(substring))
        for substring in string.split()
    )
    return text.encode("utf-8")
def codepoints_from_text(string):
    """
    Spell out every character of ``string`` as ``U+`` plus uppercase hex.

    The per-character labels are joined with single spaces; an empty
    string gives an empty string.

    Example:

        >>> codepoints_from_text("🌈🦆🇦🇶")
        'U+1F308 U+1F986 U+1F1E6 U+1F1F6'
    """
    labels = ("U+{:X}".format(ord(character)) for character in string)
    return " ".join(labels)
if __name__ == "__main__":
    # All command-line arguments together form one space-separated list
    # of codepoints.
    arguments = " ".join(sys.argv[1:])
    utf8_bytes = parse_codepoints(arguments)
    # hex() renders each byte exactly like the "#x" format spec ("0xe2").
    print("Bytes:", *map(hex, utf8_bytes))
    print("Text:", utf8_bytes.decode("utf-8"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment