Created
January 29, 2024 23:06
-
-
Save onecrayon/402f45446ae78ee7ba00d4cc36ea9a93 to your computer and use it in GitHub Desktop.
BaseEmoji CLI
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""BaseEmoji CLI
Inspired by this lovely little gem: https://github.com/amoallim15/base-emoji
But using something a little closer to the base64 algorithm (effectively base128, using emojis).
Comparison of a 13 character/13 byte string in two encodings:
$ base-64 <<< 'Hello, world!'
SGVsbG8sIHdvcmxkIQo=
20 characters in 20 bytes.
$ base-emoji <<< 'Hello, world!'
😤😙😭🙆🤣😼🤘😠😻🤛🤮😦🤣😐🙂😊
16 characters in 64 bytes. What a savings!
This script works by breaking up normal 8-bit bytes into 7-bit bytes and translating those into
emoji via a lookup table (very similar to the base64 algorithm, except using base128). Can be
used for both text and arbitrary binary data.
"""
import argparse
import os
import sys
# LEFTOVER_MASK[n] keeps only the low n bits of a value (n = 0..7). The encoder
# and decoder index it with the number of bits being carried over to the next
# output unit.
LEFTOVER_MASK = (
    0,
    0b1,
    0b11,
    0b111,
    0b1111,
    0b11111,
    0b111111,
    0b1111111,
)

# Marker emoji appended to encoded output when the last 7-bit group had to be
# filled out; the key is the number of padding bits (1-6) that were added.
# These code points lie outside the ranges used for data characters.
PADDING_TO_EMOJI = {
    1: chr(0x1F447),
    2: chr(0x1F448),
    3: chr(0x1F449),
    4: chr(0x1F44A),
    5: chr(0x1F44D),
    6: chr(0x1F44E),
}

# Reverse lookup (marker emoji -> number of padding bits) used when decoding.
EMOJI_TO_PADDING = {v: k for k, v in PADDING_TO_EMOJI.items()}
def to_emoji(value: int) -> str:
    """Return the emoji that represents a 7-bit value.

    The 128 possible values are spread over four contiguous code point ranges:

    * 0 - 79: U+1F600 - U+1F64F
    * 80 - 111: U+1F910 - U+1F92F
    * 112 - 122: U+1F970 - U+1F97A
    * 123 - 127: U+1F9D0 - U+1F9D4

    Args:
        value: Integer in the range 0-127.

    Returns:
        A single-character string.

    Raises:
        ValueError: If ``value`` is outside 0-127. Without this check an
            out-of-range value would silently map to an unrelated code point.
    """
    if not 0 <= value <= 127:
        raise ValueError(f"value must be in range(0, 128), got {value!r}")
    if value <= 79:
        return chr(0x1F600 + value)
    if value <= 111:
        return chr(0x1F910 + (value - 80))
    if value <= 122:
        return chr(0x1F970 + (value - 112))
    return chr(0x1F9D0 + (value - 123))
def to_bits(char: str) -> int:
    """Return the 7-bit value (0-127) represented by a data emoji.

    The four code point ranges accepted here are:

    * U+1F600 - U+1F64F: 0 - 79
    * U+1F910 - U+1F92F: 80 - 111
    * U+1F970 - U+1F97A: 112 - 122
    * U+1F9D0 - U+1F9D4: 123 - 127

    Args:
        char: A single-character string.

    Returns:
        The integer value of the character.

    Raises:
        ValueError: If ``char`` is not inside any of the four ranges (for
            example whitespace, ASCII text, or a padding marker). Without this
            check such characters would yield meaningless, usually negative,
            numbers.
    """
    value = ord(char)
    if 0x1F600 <= value <= 0x1F64F:
        return value - 0x1F600
    if 0x1F910 <= value <= 0x1F92F:
        return (value - 0x1F910) + 80
    if 0x1F970 <= value <= 0x1F97A:
        return (value - 0x1F970) + 112
    if 0x1F9D0 <= value <= 0x1F9D4:
        return (value - 0x1F9D0) + 123
    raise ValueError(f"{char!r} is not a BaseEmoji data character")
def encode_base_emoji(data: str | bytes) -> bytes:
    """Encode text or binary data as UTF-8 encoded BaseEmoji.

    The input bytes are treated as one continuous bit stream, cut into 7-bit
    groups, and each group is converted with ``to_emoji``. When the final
    group is short it is filled out with zero bits on the right and one extra
    marker emoji from ``PADDING_TO_EMOJI`` is appended to record how many
    bits were added.

    Args:
        data: Bytes to encode; a ``str`` is first encoded as UTF-8.

    Returns:
        The resulting emoji string, encoded as UTF-8. Empty input yields
        ``b""``.
    """
    if isinstance(data, str):
        data = data.encode("utf-8")

    # Break the data down into 7-bit chunks
    leftover_bits = 0
    leftover_bits_length = 0
    bit_slices = []
    for byte in data:
        # Top up the carried-over bits to a full 7 with the high bits of this
        # byte; whatever is left of the byte is carried to the next round.
        extract_amount = 7 - leftover_bits_length
        leftover_bits_length = 8 - extract_amount
        bit_slices.append((leftover_bits << extract_amount) + (byte >> leftover_bits_length))
        leftover_bits = byte & LEFTOVER_MASK[leftover_bits_length]
        # If we had a full 7 bits leftover, append that and reset
        if leftover_bits_length == 7:
            bit_slices.append(leftover_bits)
            leftover_bits = 0
            leftover_bits_length = 0

    # If we have any leftover, note how many bits of padding were added
    padded_bit_length = 0
    if leftover_bits_length > 0:
        padded_bit_length = 7 - leftover_bits_length
        bit_slices.append(leftover_bits << padded_bit_length)

    # Now convert our bit slice integers into emojis
    emoji_str = "".join(to_emoji(x) for x in bit_slices)

    # And finally append our padding character
    if padded_bit_length:
        emoji_str = emoji_str + PADDING_TO_EMOJI[padded_bit_length]
    return emoji_str.encode("utf-8")
def decode_base_emoji(data: str | bytes) -> bytes:
    """Decode BaseEmoji text back into the original bytes.

    Each data emoji contributes 7 bits (via ``to_bits``); the bits are
    reassembled into 8-bit bytes. If the input ends with a marker emoji from
    ``EMOJI_TO_PADDING``, the character before it carries only
    ``7 - padding`` meaningful bits, which complete the final byte.

    Whitespace anywhere in the input is ignored, so text that picked up a
    trailing newline (``echo``, shell here-strings) or was wrapped across
    lines still decodes.

    Args:
        data: BaseEmoji text; ``bytes`` are decoded as UTF-8 first.

    Returns:
        The decoded bytes. Empty (or whitespace-only) input yields ``b""``.

    Raises:
        ValueError: If a padding marker has no data character before it, or
            if the padded final character does not line up with the bits
            still pending from the preceding characters.
    """
    if isinstance(data, bytes):
        data = data.decode("utf-8")
    # Whitespace is never part of the alphabet; a trailing newline would
    # otherwise hide the padding marker and corrupt the result.
    data = "".join(data.split())
    if not data:
        return b""

    # Check for a padding suffix
    if data[-1] in EMOJI_TO_PADDING:
        if len(data) < 2:
            raise ValueError("padding marker has no data character before it")
        final_bits_padding = EMOJI_TO_PADDING[data[-1]]
        final_bits = to_bits(data[-2]) >> final_bits_padding
        final_bits_length = 7 - final_bits_padding
        data = data[:-2]
    else:
        final_bits = 0
        final_bits_length = 0

    # Convert emojis into their 7-bit equivalents, and recombine into 8-bit bytes
    leftover_bits = 0
    leftover_bits_length = 0
    data_bytes = bytearray()
    for char in data:
        value = to_bits(char)
        if leftover_bits_length == 0:
            # Nothing pending: 7 bits alone cannot form a byte yet.
            leftover_bits_length = 7
            leftover_bits = value
            continue
        # Complete the pending bits to 8 with the high bits of this value and
        # keep the remaining low bits for the next byte.
        shift_amount = 8 - leftover_bits_length
        leftover_bits_length = 7 - shift_amount
        data_bytes.append((leftover_bits << shift_amount) + (value >> leftover_bits_length))
        leftover_bits = value & LEFTOVER_MASK[leftover_bits_length]

    # Add our final bits
    if final_bits_length:
        # Well-formed input always has exactly enough pending bits for the
        # padded character to finish one byte.
        if leftover_bits_length + final_bits_length != 8:
            raise ValueError("truncated or malformed BaseEmoji data")
        data_bytes.append((leftover_bits << final_bits_length) + final_bits)

    # And return our bytes object
    return bytes(data_bytes)
# Command-line entry point: encode (default) or decode the positional argument
# or, when that is omitted, whatever is piped in on stdin.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        prog="BaseEmoji",
        description="Encodes and decodes arbitrary data to BaseEmoji. Like base64, but friendly!",
    )
    parser.add_argument(
        "data",
        nargs="?",
        default=None,
        help="The string or bytes you wish to encode to or decode from BaseEmoji (or omit and pass via stdin).",
    )
    # -e and -d cannot be combined; with neither flag the script encodes.
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-e", "--encode", action="store_true")
    group.add_argument("-d", "--decode", action="store_true")
    args = parser.parse_args()

    # Only consult stdin when no argument was given and stdin is not an
    # interactive terminal; read it as raw bytes so binary input survives.
    if args.data is None and not sys.stdin.isatty():
        data = sys.stdin.buffer.read()
    else:
        data = args.data
    if not data:
        parser.error("data is required as positional argument or stdin")

    # Emit raw bytes through stdout's own binary buffer rather than wrapping
    # its file descriptor in a second file object that would close fd 1.
    if args.decode:
        sys.stdout.buffer.write(decode_base_emoji(data))
    else:
        sys.stdout.buffer.write(encode_base_emoji(data))
    sys.stdout.buffer.flush()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment