Created
January 11, 2013 01:39
-
-
Save amintos/4507279 to your computer and use it in GitHub Desktop.
Module providing Hiragana-like encodings for arbitrary data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
KADOKUSEI Number and Byte-Stream Representation | |
(c) 2013 | Toni Mattis | MIT Licensed | |
This code allows abstract numbers, i.e. coordinates, public keys or hashes | |
to be represented in a pronounceable way. Composition is based on Hiragana. | |
Example: | |
>>> encode_number(718428) | |
'zusukyu' | |
>>> decode_number('zusukyu') | |
718428 | |
# Avoid phonetic similarities (k/g, s/z, t/d, v/w, ...) by setting | |
# the safe parameter to True. May increase code length: | |
>>> encode_number(718428, safe=True) | |
'tekyasho' | |
>>> decode_number('tekyasho', safe=True) | |
718428 | |
# Encode longer strings interpreted as binary data. Supports custom | |
# delimiters and phonetic safety: | |
>>> encode_string("This is not the string you're looking for!") | |
'ken idajiryon unyoryuryo oshahowen osanin tsutoryo osa ipebyagin zachun... | |
>>> decode_string('on-eryomyamu-gyapuchi-umisu-egukyahyun',
delimiter='-')
'H3LL0 W0RLD' | |
""" | |
# Syllable tables. The position of a token inside its list is the digit value
# it stands for, so the order of the entries must never change.

# Optional leading vowel; index 0 (the empty string) means "no leading vowel".
START_TOKENS = ['', 'a', 'i', 'u', 'e', 'o']

# Unvoiced syllables, one row per consonant.
MID_TOKENS = ' '.join([
    'ka ki ku ke ko kya kyu kyo',
    'sa shi su se so sha shu sho',
    'ta chi tsu te to cha chu cho',
    'na ni nu ne no nya nyu nyo',
    'ha hi fu he ho hya hyu hyo',
    'ma mi mu me mo mya myu myo',
    'ya yu yo',
    'ra ri ru re ro rya ryu ryo',
    'wa wi we wo',
]).split()

# Extended alphabet: the unvoiced syllables followed by their voiced variants.
EXT_TOKENS = MID_TOKENS + ' '.join([
    'ga gi gu ge go gya gyu gyo',
    'za ji zu ze zo ja ju jo',
    'da de do',
    'ba bi bu be bo bya byu byo',
    'pa pi pu pe po pya pyu pyo',
    'vu',
]).split()

# Optional trailing 'n'; index 0 (the empty string) means "no trailing n".
END_TOKENS = ['', 'n']

# Radix of each syllable alphabet.
MID_SIZE = len(MID_TOKENS)
EXT_SIZE = len(EXT_TOKENS)
def encode_number(n, safe=False):
    """Encode a non-negative integer as a pronounceable KADOKUSEI code.

    The number is decomposed as ``n = 12 * q + 6 * e + s``:

    * ``s`` (0..5) selects the optional leading vowel from START_TOKENS,
    * ``e`` (0..1) selects the optional trailing 'n' from END_TOKENS,
    * ``q`` is written as base-SIZE digits, least significant digit first,
      each digit being one syllable.

    n    -- the non-negative integer to encode.
    safe -- when true only the unvoiced syllables (MID_TOKENS) are used,
            otherwise the extended set (EXT_TOKENS) is used.

    Returns the code as a string; 0 is encoded as the empty string.
    Raises ValueError if n is negative.
    """
    if n < 0:
        raise ValueError("Cannot encode a negative number: %r" % (n,))

    tokens = MID_TOKENS if safe else EXT_TOKENS
    size = len(tokens)

    # divmod performs floor division on every Python version, so the
    # quotients stay integers and remain usable as list indices.
    n, start_index = divmod(n, 6)   # leading vowel unless divisible by 6
    n, end_index = divmod(n, 2)     # trailing 'n' if this quotient is odd

    # The more significant information is translated by syllabary.
    syllables = []
    while n > 0:
        n, digit = divmod(n, size)
        syllables.append(tokens[digit])

    return START_TOKENS[start_index] + ''.join(syllables) + END_TOKENS[end_index]
def INDEX(lst):
    """Return a dictionary mapping every item of lst to its index in lst."""
    return dict((item, position) for position, item in enumerate(lst))


# Reverse lookup tables (token -> digit value) used when decoding.
START_INVERTED = INDEX(START_TOKENS)
MID_INVERTED = INDEX(MID_TOKENS)
EXT_INVERTED = INDEX(EXT_TOKENS)
END_INVERTED = INDEX(END_TOKENS)
def decode_number(text, safe=False):
    """Decode a pronounceable KADOKUSEI code back into a number.

    text -- the code to decode.
    safe -- must match the value used for encoding: when true the syllables
            are looked up in the unvoiced table only (MID_INVERTED),
            otherwise in the extended table (EXT_INVERTED).

    Returns the decoded integer; an empty text decodes to 0.
    Raises ValueError if a part of the text is not a known syllable.
    """
    if not text:
        return 0

    table, size = (MID_INVERTED, MID_SIZE) if safe else (EXT_INVERTED, EXT_SIZE)

    # An optional leading vowel and an optional trailing 'n' frame the
    # syllables; when absent they count as the empty token.
    start = text[0] if text[0] in START_INVERTED else ''
    end = text[-1] if text[-1] in END_INVERTED else ''
    pos = len(start)
    limit = len(text) - len(end)

    result = 0
    base = 1
    while pos < limit:
        # Syllables are two or three letters long; try the short form first.
        for width in (2, 3):
            part = text[pos:pos + width]
            value = table.get(part)
            if value is not None:
                break
        else:
            raise ValueError("Unrecognized substring: %s" % part)
        # Digits are stored least significant first.
        result += value * base
        base *= size
        pos += len(part)

    return result * 12 + END_INVERTED[end] * 6 + START_INVERTED[start]
# ------------------------------------------------------------------------------ | |
# | |
# FULL STRING ENCODING | |
# | |
# ------------------------------------------------------------------------------ | |
# Side note:
# The following generator yields the byte lengths into which the input is
# chunked. Every length depends on the values of the preceding chunks, so the
# sequence of chunk lengths itself acts as an error detection code.
def _default_generator():
    """Yield chunk lengths between 1 and 3.

    The first length is obtained by advancing the generator once; every
    following length is obtained by sending in the integer value of the
    chunk just processed, which is mixed into the internal state.
    """
    state = 2
    while True:
        chunk_value = yield 1 + state % 3
        state = (state ** 2 + chunk_value + 1337) % 65537
def _start_chunker(chunk_generator):
    """Return (generator, first chunk size), defaulting to _default_generator."""
    generator = chunk_generator or _default_generator()
    return generator, next(generator)


def encode_string(s, delimiter=' ', safe=False, chunk_generator=None):
    """Encode arbitrary binary data into words of the KADOKUSEI code.

    The first word encodes the length of the data. Every following word
    encodes one chunk of bytes, packed least significant byte first; the
    chunk generator decides how many bytes each chunk holds.

    s               -- the data: a byte string or bytearray. A text string is
                       accepted as long as every character is below U+0100
                       and is converted one character per byte.
    delimiter       -- string placed between the words.
    safe            -- passed on to encode_number for every word, so that
                       only unvoiced syllables are used when true.
    chunk_generator -- optional generator yielding chunk sizes; it is
                       advanced once and then sent each chunk value.

    Returns the code as a string. Decode it with the same delimiter, safe
    flag and a fresh generator of the same kind.
    Raises UnicodeEncodeError (a ValueError) for text with wider characters.
    """
    if isinstance(s, (bytes, bytearray)):
        data = bytearray(s)
    else:
        # One byte per character; wider characters would overlap the
        # neighbouring byte when packed, so they are rejected here.
        data = bytearray(s.encode('latin-1'))

    generator, chunksize = _start_chunker(chunk_generator)
    total = len(data)
    words = [encode_number(total, safe=safe)]
    pos = 0
    while pos < total:
        # The last chunk may be shorter than requested.
        take = max(0, min(chunksize, total - pos))
        value = 0
        for shift, byte in enumerate(data[pos:pos + take]):
            value |= byte << (8 * shift)
        pos += take
        words.append(encode_number(value, safe=safe))
        chunksize = generator.send(value)
    return delimiter.join(words)


def decode_string(s, delimiter=' ', safe=False, chunk_generator=None):
    """Decode a string represented in KADOKUSEI code back into binary data.

    s               -- the code, as produced by encode_string.
    delimiter       -- string that separates the words of the code.
    safe            -- passed on to decode_number for every word; must match
                       the value used when encoding.
    chunk_generator -- optional generator yielding chunk sizes; must produce
                       the same sequence as the one used when encoding.

    Returns the decoded data as a byte string (str on Python 2, bytes on
    Python 3).
    Raises ValueError if a word is not valid code, if a word holds more
    bytes than its chunk size allows, or if the code carries fewer bytes
    than its length word announces.
    """
    generator, chunksize = _start_chunker(chunk_generator)
    words = iter(s.split(delimiter))
    size = decode_number(next(words), safe=safe)

    payload = bytearray()
    for word in words:
        value = decode_number(word, safe=safe)
        remaining = value
        for _ in range(chunksize):
            payload.append(remaining & 0xFF)
            remaining >>= 8
        # Left-over bits mean the word does not fit the expected chunk size.
        if remaining > 0:
            raise ValueError("Code deemed invalid")
        chunksize = generator.send(value)

    if len(payload) < size:
        raise ValueError("Code deemed invalid")
    # A short final chunk is padded with zero bytes; cut them off again.
    return bytes(payload[:size])
# ------------------------------------------------------------------------------ | |
# | |
# SELF TEST WHEN INVOKED STANDALONE | |
# | |
# ------------------------------------------------------------------------------ | |
if __name__ == "__main__":
    # Self test: round-trips random numbers through both syllable sets and
    # random byte strings through the string codec.
    import os
    import random
    import sys

    print("Running quick test...")
    for i in range(20):
        r = random.randint(2**i, 2**(i + 2))
        for safe in (False, True):
            code = encode_number(r, safe=safe)
            verdict = r == decode_number(code, safe=safe) and 'OK'
            print("%s %s %s" % (r, code, verdict))

    # Written without a newline so that the final "done." ends the same line.
    sys.stdout.write("done. Running stress test...")
    sys.stdout.flush()
    for i in range(100):
        for j in range(100):
            r = os.urandom(i)
            # Explicit check instead of assert so it also runs under -O.
            if decode_string(encode_string(r)) != r:
                raise AssertionError("round trip failed for %r" % (r,))
    print(" done.")
# ----------------------------------------------------------------------------- | |
# Copyright (C) 2013 | Toni Mattis | Licensed under the MIT License | |
# ----------------------------------------------------------------------------- | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment