Created
March 3, 2015 16:53
-
-
Save crackwitz/f69883d41735988d28e5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pprint; pp = pprint.pprint | |
def encode(s, words=None): | |
if words is None: | |
words = sorted(set(s)) | |
seq = [] | |
while s: | |
mlen, index = max((len(w), i) for i,w in enumerate(words) if s.startswith(w)) | |
seq.append(index) | |
s = s[mlen:] | |
if not s: break | |
# extend just matched word | |
candidate = words[index] + s[0] | |
if candidate not in words: | |
words.append(candidate) | |
return (words, seq) | |
def decode(indices, words): | |
# letter for literal letter | |
# number for position in decoded string | |
decoded = "" | |
words = list(words) | |
def isflat(word): | |
return all(isinstance(c, str) for c in word) | |
def update_char(x): | |
if isinstance(x, str): return x | |
if x < len(decoded): return decoded[x] | |
return x | |
def update_word(word): | |
res = map(update_char, word) | |
if isflat(res): res = ''.join(res) | |
return res | |
for index in indices: | |
assert isflat(words[index][:1]), words[index] | |
decoded += words[index][0] # this much is sure | |
# update all words with newly known chars | |
words = map(update_word, words) | |
assert isflat(words[index]), words[index] | |
decoded += words[index][1:] | |
# this word isn't in the dict yet (or else the *encoder* would have matched this) | |
words.append( | |
[x for x in words[index]] + [len(decoded)] | |
) | |
return words, decoded | |
s = 'bbbacacbaacb' | |
print "encode", s | |
(words1, indices) = encode(s) | |
print "dictionary:" | |
pp(dict(enumerate(words1))) | |
print "indices:" | |
pp(indices) | |
print "decode", indices | |
(words2, t) = decode(indices, 'abc') | |
print "decoded:", ("OK" if s == t else "differs") | |
pp(t) | |
print "dictionary:" | |
pp(dict(enumerate(words2))) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
encode bbbacacbaacb | |
dictionary: | |
{0: 'a', | |
1: 'b', | |
2: 'c', | |
3: 'bb', | |
4: 'bba', | |
5: 'ac', | |
6: 'ca', | |
7: 'acb', | |
8: 'ba', | |
9: 'aa'} | |
indices: | |
[1, 3, 0, 2, 5, 1, 0, 7] | |
decode [1, 3, 0, 2, 5, 1, 0, 7] | |
decoded: OK | |
'bbbacacbaacb' | |
dictionary: | |
{0: 'a', | |
1: 'b', | |
2: 'c', | |
3: 'bb', | |
4: 'bba', | |
5: 'ac', | |
6: 'ca', | |
7: 'acb', | |
8: 'ba', | |
9: 'aa', | |
10: ['a', 'c', 'b', 12]} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment