Skip to content

Instantly share code, notes, and snippets.

@crackwitz
Created March 3, 2015 16:53
Show Gist options
  • Save crackwitz/f69883d41735988d28e5 to your computer and use it in GitHub Desktop.
Save crackwitz/f69883d41735988d28e5 to your computer and use it in GitHub Desktop.
import pprint; pp = pprint.pprint
def encode(s, words=None):
if words is None:
words = sorted(set(s))
seq = []
while s:
mlen, index = max((len(w), i) for i,w in enumerate(words) if s.startswith(w))
seq.append(index)
s = s[mlen:]
if not s: break
# extend just matched word
candidate = words[index] + s[0]
if candidate not in words:
words.append(candidate)
return (words, seq)
def decode(indices, words):
# letter for literal letter
# number for position in decoded string
decoded = ""
words = list(words)
def isflat(word):
return all(isinstance(c, str) for c in word)
def update_char(x):
if isinstance(x, str): return x
if x < len(decoded): return decoded[x]
return x
def update_word(word):
res = map(update_char, word)
if isflat(res): res = ''.join(res)
return res
for index in indices:
assert isflat(words[index][:1]), words[index]
decoded += words[index][0] # this much is sure
# update all words with newly known chars
words = map(update_word, words)
assert isflat(words[index]), words[index]
decoded += words[index][1:]
# this word isn't in the dict yet (or else the *encoder* would have matched this)
words.append(
[x for x in words[index]] + [len(decoded)]
)
return words, decoded
s = 'bbbacacbaacb'
print "encode", s
(words1, indices) = encode(s)
print "dictionary:"
pp(dict(enumerate(words1)))
print "indices:"
pp(indices)
print
print "decode", indices
(words2, t) = decode(indices, 'abc')
print "decoded:", ("OK" if s == t else "differs")
pp(t)
print "dictionary:"
pp(dict(enumerate(words2)))
encode bbbacacbaacb
dictionary:
{0: 'a',
1: 'b',
2: 'c',
3: 'bb',
4: 'bba',
5: 'ac',
6: 'ca',
7: 'acb',
8: 'ba',
9: 'aa'}
indices:
[1, 3, 0, 2, 5, 1, 0, 7]
decode [1, 3, 0, 2, 5, 1, 0, 7]
decoded: OK
'bbbacacbaacb'
dictionary:
{0: 'a',
1: 'b',
2: 'c',
3: 'bb',
4: 'bba',
5: 'ac',
6: 'ca',
7: 'acb',
8: 'ba',
9: 'aa',
10: ['a', 'c', 'b', 12]}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment