crackwitz · March 3, 2015 16:53
diff --git a/dictcoder.py b/dictcoder.py
 import pprint; pp = pprint.pprint


 def encode(s, words=None):
 	if words is None:
 		words = sorted(set(s))

 	seq = []
 	
 	while s:
 		mlen, index = max((len(w), i) for i,w in enumerate(words) if s.startswith(w))
 		
 		seq.append(index)
 		s = s[mlen:]

 		if not s: break
 		
 		# extend just matched word
 		candidate = words[index] + s[0]
 		if candidate not in words:
 			words.append(candidate)
 	
 	return (words, seq)

 def decode(indices, words):
 	# letter for literal letter
 	# number for position in decoded string
 	
 	decoded = ""
 	
 	words = list(words)
 	
 	def isflat(word):
 		return all(isinstance(c, str) for c in word)
 	
 	def update_char(x):
 		if isinstance(x, str): return x
 		if x < len(decoded): return decoded[x]
 		return x
 		
 	def update_word(word):
 		res = map(update_char, word)
 		if isflat(res): res = ''.join(res)
 		return res

 	for index in indices:
 		assert isflat(words[index][:1]), words[index]
 		
 		decoded += words[index][0] # this much is sure
 		
 		# update all words with newly known chars
 		words = map(update_word, words)

 		assert isflat(words[index]), words[index]

 		decoded += words[index][1:]
 		
 		# this word isn't in the dict yet (or else the *encoder* would have matched this)
 		words.append(
 			[x for x in words[index]] + [len(decoded)]
 		)

 	return words, decoded

 s = 'bbbacacbaacb'

 print "encode", s
 (words1, indices) = encode(s)
 print "dictionary:"
 pp(dict(enumerate(words1)))
 print "indices:"
 pp(indices)
 print

 print "decode", indices
 (words2, t) = decode(indices, 'abc')
 print "decoded:", ("OK" if s == t else "differs")
 pp(t)
 print "dictionary:"
 pp(dict(enumerate(words2)))
diff --git a/output b/output
 encode bbbacacbaacb
 dictionary:
 {0: 'a',
 1: 'b',
 2: 'c',
 3: 'bb',
 4: 'bba',
 5: 'ac',
 6: 'ca',
 7: 'acb',
 8: 'ba',
 9: 'aa'}
 indices:
 [1, 3, 0, 2, 5, 1, 0, 7]

 decode [1, 3, 0, 2, 5, 1, 0, 7]
 decoded: OK
 'bbbacacbaacb'
 dictionary:
 {0: 'a',
 1: 'b',
 2: 'c',
 3: 'bb',
 4: 'bba',
 5: 'ac',
 6: 'ca',
 7: 'acb',
 8: 'ba',
 9: 'aa',
 10: ['a', 'c', 'b', 12]}
	import pprint; pp = pprint.pprint


	def encode(s, words=None):
	if words is None:
	words = sorted(set(s))

	seq = []

	while s:
	mlen, index = max((len(w), i) for i,w in enumerate(words) if s.startswith(w))

	seq.append(index)
	s = s[mlen:]

	if not s: break

	# extend just matched word
	candidate = words[index] + s[0]
	if candidate not in words:
	words.append(candidate)

	return (words, seq)

	def decode(indices, words):
	# letter for literal letter
	# number for position in decoded string

	decoded = ""

	words = list(words)

	def isflat(word):
	return all(isinstance(c, str) for c in word)

	def update_char(x):
	if isinstance(x, str): return x
	if x < len(decoded): return decoded[x]
	return x

	def update_word(word):
	res = map(update_char, word)
	if isflat(res): res = ''.join(res)
	return res

	for index in indices:
	assert isflat(words[index][:1]), words[index]

	decoded += words[index][0] # this much is sure

	# update all words with newly known chars
	words = map(update_word, words)

	assert isflat(words[index]), words[index]

	decoded += words[index][1:]

	# this word isn't in the dict yet (or else the encoder would have matched this)
	words.append(
	[x for x in words[index]] + [len(decoded)]
	)

	return words, decoded

	s = 'bbbacacbaacb'

	print "encode", s
	(words1, indices) = encode(s)
	print "dictionary:"
	pp(dict(enumerate(words1)))
	print "indices:"
	pp(indices)
	print

	print "decode", indices
	(words2, t) = decode(indices, 'abc')
	print "decoded:", ("OK" if s == t else "differs")
	pp(t)
	print "dictionary:"
	pp(dict(enumerate(words2)))
	encode bbbacacbaacb
	dictionary:
	{0: 'a',
	1: 'b',
	2: 'c',
	3: 'bb',
	4: 'bba',
	5: 'ac',
	6: 'ca',
	7: 'acb',
	8: 'ba',
	9: 'aa'}
	indices:
	[1, 3, 0, 2, 5, 1, 0, 7]

	decode [1, 3, 0, 2, 5, 1, 0, 7]
	decoded: OK
	'bbbacacbaacb'
	dictionary:
	{0: 'a',
	1: 'b',
	2: 'c',
	3: 'bb',
	4: 'bba',
	5: 'ac',
	6: 'ca',
	7: 'acb',
	8: 'ba',
	9: 'aa',
	10: ['a', 'c', 'b', 12]}