Last active
September 11, 2018 21:03
-
-
Save notwa/43dae410db99887ab09f1f22a7ec3ff2 to your computer and use it in GitHub Desktop.
Arithmetic coding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Arithmetic coding compressor and decompressor for binary strings. | |
# via: http://www.inference.org.uk/mackay/python/compress/ac/ac_encode.py | |
# main page: http://www.inference.org.uk/mackay/python/compress/ | |
# this has been cleaned up (passes pycodestyle) and ported to python 3. | |
# default prior distribution | |
BETA0 = 1 | |
BETA1 = 1 | |
M = 30 | |
ONE = 1 << M | |
HALF = 1 << (M - 1) | |
QUARTER = 1 << (M - 2) | |
THREEQU = HALF + QUARTER | |
def clear(c, charstack): | |
# print out character c, and other queued characters | |
a = repr(c) + repr(1 - c) * charstack[0] | |
charstack[0] = 0 | |
return a | |
def encode(string, c0=BETA0, c1=BETA1, adaptive=True): | |
assert c0 > 0 | |
assert c1 > 0 | |
b = ONE | |
a = 0 | |
if not adaptive: | |
p0 = c0 / (c0 + c1) | |
ans = "" | |
charstack = [0] # how many undecided characters remain to print | |
for c in string: | |
w = b - a | |
if adaptive: | |
cT = c0 + c1 | |
p0 = c0 / cT | |
boundary = a + int(p0 * w) | |
# these warnings mean that some of the probabilities | |
# requested by the probabilistic model are so small | |
# (compared to our integers) that we had to round them up | |
# to bigger values. | |
if boundary == a: | |
boundary += 1 | |
print("warningA") | |
if boundary == b: | |
boundary -= 1 | |
print("warningB") | |
if c == '1': | |
a = boundary | |
if adaptive: | |
c1 += 1 | |
elif c == '0': | |
b = boundary | |
if adaptive: | |
c0 += 1 | |
# ignore other characters | |
while a >= HALF or b <= HALF: # output bits | |
if a >= HALF: | |
ans += clear(1, charstack) | |
a -= HALF | |
b -= HALF | |
else: | |
ans += clear(0, charstack) | |
a *= 2 | |
b *= 2 | |
assert a <= HALF | |
assert b >= HALF | |
assert a >= 0 | |
assert b <= ONE | |
# if the gap a-b is getting small, rescale it | |
while a > QUARTER and b < THREEQU: | |
charstack[0] += 1 | |
a *= 2 | |
b *= 2 | |
a -= HALF | |
b -= HALF | |
assert a <= HALF | |
assert b >= HALF | |
assert a >= 0 | |
assert b <= ONE | |
# terminate | |
if HALF - a > b - HALF: | |
w = HALF - a | |
ans += clear(0, charstack) | |
while w < HALF: | |
ans += clear(1, charstack) | |
w *= 2 | |
else: | |
w = b - HALF | |
ans += clear(1, charstack) | |
while w < HALF: | |
ans += clear(0, charstack) | |
w *= 2 | |
return ans | |
def decode(string, N, c0=BETA0, c1=BETA1, adaptive=True): | |
# must supply N, the number of source characters remaining. | |
assert c0 > 0 | |
assert c1 > 0 | |
b = ONE | |
a = 0 | |
model_needs_updating = True | |
if not adaptive: | |
p0 = c0 / (c0 + c1) | |
ans = "" | |
u = 0 | |
v = ONE | |
for c in string: | |
if N <= 0: | |
break # out of the string-reading loop | |
assert N > 0 | |
# (u,v) is the current "encoded alphabet" binary interval, | |
# and halfway is its midpoint. | |
# (a,b) is the current "source alphabet" interval, | |
# and boundary is the "midpoint" | |
assert u >= 0 | |
assert v <= ONE | |
halfway = u + (v - u) / 2 | |
if c == '1': | |
u = halfway | |
elif c == '0': | |
v = halfway | |
# Read bits until we can decide what the source symbol was. | |
# Then emulate the encoder's computations, | |
# and tie (u,v) to tag along for the ride. | |
while 1: # do-while | |
if model_needs_updating: | |
w = b - a | |
if adaptive: | |
cT = c0 + c1 | |
p0 = c0 / cT | |
boundary = a + int(p0 * w) | |
if boundary == a: | |
boundary += 1 | |
print("warningA") | |
if boundary == b: | |
boundary -= 1 | |
print("warningB") | |
model_needs_updating = False | |
if boundary <= u: | |
ans += "1" | |
if adaptive: | |
c1 += 1 | |
a = boundary | |
model_needs_updating = True | |
N -= 1 | |
elif boundary >= v: | |
ans += "0" | |
if adaptive: | |
c0 += 1 | |
b = boundary | |
model_needs_updating = True | |
N -= 1 | |
else: | |
# not enough bits have yet been read to know the decision. | |
pass | |
# emulate outputting of bits by the encoder, | |
# and tie (u,v) to tag along for the ride. | |
while a >= HALF or b <= HALF: | |
if a >= HALF: | |
a -= HALF | |
b -= HALF | |
u -= HALF | |
v -= HALF | |
a *= 2 | |
b *= 2 | |
u *= 2 | |
v *= 2 | |
model_needs_updating = True | |
assert a <= HALF | |
assert b >= HALF | |
assert a >= 0 | |
assert b <= ONE | |
# if the gap a-b is getting small, rescale it | |
while a > QUARTER and b < THREEQU: | |
a *= 2 | |
b *= 2 | |
u *= 2 | |
v *= 2 | |
a -= HALF | |
b -= HALF | |
u -= HALF | |
v -= HALF | |
# this is the condition for this do-while loop | |
if not (N > 0 and model_needs_updating): | |
break | |
return ans | |
def test(): | |
tests = [ | |
"1010", | |
"111", | |
"00001000000000000000", | |
"1", | |
"10", | |
"01", | |
"0", | |
"0000000", | |
""" | |
00000000000000010000000000000000 | |
00000000000000001000000000000000 | |
00011000000 | |
""", | |
] | |
for s in tests: | |
# an ugly way to remove whitespace and newlines from the test strings: | |
s = "".join(s.split()) | |
N = len(s) # required for decoding later. | |
print("original:", s) | |
e = encode(s, c0=10, c1=1) | |
print("encoded: ", e) | |
ds = decode(e, N, c0=10, c1=1) | |
print("decoded: ", ds) | |
if ds != s: | |
print("FAIL") | |
else: | |
print("PASS") | |
print() | |
if __name__ == '__main__': | |
test() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment