Skip to content

Instantly share code, notes, and snippets.

@prusnak
Last active April 1, 2023 18:32
Show Gist options
  • Save prusnak/f54f8f33503458ca1aa9883f71897072 to your computer and use it in GitHub Desktop.
Save prusnak/f54f8f33503458ca1aa9883f71897072 to your computer and use it in GitHub Desktop.
Quantization Benchmarks for GGML
import math
import random
import numpy as np
class Qx_0:
    """Symmetric block quantizer: one shared scale factor, signed codes.

    A block is mapped to integers in ``[-r, r]`` with
    ``r = 2 ** (bits - 1) - 1`` and reconstructed as ``sf * q``.
    """

    def __init__(self, name, bits):
        """Store the display ``name`` and the code width ``bits``.

        Codes are stored as ``np.int8`` and the scale is divided by
        ``2 ** (bits - 1) - 1``, so ``bits`` is expected to be in 2..8.
        """
        self.name = name
        self.bits = bits

    def qin(self, x):
        """Quantize the array ``x``.

        Returns:
            ``(sf, q)``: the scale factor and the ``np.int8`` codes.
        """
        r = 2 ** (self.bits - 1) - 1
        # Scale by the largest *magnitude*.  abs(max(x)) is too small whenever
        # the most negative value dominates, which pushes codes outside
        # [-r, r] and makes the int8 cast overflow.
        sf = np.max(np.abs(x)) / r
        if sf == 0:
            # All-zero block: every code is 0, so any non-zero scale
            # reconstructs it exactly; this avoids a 0/0 division.
            sf = type(sf)(1)
        q = np.round(x / sf).astype(np.int8)
        return sf, q

    def qout(self, sf, q):
        """Reconstruct approximate values from scale ``sf`` and codes ``q``."""
        return sf * q
class Qx_1:
    """Asymmetric block quantizer: shared offset and scale, unsigned codes.

    A block is mapped to integers in ``[0, r]`` with ``r = 2 ** bits - 1``
    and reconstructed as ``o + sf * q``.
    """

    def __init__(self, name, bits):
        """Store the display ``name`` and the code width ``bits``.

        Codes are stored as ``np.uint8``, so ``bits`` is expected to be
        at most 8.
        """
        self.name = name
        self.bits = bits

    def qin(self, x):
        """Quantize the array ``x``.

        Returns:
            ``(o, sf, q)``: the offset (block minimum), the scale factor and
            the ``np.uint8`` codes.
        """
        r = 2**self.bits - 1
        o = np.min(x)
        sf = (np.max(x) - o) / r
        if sf == 0:
            # Constant block: every code is 0 and ``o`` alone reconstructs it,
            # so use a unit scale instead of dividing 0 by 0.
            sf = type(sf)(1)
        q = np.round((x - o) / sf).astype(np.uint8)
        return o, sf, q

    def qout(self, o, sf, q):
        """Reconstruct approximate values from offset, scale and codes."""
        return o + sf * q
# Scale-only quantizers (Qx_0) at 8, 4 and 2 bits.
Q8_0, Q4_0, Q2_0 = Qx_0("Q8_0", 8), Qx_0("Q4_0", 4), Qx_0("Q2_0", 2)
# Offset-plus-scale quantizers (Qx_1) at 8, 4 and 2 bits.
Q8_1, Q4_1, Q2_1 = Qx_1("Q8_1", 8), Qx_1("Q4_1", 4), Qx_1("Q2_1", 2)
def RMSE(a, b):
    """Return the root-mean-square error between arrays ``a`` and ``b``.

    Both inputs must have the same length; this is checked with an assertion.
    """
    assert len(a) == len(b)
    residual = a - b
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)
def benchmark(method, iter=100_000, QK=32):
    """Return the mean round-trip RMSE of ``method`` over random blocks.

    Each of the ``iter`` trials draws ``QK`` samples from a standard normal
    distribution, scales them by 65536, clips them to ``[-65536, 65536]``,
    passes the block through ``method.qin`` and ``method.qout``, and measures
    the RMSE between the block and its reconstruction.
    """
    limit = 65536
    total = 0
    for _trial in range(iter):
        block = np.clip(np.random.normal(0, 1, QK) * limit, -limit, limit)
        packed = method.qin(block)
        restored = method.qout(*packed)
        total += RMSE(block, restored)
    return total / iter
# Benchmark every quantizer and print its name next to its average RMSE.
for quantizer in (Q8_0, Q4_0, Q2_0, Q8_1, Q4_1, Q2_1):
    score = benchmark(quantizer)
    print(quantizer.name, score)
@sw
Copy link

sw commented Apr 1, 2023

Is the data really clipped that much (to one standard deviation)? And why 2**16 specifically?

Anyway here's what I used for my experiments with Q2, which I shall call Q2_2 to avoid confusion:

class Qx_2:
    """Signed-maximum block quantizer: one shared (possibly negative) scale.

    The element of largest magnitude is mapped to the most negative code
    ``-(2 ** (bits - 1))``; the sign of the scale factor compensates, so
    reconstruction is ``sf * q``.
    """

    def __init__(self, name, bits):
        """Store the display ``name`` and the code width ``bits``.

        Codes are stored as ``np.int8``, so ``bits`` is expected to be
        at most 8.
        """
        self.name = name
        self.bits = bits

    def qin(self, x):
        """Quantize the NumPy array ``x``.

        Returns:
            ``(sf, q)``: the signed scale factor and the ``np.int8`` codes.
        """
        # calculate the signed maximum (= value of largest magnitude, without applying abs),
        # then assign the value -(2^(k-1)) to that maximum.
        # The sign of the shared scaling factor is adjusted to give the right sign of the result.
        r = -(2 ** (self.bits - 1))
        sf = x.flat[np.abs(x).argmax()] / r
        if sf == 0:
            # All-zero block: every code is 0, so any non-zero scale
            # reconstructs it exactly; this avoids a 0/0 division.
            sf = type(sf)(1)
        # contrary to the other methods, we may get +2^(k-1) here, so we need to clip
        q = np.round(x / sf).clip(r, -r - 1).astype(np.int8)
        return sf, q

    def qout(self, sf, q):
        """Reconstruct approximate values from scale ``sf`` and codes ``q``."""
        return sf * q

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment