Various methods of finding semi-sparse K-means clusterings
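The setting, as implemented below: given n points X in R^(n x d), find k centers and assign each point l of them (repeats allowed), so that the sum of a point's l centers approximates the point; the loss is ||X - sum_j centers[labels[:, j]]||_F. With l = 1 this is ordinary k-means; for larger l the code alternates a label-update step, selected by the `policy` argument ('top', 'greedy', 'greedy2' or 'rvq'), with a least-squares refit of all k centers at once.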
```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
import collections


def kl_means(X, k: int, l: int, policy: str):
    n, d = X.shape
    # Initialize from a plain k-means solution: every point starts with its
    # single nearest center repeated l times, so the centers are scaled by 1/l.
    km = KMeans(k).fit(X)
    centers = km.cluster_centers_ / l
    labels = np.stack([km.labels_] * l).T
    old_loss = 10**10
    t = 0
    while True:
        t += 1
        if policy == 'top':
            # Independently pick the l nearest centers for each point.
            dists = euclidean_distances(X, centers, squared=True)
            new_labels = np.argsort(dists)[:, :l]
        elif policy == 'greedy':
            # Pick centers one at a time, each time matching the residual
            # left by the centers chosen so far.
            ls = []
            res = np.copy(X)
            for _ in range(l):
                dists = euclidean_distances(res, centers, squared=True)
                best = np.argmin(dists, axis=1)
                res -= centers[best]
                ls.append(best)
            new_labels = np.stack(ls).T
        elif policy == 'greedy2':
            # Coordinate descent on the labels: re-optimize each of the l
            # parts in turn while holding the other l-1 parts fixed.
            ls = []
            Y = sum(centers[c] for c in labels.T)
            for i in range(l):
                Y -= centers[labels[:, i]]
                dists = euclidean_distances(X - Y, centers, squared=True)
                nl = np.argmin(dists, axis=1)
                ls.append(nl)
                Y += centers[nl]
            new_labels = np.stack(ls).T
        elif policy == 'rvq':
            # Residual vector quantization: the k centers are split into l
            # sub-codebooks, and each one is re-fit by a fresh k-means run
            # on the residual left by the other parts.
            Y = sum(centers[c] for c in labels.T)
            new_labels = np.copy(labels)
            shift = 0
            for i in range(l):
                # Remove the current part from Y so it becomes just
                # the parts we are not currently editing.
                Y -= centers[labels[:, i]]
                residual = X - Y
                ki = (k + i) // l  # sub-codebook sizes that sum to k
                km = KMeans(ki).fit(residual)
                new_labels[:, i] = shift + km.labels_
                shift += ki
                # Add the current part back in, so Y is again the
                # current approximation to X.
                Y += km.cluster_centers_[km.labels_]
        else:
            assert False, f'unknown policy: {policy}'
        labels = new_labels
        # Center update: solve the least-squares system H @ centers ~ X,
        # where H[i, j] counts how many times point i uses center j.
        H = np.zeros((n, k))
        for col in labels.T:
            H[np.arange(n), col] += 1
        centers, _res, _rank, _s = np.linalg.lstsq(H, X, rcond=None)
        loss = np.linalg.norm(X - sum(centers[c] for c in labels.T))
        if old_loss - loss < 1e-8:
            break
        old_loss = loss
    return labels, centers, loss, t
```
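For illustration, a minimal (hypothetical) call using the definitions above:

```python
# Hypothetical usage: approximate 100 points in R^5 by sums of l=2
# centers drawn from a shared codebook of k=8.
X = np.random.randn(100, 5)
labels, centers, loss, t = kl_means(X, k=8, l=2, policy='top')
print(labels.shape)   # (100, 2): l center indices per point
print(centers.shape)  # (8, 5): the shared codebook
print(f'loss {loss:.3f} after {t} iterations')
```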
```python
# Compare the policies on random Gaussian data, over a range of l values.
n, d = 500, 10
k = 20
maxl = k - 1
reps = 3
data = collections.defaultdict(list)
ls = range(1, maxl + 1)
for _ in range(reps):
    X = np.random.randn(n, d)
    km = KMeans(k).fit(X)
    loss = np.linalg.norm(X - km.cluster_centers_[km.labels_])
    print('KMeans baseline:', loss)
    for l in ls:
        print('L:', l)
        # 'greedy2' is implemented above but not part of this comparison.
        for p in ['top', 'greedy', 'rvq']:
            labels, centers, loss, t = kl_means(X, k, l, p)
            print(f'{p}, {t} its, {loss:.3f}')
            data[p].append(loss)

import matplotlib.pyplot as plt

for label, series in data.items():
    # One row per repetition: shade the min-max band and plot the mean.
    ar = np.array(series).reshape(reps, -1)
    low, hi, mean = ar.min(axis=0), ar.max(axis=0), ar.mean(axis=0)
    plt.fill_between(ls, low, hi, alpha=.1)
    plt.plot(ls, mean, label=label)
plt.legend()
plt.title(f'L-dense {k}-means on ({n},{d}) randn')
plt.xlabel('L')
# Note: the loss is the Frobenius norm of the residual, not a per-entry RMSE.
plt.ylabel('Frobenius reconstruction error')
plt.show()
```
In the RVQ method, I wondered if the full call to `KMeans` in the inner loop was wasteful. Why not just do a single half-KMeans update of the labels, on the residuals? Like this:
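A minimal sketch of that idea, written as a drop-in body for the `'rvq'` inner loop above; it assumes (consistently with how the labels are offset by `shift`) that the i-th sub-codebook is the contiguous slice `centers[shift:shift + ki]`:

```python
# One "half" k-means step: re-assign labels against the existing i-th
# sub-codebook instead of re-fitting it with a full KMeans call.
Y -= centers[labels[:, i]]
residual = X - Y
ki = (k + i) // l
sub = centers[shift:shift + ki]  # the i-th sub-codebook
dists = euclidean_distances(residual, sub, squared=True)
new_labels[:, i] = shift + np.argmin(dists, axis=1)
shift += ki
# Re-add the re-assigned part; the centers themselves are still refit
# globally by the least-squares step after the policy branches.
Y += centers[new_labels[:, i]]
```

The least-squares center update after the policy branches would then be the only place the centers move, so this trades cheaper iterations for (presumably) more of them.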