Skip to content

Instantly share code, notes, and snippets.

@Erotemic
Created October 25, 2016 21:33
Show Gist options
  • Select an option

  • Save Erotemic/b476854955ca3c3ee892e2f6212cf93e to your computer and use it in GitHub Desktop.

Select an option

Save Erotemic/b476854955ca3c3ee892e2f6212cf93e to your computer and use it in GitHub Desktop.
benchmarks
# -*- coding: utf-8 -*-
from __future__ import print_function, division, absolute_import, unicode_literals
import time
import itertools as it
from sklearn.utils.extmath import row_norms
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import pairwise_distances_argmin_min
import numpy as np
class Timer(object):
    """Context-managed stopwatch used by the benchmark helpers.

    Usage::

        with Timer() as t:
            work()
        print(t.ellapsed)

    NOTE: the attribute is (historically) spelled ``ellapsed``; the name is
    kept because callers such as ``time_func`` read it.
    """

    def __init__(self):
        self.tstart = -1    # start time recorded by tic(); -1 until started
        self.ellapsed = -1  # seconds measured on context exit; -1 until then
        # perf_counter is monotonic and high-resolution, which makes it the
        # right clock for benchmarking (time.time can move backwards on
        # clock adjustments and has coarse resolution on some platforms).
        self.default_timer = time.perf_counter

    def tic(self):
        """Record the start time."""
        self.tstart = self.default_timer()

    def toc(self):
        """Return seconds elapsed since the last ``tic``."""
        return self.default_timer() - self.tstart

    def __enter__(self):
        self.tic()
        return self

    def __exit__(self, type_, value, trace):
        self.ellapsed = self.toc()
        if trace is not None:
            # Do not suppress exceptions raised inside the with-block.
            return False
def time_func(func_tup, iters=10):
    """Return the average wall time of ``func(*args)`` over ``iters`` runs.

    ``func_tup`` packs the callable and its positional arguments:
    ``(callable, arg0, arg1, ...)``.
    """
    func, args = func_tup[0], func_tup[1:]
    total = 0.0
    for _ in range(iters):
        with Timer() as timer:
            func(*args)
        total += timer.ellapsed
    return total / iters
def all_dict_combinations(varied_dict):
    """Expand a dict whose values may be lists into the grid of all
    combinations, returning one plain dict per grid point.

    Keys are processed in sorted order, so the output ordering is
    deterministic.

    >>> all_dict_combinations({'a': [1, 2], 'b': 3})
    [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}]
    """
    axes = []
    for key, val_list in sorted(varied_dict.items()):
        if isinstance(val_list, list):
            axes.append([(key, val) for val in val_list])
        else:
            # Scalar value: a single-point axis.
            axes.append([(key, val_list)])
    return [dict(combo) for combo in it.product(*axes)]
def new_labels_inertia_precompute_dense(X, x_squared_norms, centers, batch_size):
    """Benchmark body for the patched (batched) label-assignment path.

    NOTE(review): relies on a patched sklearn branch where
    ``pairwise_distances_argmin_min`` accepts a ``batch_size`` keyword —
    confirm the branch under test provides it.
    """
    n_samples = X.shape[0]  # NOQA
    labels, mindist = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean',
        metric_kwargs=dict(squared=True), batch_size=batch_size)
    labels = labels.astype(np.int32)
    # Dont bother timing the lines that did not change
    # if n_samples == distances.shape[0]:
    # # distances will be changed in-place
    # distances[:] = mindist
    # inertia = mindist.sum()
    # return labels, inertia
def old_labels_inertia_precompute_dense(X, x_squared_norms, centers):
    """Benchmark body for the original (all-at-once) label-assignment path.

    For each sample, finds the nearest center by materializing the full
    dense (k, n_samples) distance matrix — this is the memory cost the
    batched version avoids.

    Args:
        X: (n_samples, n_features) data array.
        x_squared_norms: precomputed squared row norms of X, forwarded to
            ``euclidean_distances`` as its third positional argument.
        centers: (k, n_features) cluster centers.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    # np.inf: the np.infty alias was removed in NumPy 2.0.
    mindist.fill(np.inf)
    for center_id in range(k):
        dist = all_distances[center_id]
        labels[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    # Dont bother timing the lines that did not change
    # if n_samples == distances.shape[0]:
    # # distances will be changed in-place
    # distances[:] = mindist
    # inertia = mindist.sum()
    # return labels, inertia
def make_X(n_clusters=2000, n_features=128, n_samples=10, dtype=np.float32):
    """Build deterministic random benchmark data.

    Returns:
        (X, x_squared_norms, centers) where X is (n_samples, n_features),
        centers is (n_clusters, n_features), and x_squared_norms holds the
        squared row norms of X.
    """
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features).astype(dtype)
    # BUG FIX: this value feeds euclidean_distances' Y_norm_squared argument,
    # which expects *squared* norms; row_norms defaults to squared=False.
    x_squared_norms = row_norms(X, squared=True)
    centers = rng.rand(n_clusters, n_features).astype(dtype)
    return X, x_squared_norms, centers
def single_benchmark_part(n_clusters, n_features, n_samples, batch_size=500,
                          dtype=np.float32, niters=10):
    """Time the old vs new dense label-assignment paths on one problem size.

    Returns:
        dict with the distance-matrix memory footprint of each path
        ('MB_old'/'MB_new', in MiB) and the average runtime over ``niters``
        runs ('old_speed'/'new_speed', in seconds).
    """
    X, x_squared_norms, centers = make_X(n_clusters, n_features, n_samples,
                                         dtype)
    itemsize = dtype(0).nbytes
    n_centers = centers.shape[0]
    # The old path allocates the full (n_samples, k) matrix; the new path
    # only ever holds one batch of rows at a time.
    old_cells = X.shape[0] * n_centers
    new_cells = min(batch_size, X.shape[0]) * n_centers
    measures = {
        'MB_old': (old_cells * itemsize) / 2 ** 20,
        'MB_new': (new_cells * itemsize) / 2 ** 20,
    }
    measures['old_speed'] = time_func(
        (old_labels_inertia_precompute_dense,
         X, x_squared_norms, centers), niters)
    measures['new_speed'] = time_func(
        (new_labels_inertia_precompute_dense,
         X, x_squared_norms, centers, batch_size), niters)
    return measures
def single_benchmark_full_minibatch(n_clusters, n_features, n_samples,
                                    batch_size=500, dtype=np.float32,
                                    niters=10):
    """Time a full ``KMeans.fit`` on the patched branch vs master.

    ``k_means_master`` is a copy of master's module created by the shell
    recipe in ``main2``'s docstring. ``batch_size`` is accepted for grid
    compatibility but unused by this benchmark.
    """
    # Swap these for the MiniBatchKMeans variants to benchmark that path:
    # from sklearn.cluster.k_means_ import MiniBatchKMeans as MiniBatchKMeansNew
    # from sklearn.cluster.k_means_master import MiniBatchKMeans as MiniBatchKMeansOld
    from sklearn.cluster.k_means_ import KMeans as KMeansNew
    from sklearn.cluster.k_means_master import KMeans as KMeansOld
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features).astype(dtype)
    params = {'n_clusters': n_clusters}
    # params = {'n_clusters': n_clusters, 'verbose': 5}
    measures = {}
    measures['old_speed'] = time_func((lambda: KMeansOld(**params).fit(X),), niters)
    measures['new_speed'] = time_func((lambda: KMeansNew(**params).fit(X),), niters)
    return measures
def run_benchmark_grid(basis, name):
    """Run ``single_benchmark_full_minibatch`` over the parameter grid
    defined by ``basis`` and print a pandas table sorted by absolute speedup.

    Args:
        basis: dict mapping kwarg name -> value or list of values; the grid
            is the cartesian product (see ``all_dict_combinations``).
        name: label used in the printed headers.
    """
    print('Running %s benchmark' % (name,))
    import pandas as pd
    pd.options.display.max_rows = 1000
    pd.options.display.width = 1000
    try:
        # utool's progress iterator, when available; otherwise a no-op.
        import utool as ut
        ProgIter = ut.ProgIter
    except ImportError:
        def ProgIter(iterable):
            return iterable
    vals = []
    for kw in ProgIter(all_dict_combinations(basis)):
        # Keep the try body minimal: only the benchmark call can raise the
        # ValueError we want to skip (e.g. infeasible parameter combos).
        try:
            measures = single_benchmark_full_minibatch(**kw)
        except ValueError:
            print('skipped bad test')
            continue
        kw.update(measures)
        vals.append(kw)
    print('====')
    print('Results for %s benchmark' % (name,))
    df = pd.DataFrame.from_dict(vals)
    df['percent_change'] = 100 * (df['old_speed'] - df['new_speed']) / df['old_speed']
    # Put the measurement columns last; drop any not produced by this
    # benchmark (e.g. MB_old/MB_new only exist for the per-part benchmark).
    new_keys = ['MB_new', 'MB_old', 'new_speed', 'old_speed', 'percent_change']
    new_keys = [k for k in new_keys if k in df.columns]
    old_keys = sorted(set(df.columns) - set(new_keys))
    # df.reindex(columns=...): DataFrame.reindex_axis was deprecated in
    # pandas 0.21 and removed in 1.0.
    df = df.reindex(columns=old_keys + new_keys)
    df['absolute_change'] = (df['old_speed'] - df['new_speed'])
    print(df.sort_values('absolute_change', ascending=False))
def main2():
    """Benchmark the full KMeans.fit path (patched branch vs master).

    Setup (creates the master copy imported by the benchmark):

    git checkout master
    cp sklearn/cluster/k_means_.py sklearn/cluster/k_means_master.py
    git checkout km_batch_labels_inertia
    """
    # Grids are listed largest-first so the slow cases run first.
    basis = {
        'n_clusters': [100, 10, 5, 2],
        'n_features': [32],
        'n_samples': [1000, 100, 10],
        'niters': [10],
    }
    import utool as ut
    with ut.Timer():
        run_benchmark_grid(basis, 'full minibatch test')
def main():
    """Benchmark the label-assignment paths over three parameter grids:
    small clusters, large clusters, and varying batch sizes.
    """
    # Grids are listed largest-first so the slow cases run first.
    small_cluster_basis = {
        'n_clusters': [10, 5, 2],
        'n_features': [128, 32, 16],
        'n_samples': [50000, 1000, 100, 20, 10],
        'niters': [100],
    }
    run_benchmark_grid(small_cluster_basis, 'small clusters')

    large_basis = {
        'n_clusters': [1000, 100, 10],
        'n_features': [128, 32, 16],
        'n_samples': [50000, 10000, 1000, 100, 10],
        'niters': [5],
    }
    run_benchmark_grid(large_basis, 'large clusters test')

    batch_size_basis = {
        'n_clusters': [1000],
        'n_features': [32],
        'n_samples': [10000, 1000, 100, 10],
        'batch_size': [100, 500, 1000],
        'niters': [5],
    }
    run_benchmark_grid(batch_size_basis, 'batch_size test')
if __name__ == '__main__':
    # main()  # per-part label-assignment benchmarks
    main2()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment