Created
October 25, 2016 21:33
-
-
Save Erotemic/b476854955ca3c3ee892e2f6212cf93e to your computer and use it in GitHub Desktop.
benchmarks
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| from __future__ import print_function, division, absolute_import, unicode_literals | |
| import time | |
| import itertools as it | |
| from sklearn.utils.extmath import row_norms | |
| from sklearn.metrics.pairwise import euclidean_distances | |
| from sklearn.metrics.pairwise import pairwise_distances_argmin_min | |
| import numpy as np | |
class Timer(object):
    """Context-manager stopwatch.

    Use ``with Timer() as t: ...``; the elapsed wall-clock seconds are
    stored in ``t.ellapsed`` on exit.  ``tic``/``toc`` may also be used
    manually.
    """
    def __init__(self):
        self.tstart = -1
        self.ellapsed = -1
        self.default_timer = time.time

    def tic(self):
        """Mark the current time as the start point."""
        self.tstart = self.default_timer()

    def toc(self):
        """Return seconds elapsed since the last tic()."""
        return self.default_timer() - self.tstart

    def __enter__(self):
        self.tic()
        return self

    def __exit__(self, type_, value, trace):
        self.ellapsed = self.toc()
        if trace is not None:
            # let any exception raised inside the with-block propagate
            return False
def time_func(func_tup, iters=10):
    """Average wall-clock seconds over *iters* calls.

    ``func_tup`` is ``(callable, arg1, arg2, ...)``; each iteration calls
    ``callable(arg1, arg2, ...)`` inside a Timer and the mean elapsed
    time is returned.
    """
    func, args = func_tup[0], func_tup[1:]
    total = 0.0
    for _ in range(iters):
        with Timer() as timer:
            func(*args)
        total += timer.ellapsed
    return total / iters
def all_dict_combinations(varied_dict):
    """Expand a dict whose values may be lists into all combinations.

    Each key maps either to a single value or to a list of candidate
    values.  Returns a list of plain dicts, one per element of the
    cartesian product, with keys processed in sorted order.
    """
    axes = []
    for key, val_list in sorted(varied_dict.items()):
        if isinstance(val_list, list):
            axes.append([(key, val) for val in val_list])
        else:
            axes.append([(key, val_list)])
    return [dict(combo) for combo in it.product(*axes)]
def new_labels_inertia_precompute_dense(X, x_squared_norms, centers, batch_size):
    """Nearest-center assignment via the batched sklearn code path.

    This is the 'new' implementation under benchmark: it uses
    ``pairwise_distances_argmin_min`` with a ``batch_size`` so only a
    chunk of the distance matrix is materialized at a time.  Timing-only:
    the inertia bookkeeping that is identical in both versions is left
    commented out, and nothing is returned.
    """
    n_samples = X.shape[0]  # NOQA
    labels, mindist = pairwise_distances_argmin_min(
        X=X, Y=centers, metric='euclidean',
        metric_kwargs=dict(squared=True), batch_size=batch_size)
    labels = labels.astype(np.int32)
    # Dont bother timing the lines that did not change
    # if n_samples == distances.shape[0]:
    #     # distances will be changed in-place
    #     distances[:] = mindist
    # inertia = mindist.sum()
    # return labels, inertia
def old_labels_inertia_precompute_dense(X, x_squared_norms, centers):
    """Nearest-center assignment via the full dense distance matrix.

    This is the 'old' implementation under benchmark: the complete
    (k, n_samples) squared-distance matrix is computed up front, then a
    per-center pass finds each sample's nearest center.  Timing-only:
    the inertia bookkeeping that is identical in both versions is left
    commented out, and nothing is returned.
    """
    n_samples = X.shape[0]
    k = centers.shape[0]
    all_distances = euclidean_distances(centers, X, x_squared_norms,
                                        squared=True)
    labels = np.empty(n_samples, dtype=np.int32)
    labels.fill(-1)
    mindist = np.empty(n_samples)
    # BUG FIX: np.infty was deprecated and removed in NumPy 2.0; np.inf
    # is the supported spelling (identical value on all versions).
    mindist.fill(np.inf)
    for center_id in range(k):
        dist = all_distances[center_id]
        labels[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    # Dont bother timing the lines that did not change
    # if n_samples == distances.shape[0]:
    #     # distances will be changed in-place
    #     distances[:] = mindist
    # inertia = mindist.sum()
    # return labels, inertia
def make_X(n_clusters=2000, n_features=128, n_samples=10, dtype=np.float32):
    """Create deterministic random benchmark data.

    Returns ``(X, x_squared_norms, centers)`` where X is
    (n_samples, n_features), centers is (n_clusters, n_features), and
    x_squared_norms holds the squared row norms of X.
    """
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features).astype(dtype)
    # BUG FIX: the variable holds *squared* norms and is passed as
    # Y_norm_squared to euclidean_distances; row_norms defaults to
    # squared=False, which returned plain (non-squared) norms.
    x_squared_norms = row_norms(X, squared=True)
    centers = rng.rand(n_clusters, n_features).astype(dtype)
    return X, x_squared_norms, centers
def single_benchmark_part(n_clusters, n_features, n_samples, batch_size=500,
                          dtype=np.float32, niters=10):
    """Time old vs new dense label assignment for one parameter setting.

    Returns a dict with the estimated distance-matrix footprint of each
    implementation (``MB_old`` / ``MB_new``, in mebibytes) and the
    average runtime of each (``old_speed`` / ``new_speed``).
    """
    X, x_squared_norms, centers = make_X(n_clusters, n_features, n_samples, dtype)
    bytes_per_item = dtype(0).nbytes
    n_centers = centers.shape[0]
    # Old path materializes the full (n_samples, k) matrix; the new path
    # only ever holds the largest batch at once.
    cells_old = X.shape[0] * n_centers
    cells_new = min(batch_size, X.shape[0]) * n_centers
    measures = {
        'MB_old': (cells_old * bytes_per_item) / 2 ** 20,
        'MB_new': (cells_new * bytes_per_item) / 2 ** 20,
    }
    measures['old_speed'] = time_func(
        (old_labels_inertia_precompute_dense, X, x_squared_norms, centers),
        niters)
    measures['new_speed'] = time_func(
        (new_labels_inertia_precompute_dense, X, x_squared_norms, centers,
         batch_size), niters)
    return measures
def single_benchmark_full_minibatch(n_clusters, n_features, n_samples,
                                    batch_size=500, dtype=np.float32,
                                    niters=10):
    """Time end-to-end ``KMeans.fit`` on random data: branch vs master.

    Requires a non-standard setup where the master copy of the estimator
    module was saved as ``sklearn.cluster.k_means_master`` (see the git
    commands in ``main2``'s docstring).  ``batch_size`` is accepted so
    the same parameter grids work for both benchmark entry points, but
    it is unused here.

    Returns a dict with 'old_speed' / 'new_speed' average fit times.
    """
    rng = np.random.RandomState(42)
    X = rng.rand(n_samples, n_features).astype(dtype)
    measures = {}
    # from sklearn.cluster.k_means_ import MiniBatchKMeans as MiniBatchKMeansNew
    # from sklearn.cluster.k_means_master import MiniBatchKMeans as MiniBatchKMeansOld
    # NOTE(review): sklearn.cluster.k_means_ is a pre-1.0 private module
    # path (renamed to sklearn.cluster._kmeans) — this only runs against
    # an old, locally patched scikit-learn checkout; confirm before reuse.
    from sklearn.cluster.k_means_ import KMeans as KMeansNew
    from sklearn.cluster.k_means_master import KMeans as KMeansOld
    params = {'n_clusters': n_clusters}
    # params = {'n_clusters': n_clusters, 'verbose': 5}
    # measures['old_speed'] = time_func((lambda: MiniBatchKMeansOld(**params).fit(X),), niters)
    # measures['new_speed'] = time_func((lambda: MiniBatchKMeansNew(**params).fit(X),), niters)
    # time_func expects a (callable, *args) tuple; fit is wrapped in a
    # zero-argument lambda so the tuple carries no extra args.
    measures['old_speed'] = time_func((lambda: KMeansOld(**params).fit(X),), niters)
    measures['new_speed'] = time_func((lambda: KMeansNew(**params).fit(X),), niters)
    return measures
def run_benchmark_grid(basis, name):
    """Run the minibatch benchmark over every combination in *basis*.

    Parameters
    ----------
    basis : dict
        Maps parameter name to a value or a list of candidate values;
        expanded with ``all_dict_combinations``.
    name : str
        Label used in the printed report.

    Prints a pandas table of old-vs-new timings sorted by absolute
    speedup; returns nothing.
    """
    print('Running %s benchmark' % (name,))
    import pandas as pd
    pd.options.display.max_rows = 1000
    pd.options.display.width = 1000
    vals = []
    try:
        import utool as ut
        ProgIter = ut.ProgIter
    except ImportError:
        # fall back to a no-op progress wrapper when utool is unavailable
        def ProgIter(x):
            return x
    for kw in ProgIter(all_dict_combinations(basis)):
        try:
            measures = single_benchmark_full_minibatch(**kw)
            kw.update(measures)
            vals.append(kw)
        except ValueError:
            # e.g. an invalid parameter combination; best-effort skip
            print('skipped bad test')
    print('====')
    print('Results for %s benchmark' % (name,))
    df = pd.DataFrame.from_dict(vals)
    df['percent_change'] = 100 * (df['old_speed'] - df['new_speed']) / df['old_speed']
    new_keys = ['MB_new', 'MB_old', 'new_speed', 'old_speed', 'percent_change']
    unused_keys = set(new_keys).difference(set(df.keys()))
    new_keys = [k for k in new_keys if k not in unused_keys]
    old_keys = sorted(set(df.columns) - set(new_keys))
    # BUG FIX: DataFrame.reindex_axis was deprecated in pandas 0.21 and
    # removed in 0.25; reindex(columns=...) is the supported equivalent.
    df = df.reindex(columns=old_keys + new_keys)
    df['absolute_change'] = (df['old_speed'] - df['new_speed'])
    print(df.sort_values('absolute_change', ascending=False))
def main2():
    """Benchmark full KMeans fits on a small parameter grid.

    Setup for the patched scikit-learn checkout::

        git checkout master
        cp sklearn/cluster/k_means_.py sklearn/cluster/k_means_master.py
        git checkout km_batch_labels_inertia
    """
    # Largest settings first so the slowest runs surface early.
    basis = {
        'n_clusters': [100, 10, 5, 2],
        'n_features': [32],
        'n_samples': [1000, 100, 10],
        'niters': [10],
    }
    import utool as ut
    with ut.Timer():
        run_benchmark_grid(basis, 'full minibatch test')
def main():
    """Run the three label-assignment benchmark grids.

    Covers small clusters, large clusters, and a batch-size sweep; each
    grid lists its largest settings first so slow runs surface early.
    """
    small_cluster_basis = {
        'n_clusters': [10, 5, 2],
        'n_features': [128, 32, 16],
        'n_samples': [50000, 1000, 100, 20, 10],
        'niters': [100],
    }
    run_benchmark_grid(small_cluster_basis, 'small clusters')

    large_basis = {
        'n_clusters': [1000, 100, 10],
        'n_features': [128, 32, 16],
        'n_samples': [50000, 10000, 1000, 100, 10],
        'niters': [5],
    }
    run_benchmark_grid(large_basis, 'large clusters test')

    batch_size_basis = {
        'n_clusters': [1000],
        'n_features': [32],
        'n_samples': [10000, 1000, 100, 10],
        'batch_size': [100, 500, 1000],
        'niters': [5],
    }
    run_benchmark_grid(batch_size_basis, 'batch_size test')
if __name__ == '__main__':
    # main() runs the low-level label-assignment grids;
    # main2() runs the end-to-end KMeans fit benchmark.
    # main()
    main2()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment