@betatim
Last active October 26, 2022 14:30
import time

import numpy as np
import cupy as cp
import sklearn
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

import cuml

# XXX Attempt to speed up first run (the import is otherwise unused)
from sklearn_gpu.kmeans import KMeansEngine


n_features = 16  # * 2
n_centers = 12

# Large synthetic clustering problem: 3M samples, 16 features, 12 well-separated blobs.
X, y_true = make_blobs(
    n_samples=3_000_000,
    centers=n_centers,
    n_features=n_features,
    cluster_std=0.60,
    random_state=10,
)
# Hold out a small set (3000 samples) that is only used for predict().
X, X_, y_true, y_true_ = train_test_split(X, y_true, random_state=42, test_size=3000)

# X, y_true = make_blobs(
#     n_samples=300_000, centers=8, n_features=n_features, cluster_std=1.20, random_state=0
# )

kmeans_args = dict(n_clusters=n_centers, random_state=42, n_init=1)


# @profile
def default():
    # Default (CPU) implementation
    km = KMeans(**kmeans_args)
    tic = time.perf_counter_ns()
    for _ in range(1):
        km.fit(X)
    toc = time.perf_counter_ns()
    y_pred_ = km.predict(X_)
    print("CPU run took:", (toc - tic) / 1_000_000)  # ns -> ms, 1 fit


# @profile
def cuml_():
    # cuML's own KMeans implementation
    km3 = cuml.KMeans(**kmeans_args)
    tic = time.perf_counter_ns()
    for _ in range(10):
        km3.fit(X)
    toc = time.perf_counter_ns()
    y_pred3_ = km3.predict(X_)
    print("CUML run took:", (toc - tic) / 1_000_000)  # ns -> ms, 10 fits


# @profile
def sklearn_gpu():
    # Using the accelerated version via the "sklearn_gpu" engine provider
    with sklearn.config_context(engine_provider="sklearn_gpu"):
        km2 = KMeans(**kmeans_args)
        tic = time.perf_counter_ns()
        for _ in range(10):
            km2.fit(X)
        toc = time.perf_counter_ns()
        y_pred2_ = km2.predict(X_)
        # XXX the first run is always way, way slow. Maybe because things need to be imported?
        # XXX mitigated (a bit) by running the cuml KMeans first
        print("GPU run took:", (toc - tic) / 1_000_000)  # ns -> ms, 10 fits


if __name__ == "__main__":
    default()
    # cuml_()
    # cuml_()
    # sklearn_gpu()
    # sklearn_gpu()
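
The XXX comments in sklearn_gpu() note that the first accelerated run is always much slower, and the commented-out duplicate calls in __main__ suggest running each benchmark twice as a workaround. A minimal warm-up sketch along those lines (not part of the original gist; the slice sizes are arbitrary, and it reuses the X, X_, kmeans_args and "sklearn_gpu" engine provider defined above):

def warm_up_gpu_engine():
    # Hypothetical helper: do one throwaway fit/predict on a small slice of the
    # data so that imports, compilation and GPU memory allocation happen before
    # the timed loop in sklearn_gpu().
    with sklearn.config_context(engine_provider="sklearn_gpu"):
        km = KMeans(**kmeans_args)
        km.fit(X[:10_000])
        km.predict(X_[:1_000])

Calling warm_up_gpu_engine() once before sklearn_gpu() keeps the one-off setup cost out of the timed loop, which is what the duplicated sklearn_gpu() call in __main__ appears to be aiming for.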
Timer unit: 1e-06 s
Total time: 0.59432 s
File: /home/nfs/thead/git/scikit-learn/sklearn/cluster/_kmeans.py
Function: fit at line 1525
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1525 @profile
1526 def fit(self, X, y=None, sample_weight=None):
1527 """Compute k-means clustering.
1528
1529 Parameters
1530 ----------
1531 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1532 Training instances to cluster. It must be noted that the data
1533 will be converted to C ordering, which will cause a memory
1534 copy if the given data is not C-contiguous.
1535 If a sparse matrix is passed, a copy will be made if it's not in
1536 CSR format.
1537
1538 y : Ignored
1539 Not used, present here for API consistency by convention.
1540
1541 sample_weight : array-like of shape (n_samples,), default=None
1542 The weights for each observation in X. If None, all observations
1543 are assigned equal weight.
1544
1545 .. versionadded:: 0.20
1546
1547 Returns
1548 -------
1549 self : object
1550 Fitted estimator.
1551 """
1552 1 931.0 931.0 0.2 self._validate_params()
1553 1 31.0 31.0 0.0 engine = self._get_engine()
1554
1555 2 72906.0 36453.0 12.3 X, y, sample_weight = engine.prepare_fit(
1556 1 0.0 0.0 0.0 X,
1557 1 0.0 0.0 0.0 y=y,
1558 1 1.0 1.0 0.0 sample_weight=sample_weight,
1559 )
1560 1 27.0 27.0 0.0 self._check_params_vs_input(X)
1561
1562 1 1.0 1.0 0.0 best_inertia, best_labels = None, None
1563
1564 2 3.0 1.5 0.0 for i in range(self._n_init):
1565 # Initialize centers
1566 1 142986.0 142986.0 24.1 centers_init = engine.init_centroids(X)
1567 1 6.0 6.0 0.0 if self.verbose:
1568 print("Initialization complete")
1569
1570 # run a k-means once
1571 2 372880.0 186440.0 62.7 labels, inertia, centers, n_iter_ = engine.kmeans_single(
1572 1 1.0 1.0 0.0 X,
1573 1 2.0 2.0 0.0 sample_weight,
1574 1 2.0 2.0 0.0 centers_init,
1575 )
1576
1577 # determine if these results are the best so far
1578 # we chose a new run if it has a better inertia and the clustering is
1579 # different from the best so far (it's possible that the inertia is
1580 # slightly better even if the clustering is the same with potentially
1581 # permuted labels, due to rounding errors)
1582 1 3.0 3.0 0.0 if best_inertia is None or (
1583 inertia < best_inertia
1584 and not engine.is_same_clustering(labels, best_labels, self.n_clusters)
1585 ):
1586 1 1.0 1.0 0.0 best_labels = labels
1587 1 1.0 1.0 0.0 best_centers = centers
1588 1 1.0 1.0 0.0 best_inertia = inertia
1589 1 1.0 1.0 0.0 best_n_iter = n_iter_
1590
1591 1 32.0 32.0 0.0 engine.unshift_centers(X, best_centers)
1592
1593 1 4473.0 4473.0 0.8 distinct_clusters = np.unique(best_labels).shape[0]
1594 1 2.0 2.0 0.0 if distinct_clusters < self.n_clusters:
1595 warnings.warn(
1596 "Number of distinct clusters ({}) found smaller than "
1597 "n_clusters ({}). Possibly due to duplicate points "
1598 "in X.".format(distinct_clusters, self.n_clusters),
1599 ConvergenceWarning,
1600 stacklevel=2,
1601 )
1602
1603 1 12.0 12.0 0.0 self.cluster_centers_ = best_centers
1604 1 4.0 4.0 0.0 self._n_features_out = self.cluster_centers_.shape[0]
1605 1 4.0 4.0 0.0 self.labels_ = best_labels
1606 1 2.0 2.0 0.0 self.inertia_ = best_inertia
1607 1 6.0 6.0 0.0 self.n_iter_ = best_n_iter
1608 1 1.0 1.0 0.0 return self
Total time: 0.074355 s
File: /home/nfs/thead/git/scikit-learn/sklearn/cluster/_kmeans.py
Function: predict at line 1610
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1610 @profile
1611 def predict(self, X, sample_weight=None):
1612 """Predict the closest cluster each sample in X belongs to.
1613
1614 In the vector quantization literature, `cluster_centers_` is called
1615 the code book and each value returned by `predict` is the index of
1616 the closest code in the code book.
1617
1618 Parameters
1619 ----------
1620 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1621 New data to predict.
1622
1623 sample_weight : array-like of shape (n_samples,), default=None
1624 The weights for each observation in X. If None, all observations
1625 are assigned equal weight.
1626
1627 Returns
1628 -------
1629 labels : ndarray of shape (n_samples,)
1630 Index of the cluster each sample belongs to.
1631 """
1632 1 78.0 78.0 0.1 check_is_fitted(self)
1633 1 42.0 42.0 0.1 engine = self._get_engine()
1634 1 627.0 627.0 0.8 X, sample_weight = engine.prepare_prediction(X, sample_weight)
1635 1 73608.0 73608.0 99.0 return engine.get_labels(X, sample_weight)
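
The profiled fit() and predict() above dispatch all of the heavy lifting to an engine object returned by self._get_engine(). A minimal sketch of the interface those two methods exercise, inferred only from the calls visible in the traces (parameter names and return annotations beyond what is shown there are assumptions):

from typing import Any, Protocol


class KMeansEngineProtocol(Protocol):
    # Sketch only: method names are taken from the profiled fit()/predict();
    # signatures are guesses based on how each method is called there.
    def prepare_fit(self, X, y=None, sample_weight=None) -> tuple: ...
    def init_centroids(self, X) -> Any: ...
    def kmeans_single(self, X, sample_weight, centers_init) -> tuple: ...
    def is_same_clustering(self, labels, best_labels, n_clusters) -> bool: ...
    def unshift_centers(self, X, best_centers) -> None: ...
    def prepare_prediction(self, X, sample_weight) -> tuple: ...
    def get_labels(self, X, sample_weight): ...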
Timer unit: 1e-06 s
Total time: 2.99799 s
File: /home/nfs/thead/git/scikit-learn/sklearn/cluster/_kmeans.py
Function: fit at line 1525
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1525 @profile
1526 def fit(self, X, y=None, sample_weight=None):
1527 """Compute k-means clustering.
1528
1529 Parameters
1530 ----------
1531 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1532 Training instances to cluster. It must be noted that the data
1533 will be converted to C ordering, which will cause a memory
1534 copy if the given data is not C-contiguous.
1535 If a sparse matrix is passed, a copy will be made if it's not in
1536 CSR format.
1537
1538 y : Ignored
1539 Not used, present here for API consistency by convention.
1540
1541 sample_weight : array-like of shape (n_samples,), default=None
1542 The weights for each observation in X. If None, all observations
1543 are assigned equal weight.
1544
1545 .. versionadded:: 0.20
1546
1547 Returns
1548 -------
1549 self : object
1550 Fitted estimator.
1551 """
1552 1 1042.0 1042.0 0.0 self._validate_params()
1553 1 37.0 37.0 0.0 engine = self._get_engine()
1554
1555 2 772429.0 386214.5 25.8 X, y, sample_weight = engine.prepare_fit(
1556 1 1.0 1.0 0.0 X,
1557 1 0.0 0.0 0.0 y=y,
1558 1 0.0 0.0 0.0 sample_weight=sample_weight,
1559 )
1560 1 23.0 23.0 0.0 self._check_params_vs_input(X)
1561
1562 1 1.0 1.0 0.0 best_inertia, best_labels = None, None
1563
1564 2 4.0 2.0 0.0 for i in range(self._n_init):
1565 # Initialize centers
1566 1 1735836.0 1735836.0 57.9 centers_init = engine.init_centroids(X)
1567 1 4.0 4.0 0.0 if self.verbose:
1568 print("Initialization complete")
1569
1570 # run a k-means once
1571 2 434253.0 217126.5 14.5 labels, inertia, centers, n_iter_ = engine.kmeans_single(
1572 1 0.0 0.0 0.0 X,
1573 1 1.0 1.0 0.0 sample_weight,
1574 1 1.0 1.0 0.0 centers_init,
1575 )
1576
1577 # determine if these results are the best so far
1578 # we chose a new run if it has a better inertia and the clustering is
1579 # different from the best so far (it's possible that the inertia is
1580 # slightly better even if the clustering is the same with potentially
1581 # permuted labels, due to rounding errors)
1582 1 4.0 4.0 0.0 if best_inertia is None or (
1583 inertia < best_inertia
1584 and not engine.is_same_clustering(labels, best_labels, self.n_clusters)
1585 ):
1586 1 1.0 1.0 0.0 best_labels = labels
1587 1 1.0 1.0 0.0 best_centers = centers
1588 1 1.0 1.0 0.0 best_inertia = inertia
1589 1 1.0 1.0 0.0 best_n_iter = n_iter_
1590
1591 1 47.0 47.0 0.0 engine.unshift_centers(X, best_centers)
1592
1593 1 54274.0 54274.0 1.8 distinct_clusters = np.unique(best_labels).shape[0]
1594 1 2.0 2.0 0.0 if distinct_clusters < self.n_clusters:
1595 warnings.warn(
1596 "Number of distinct clusters ({}) found smaller than "
1597 "n_clusters ({}). Possibly due to duplicate points "
1598 "in X.".format(distinct_clusters, self.n_clusters),
1599 ConvergenceWarning,
1600 stacklevel=2,
1601 )
1602
1603 1 14.0 14.0 0.0 self.cluster_centers_ = best_centers
1604 1 8.0 8.0 0.0 self._n_features_out = self.cluster_centers_.shape[0]
1605 1 3.0 3.0 0.0 self.labels_ = best_labels
1606 1 2.0 2.0 0.0 self.inertia_ = best_inertia
1607 1 3.0 3.0 0.0 self.n_iter_ = best_n_iter
1608 1 0.0 0.0 0.0 return self
Total time: 0.065059 s
File: /home/nfs/thead/git/scikit-learn/sklearn/cluster/_kmeans.py
Function: predict at line 1610
Line # Hits Time Per Hit % Time Line Contents
==============================================================
1610 @profile
1611 def predict(self, X, sample_weight=None):
1612 """Predict the closest cluster each sample in X belongs to.
1613
1614 In the vector quantization literature, `cluster_centers_` is called
1615 the code book and each value returned by `predict` is the index of
1616 the closest code in the code book.
1617
1618 Parameters
1619 ----------
1620 X : {array-like, sparse matrix} of shape (n_samples, n_features)
1621 New data to predict.
1622
1623 sample_weight : array-like of shape (n_samples,), default=None
1624 The weights for each observation in X. If None, all observations
1625 are assigned equal weight.
1626
1627 Returns
1628 -------
1629 labels : ndarray of shape (n_samples,)
1630 Index of the cluster each sample belongs to.
1631 """
1632 1 73.0 73.0 0.1 check_is_fitted(self)
1633 1 33.0 33.0 0.1 engine = self._get_engine()
1634 1 490.0 490.0 0.8 X, sample_weight = engine.prepare_prediction(X, sample_weight)
1635 1 64463.0 64463.0 99.1 return engine.get_labels(X, sample_weight)
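
Comparing the two profiled fit() traces (the gist does not label which engine produced which trace), the second run spends far more of its time before the Lloyd iterations: prepare_fit grows from roughly 73 ms to 772 ms and init_centroids from roughly 143 ms to 1736 ms, while kmeans_single only grows from roughly 373 ms to 434 ms. A small back-of-the-envelope check of those ratios, using the per-stage timings (in microseconds) copied from the traces above:

run1 = {"prepare_fit": 72_906, "init_centroids": 142_986, "kmeans_single": 372_880}
run2 = {"prepare_fit": 772_429, "init_centroids": 1_735_836, "kmeans_single": 434_253}

for stage in run1:
    ratio = run2[stage] / run1[stage]
    print(f"{stage}: {run1[stage] / 1e3:.0f} ms -> {run2[stage] / 1e3:.0f} ms ({ratio:.1f}x)")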