Skip to content

Instantly share code, notes, and snippets.

@kemaleren
Last active December 20, 2015 03:29
Show Gist options
  • Save kemaleren/6063691 to your computer and use it in GitHub Desktop.
Save kemaleren/6063691 to your computer and use it in GitHub Desktop.
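Line profiler dumps for SpectralBiclustering and SpectralCoclustering on both dense and sparse matrices. The Time and Per Hit columns are in microseconds (each table's Time column sums to its Total time).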
from sklearn.datasets import fetch_20newsgroups_vectorized
from sklearn.datasets import make_checkerboard
from sklearn.datasets import make_biclusters
from sklearn.cluster.bicluster import SpectralBiclustering
from sklearn.cluster.bicluster import SpectralCoclustering
# IPython session that produced the line-profiler dumps listed after it.
# Each `lprun` line corresponds, in order, to one dump.
#
# Load the vectorized 20 newsgroups dataset once; newsgroups.data is the
# matrix used for the "sparse test" runs.
newsgroups = fetch_20newsgroups_vectorized()
# Spectral Co-Clustering
# Dense test input: a 40000 x 300 matrix from make_biclusters. The returned
# rows/columns values are not used anywhere in this session.
a, rows, columns = make_biclusters((40000, 300), 10, noise=2)
model = SpectralCoclustering(10, mini_batch=False)
# `lprun` is the line-profiler magic, typed without a leading `%`
# (presumably relying on IPython automagic -- confirm before running this
# as a script). `-f model._fit` selects the function to profile line by
# line; the trailing statement is what gets executed under the profiler.
lprun -f model._fit model.fit(a)
# Sparse test, mini_batch=False.
model = SpectralCoclustering(20, mini_batch=False)
lprun -f model._fit model.fit(newsgroups.data)
# Sparse test, mini_batch=True: same data and first argument as the previous
# run, so the two dumps differ only in the mini_batch setting.
model = SpectralCoclustering(20, mini_batch=True)
lprun -f model._fit model.fit(newsgroups.data)
# Spectral Biclustering
# Dense test input: a 40000 x 300 checkerboard matrix. This rebinds `a`,
# `rows` and `columns` from the co-clustering section above.
a, rows, columns = make_checkerboard((40000, 300), 10, noise=2)
# NOTE(review): mini_batch is not passed here, unlike every other model in
# this session, so this run uses whatever the estimator's default is.
model = SpectralBiclustering(10)
lprun -f model._fit model.fit(a)
# Sparse test, mini_batch=False.
model = SpectralBiclustering(20, mini_batch=False)
lprun -f model._fit model.fit(newsgroups.data)
# Sparse test, mini_batch=True.
model = SpectralBiclustering(20, mini_batch=True)
lprun -f model._fit model.fit(newsgroups.data)
* SpectralCoclustering dense test: 40000 x 300 matrix, with mini_batch=False
File: sklearn/cluster/bicluster/spectral.py
Function: _fit at line 271
Total time: 2.62557 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
271 def _fit(self, X):
272 1 124484 124484.0 4.7 normalized_data, row_diag, col_diag = _scale_normalize(X)
273 1 48 48.0 0.0 n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
274 1 1737672 1737672.0 66.2 u, v = self._svd(normalized_data, n_sv, n_discard=1)
275 1 830 830.0 0.0 z = np.vstack((row_diag[:, np.newaxis] * u,
276 1 243 243.0 0.0 col_diag[:, np.newaxis] * v))
277
278 1 761520 761520.0 29.0 _, labels = self._k_means(z, self.n_clusters)
279
280 1 4 4.0 0.0 n_rows = X.shape[0]
281 1 4 4.0 0.0 self.row_labels_ = labels[:n_rows]
282 1 2 2.0 0.0 self.column_labels_ = labels[n_rows:]
283
284 1 3 3.0 0.0 self.rows_ = np.vstack(self.row_labels_ == c
285 1 615 615.0 0.0 for c in range(self.n_clusters))
286 1 2 2.0 0.0 self.columns_ = np.vstack(self.column_labels_ == c
287 1 143 143.0 0.0 for c in range(self.n_clusters))
* SpectralCoclustering sparse test: 20newsgroups_vectorized, with mini_batch=False
File: sklearn/cluster/bicluster/spectral.py
Function: _fit at line 271
Total time: 32.9558 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
271 def _fit(self, X):
272 1 138058 138058.0 0.4 normalized_data, row_diag, col_diag = _scale_normalize(X)
273 1 17 17.0 0.0 n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
274 1 380753 380753.0 1.2 u, v = self._svd(normalized_data, n_sv, n_discard=1)
275 1 259 259.0 0.0 z = np.vstack((row_diag[:, np.newaxis] * u,
276 1 3901 3901.0 0.0 col_diag[:, np.newaxis] * v))
277
278 1 32428991 32428991.0 98.4 _, labels = self._k_means(z, self.n_clusters)
279
280 1 6 6.0 0.0 n_rows = X.shape[0]
281 1 4 4.0 0.0 self.row_labels_ = labels[:n_rows]
282 1 3 3.0 0.0 self.column_labels_ = labels[n_rows:]
283
284 1 3 3.0 0.0 self.rows_ = np.vstack(self.row_labels_ == c
285 1 590 590.0 0.0 for c in range(self.n_clusters))
286 1 2 2.0 0.0 self.columns_ = np.vstack(self.column_labels_ == c
287 1 3263 3263.0 0.0 for c in range(self.n_clusters))
* SpectralCoclustering sparse test: 20newsgroups_vectorized, with mini_batch=True
File: sklearn/cluster/bicluster/spectral.py
Function: _fit at line 271
Total time: 1.92757 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
271 def _fit(self, X):
272 1 140340 140340.0 7.3 normalized_data, row_diag, col_diag = _scale_normalize(X)
273 1 17 17.0 0.0 n_sv = 1 + int(np.ceil(np.log2(self.n_clusters)))
274 1 401109 401109.0 20.8 u, v = self._svd(normalized_data, n_sv, n_discard=1)
275 1 262 262.0 0.0 z = np.vstack((row_diag[:, np.newaxis] * u,
276 1 3825 3825.0 0.2 col_diag[:, np.newaxis] * v))
277
278 1 1377933 1377933.0 71.5 _, labels = self._k_means(z, self.n_clusters)
279
280 1 9 9.0 0.0 n_rows = X.shape[0]
281 1 5 5.0 0.0 self.row_labels_ = labels[:n_rows]
282 1 2 2.0 0.0 self.column_labels_ = labels[n_rows:]
283
284 1 4 4.0 0.0 self.rows_ = np.vstack(self.row_labels_ == c
285 1 562 562.0 0.0 for c in range(self.n_clusters))
286 1 2 2.0 0.0 self.columns_ = np.vstack(self.column_labels_ == c
287 1 3501 3501.0 0.2 for c in range(self.n_clusters))
* SpectralBiclustering dense test: 40000 x 300 checkerboard matrix, with mini_batch left at its default
File: sklearn/cluster/bicluster/spectral.py
Function: _fit at line 411
Total time: 9.88391 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
411 def _fit(self, X):
412 1 4 4.0 0.0 n_sv = self.n_components
413 1 3 3.0 0.0 if self.method == 'bistochastic':
414 1 3718143 3718143.0 37.6 normalized_data = _bistochastic_normalize(X)
415 1 3 3.0 0.0 n_sv += 1
416 elif self.method == 'scale':
417 normalized_data, _, _ = _scale_normalize(X)
418 n_sv += 1
419 elif self.method == 'log':
420 normalized_data = _log_normalize(X)
421 1 3 3.0 0.0 n_discard = 0 if self.method == 'log' else 1
422 1 2105260 2105260.0 21.3 u, v = self._svd(normalized_data, n_sv, n_discard)
423 1 3 3.0 0.0 ut = u.T
424 1 2 2.0 0.0 vt = v.T
425
426 1 2 2.0 0.0 try:
427 1 12 12.0 0.0 n_row_clusters, n_col_clusters = self.n_clusters
428 1 3 3.0 0.0 except TypeError:
429 1 2 2.0 0.0 n_row_clusters = n_col_clusters = self.n_clusters
430
431 1 2 2.0 0.0 best_ut = self._fit_best_piecewise(ut, self.n_best,
432 1 3033660 3033660.0 30.7 n_row_clusters)
433
434 1 5 5.0 0.0 best_vt = self._fit_best_piecewise(vt, self.n_best,
435 1 296716 296716.0 3.0 n_col_clusters)
436
437 1 3 3.0 0.0 self.row_labels_ = self._project_and_cluster(X, best_vt.T,
438 1 650654 650654.0 6.6 n_row_clusters)
439
440 1 9 9.0 0.0 self.column_labels_ = self._project_and_cluster(X.T, best_ut.T,
441 1 71899 71899.0 0.7 n_col_clusters)
* SpectralBiclustering sparse test: 20newsgroups_vectorized, with mini_batch=False
File: sklearn/cluster/bicluster/spectral.py
Function: _fit at line 411
Total time: 130.362 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
411 def _fit(self, X):
412 1 5 5.0 0.0 n_sv = self.n_components
413 1 3 3.0 0.0 if self.method == 'bistochastic':
414 1 168287 168287.0 0.1 normalized_data = _bistochastic_normalize(X)
415 1 3 3.0 0.0 n_sv += 1
416 elif self.method == 'scale':
417 normalized_data, _, _ = _scale_normalize(X)
418 n_sv += 1
419 elif self.method == 'log':
420 normalized_data = _log_normalize(X)
421 1 3 3.0 0.0 n_discard = 0 if self.method == 'log' else 1
422 1 421802 421802.0 0.3 u, v = self._svd(normalized_data, n_sv, n_discard)
423 1 3 3.0 0.0 ut = u.T
424 1 2 2.0 0.0 vt = v.T
425
426 1 2 2.0 0.0 try:
427 1 13 13.0 0.0 n_row_clusters, n_col_clusters = self.n_clusters
428 1 5 5.0 0.0 except TypeError:
429 1 1 1.0 0.0 n_row_clusters = n_col_clusters = self.n_clusters
430
431 1 2 2.0 0.0 best_ut = self._fit_best_piecewise(ut, self.n_best,
432 1 5315974 5315974.0 4.1 n_row_clusters)
433
434 1 4 4.0 0.0 best_vt = self._fit_best_piecewise(vt, self.n_best,
435 1 104340257 104340257.0 80.0 n_col_clusters)
436
437 1 10 10.0 0.0 self.row_labels_ = self._project_and_cluster(X, best_vt.T,
438 1 3851173 3851173.0 3.0 n_row_clusters)
439
440 1 149 149.0 0.0 self.column_labels_ = self._project_and_cluster(X.T, best_ut.T,
441 1 16182799 16182799.0 12.4 n_col_clusters)
* SpectralBiclustering sparse test: 20newsgroups_vectorized, with mini_batch=True
File: sklearn/cluster/bicluster/spectral.py
Function: _fit at line 411
Total time: 3.26277 s
Line # Hits Time Per Hit % Time Line Contents
==============================================================
411 def _fit(self, X):
412 1 3 3.0 0.0 n_sv = self.n_components
413 1 2 2.0 0.0 if self.method == 'bistochastic':
414 1 151776 151776.0 4.7 normalized_data = _bistochastic_normalize(X)
415 1 3 3.0 0.0 n_sv += 1
416 elif self.method == 'scale':
417 normalized_data, _, _ = _scale_normalize(X)
418 n_sv += 1
419 elif self.method == 'log':
420 normalized_data = _log_normalize(X)
421 1 3 3.0 0.0 n_discard = 0 if self.method == 'log' else 1
422 1 402177 402177.0 12.3 u, v = self._svd(normalized_data, n_sv, n_discard)
423 1 3 3.0 0.0 ut = u.T
424 1 2 2.0 0.0 vt = v.T
425
426 1 1 1.0 0.0 try:
427 1 13 13.0 0.0 n_row_clusters, n_col_clusters = self.n_clusters
428 1 4 4.0 0.0 except TypeError:
429 1 1 1.0 0.0 n_row_clusters = n_col_clusters = self.n_clusters
430
431 1 3 3.0 0.0 best_ut = self._fit_best_piecewise(ut, self.n_best,
432 1 760194 760194.0 23.3 n_row_clusters)
433
434 1 4 4.0 0.0 best_vt = self._fit_best_piecewise(vt, self.n_best,
435 1 1570605 1570605.0 48.1 n_col_clusters)
436
437 1 10 10.0 0.0 self.row_labels_ = self._project_and_cluster(X, best_vt.T,
438 1 120669 120669.0 3.7 n_row_clusters)
439
440 1 159 159.0 0.0 self.column_labels_ = self._project_and_cluster(X.T, best_ut.T,
441 1 174007 174007.0 5.3 n_col_clusters)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment