Created
December 29, 2014 05:34
-
-
Save cemoody/64ec3934e5b540e8f6e8 to your computer and use it in GitHub Desktop.
BH t-SNE Demo
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
### Summary | |
# This notebook presents the Barnes-Hut implementation of t-SNE. t-SNE is used to visualize high-dimensional data in a low dimensional space that attempts preserve the pairwise high-dimensional similarities in a low-dimensional embedding. The Barnes-Hut algorithm, which is used by astrophysicists to perform N-body simulations, allows the calculation of the t-SNE embedding in $O(N log N)$ time instead of $O(N^{2})$. This effectively allows us to learn embeddings of data sets with millions of elements instead of tens of thousands. | |
### Install | |
# To try out out the BH version of t-SNE, do the following: | |
# | |
# Checkout out scikit-learn | |
# | |
# git clone https://github.com/cemoody/scikit-learn.git | |
# | |
# Change to my branch | |
# | |
# git checkout cemoody/bhtsne | |
# | |
# Rebuild the sources, especially the new Cython file sklearn/manifold/bhtsne.pyx | |
# | |
# python setup.py build_ext -i | |
# | |
# Download this notebook | |
# | |
# curl https://gist.githubusercontent.com/cemoody/01135ef2f26837548360/raw/57288dd4ebd97a1d6447b14c78b8fb743147b91e/Barnes-Hut%20t-SNE%20Demo.ipynb > bhtsne_demo.ipynb | |
# | |
# Start the iPython Notebook, and then open the notebook | |
# | |
# ipython notebook | |
# | |
# Alternatively, you can download the script version of this and run it: | |
# | |
# | |
### Demo of BH t-SNE | |
# In[1]: | |
get_ipython().magic(u'matplotlib inline') | |
# In[2]: | |
from sklearn.manifold import bhtsne | |
import sys | |
import numpy as np | |
import matplotlib.pyplot as plt | |
from time import time | |
from matplotlib import offsetbox | |
from sklearn import (manifold, datasets, decomposition, ensemble, lda, random_projection) | |
from sklearn.datasets import fetch_mldata | |
# In[3]: | |
digits = datasets.load_digits(n_class=6) | |
X = digits.data | |
y = digits.target | |
n_samples, n_features = X.shape | |
n_neighbors = 30 | |
# In[4]: | |
def plot_embedding(X, title=None): | |
x_min, x_max = np.min(X, 0), np.max(X, 0) | |
X = (X - x_min) / (x_max - x_min) | |
plt.figure(figsize=(15,15)) | |
ax = plt.subplot(111) | |
for i in range(X.shape[0]): | |
plt.text(X[i, 0], X[i, 1], str(digits.target[i]), | |
color=plt.cm.Set1(y[i] / 10.), | |
fontdict={'weight': 'bold', 'size': 9}) | |
if hasattr(offsetbox, 'AnnotationBbox'): | |
# only print thumbnails with matplotlib > 1.0 | |
shown_images = np.array([[1., 1.]]) # just something big | |
for i in range(digits.data.shape[0]): | |
dist = np.sum((X[i] - shown_images) ** 2, 1) | |
if np.min(dist) < 4e-3: | |
# don't show points that are too close | |
continue | |
shown_images = np.r_[shown_images, [X[i]]] | |
imagebox = offsetbox.AnnotationBbox( | |
offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r), | |
X[i]) | |
ax.add_artist(imagebox) | |
plt.xticks([]), plt.yticks([]) | |
if title is not None: | |
plt.title(title) | |
# In[5]: | |
tsne = manifold.TSNE(n_components=2, init='pca', random_state=1, method='barnes_hut', n_iter=1000, verbose=20) | |
t0 = time() | |
Xtbh = tsne.fit_transform(X) | |
t1 = time() | |
dtbh = t1 - t0 | |
# In[6]: | |
plot_embedding(Xtbh, title="Barnes-Hut t-SNE visualization of MNIST digits in %1.1f sec" % dtbh) | |
### Comparison with standard t-SNE | |
# In[7]: | |
tsne = manifold.TSNE(n_components=2, init='pca', random_state=1, method='standard', n_iter=1000, verbose=20) | |
t0 = time() | |
Xts = tsne.fit_transform(X) | |
t1 = time() | |
dts = t1 - t0 | |
# In[8]: | |
plot_embedding(Xts, title="Standard t-SNE visualization of MNIST digits in %1.1f sec" % dts) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment