Skip to content

Instantly share code, notes, and snippets.

@goldingn
Last active January 3, 2016 05:49
Show Gist options
  • Save goldingn/8418141 to your computer and use it in GitHub Desktop.
Save goldingn/8418141 to your computer and use it in GitHub Desktop.
Playing with CUR decomposition (versus k-means) as a method for picking inducing points in sparse Gaussian processes
# clear the workspace
rm(list = ls())
# load the relevant libraries
# install.packages(rCUR)
library(rCUR) # for CUR decomposition
# install.packages(irlba)
library(irlba) # for fast svd
# ~~~~~~~~~~~~~~~~~
# generate data
# number of records
n <- 2000
# number of clusters/imputing points (20%)
ncl <- n * 0.2
# dataframe of datapoints (in covariate/feature space)
df <- data.frame(x = rnorm(n),
y = rnorm(n))
# compute k-means cluster centres (& time it)
kmn_time <- system.time(kmn <- kmeans(df, ncl))
# CUR - pre-calculations
# compute distance matrix & time it
dist_time <- system.time(d <- as.matrix(dist(df)))
# fast svd decomposition
svd_time <- system.time(svd <- irlba(d))
# compute the first CUR (optimal points) & time it
cur1_time <- system.time(cur1 <- CUR(d, ncl,
method = "ortho.top.scores",
sv = svd, alpha = 2))
# compute the second CUR (random weighted points) & time it
cur2_time <- system.time(cur2 <- CUR(d, ncl,
method = "exact.num.random",
sv = svd, alpha = 2))
# compute Cholesky pivots
chol_time <- system.time(ch <- attr(chol(exp(-0.5 * d),
pivot = TRUE),
'pivot'))
# set up plotting
par(mfrow = c(2, 2),
mar = c(1, 1, 4, 1),
pty = 's')
# compute and plot the k-means solution
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.6), cex = 0.5)
# plot the centres
points(kmn$centers, pch = 16, col = 'blue', cex = 0.5)
title(main = 'kmeans')
# compute and plot the CUR solution with optimal selection of columns/rows
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
# plot inducing points
points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5)
title(main = 'CUR, ortho.top.scores')
# compute and plot the CUR solution with weighted random selection of columns/rows
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
# plot inducing points
points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5)
title(main = 'CUR, exact.num.random')
# compute and plot the CUR solution with weighted random selection of columns/rows
# plot all data
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
# plot inducing points
points(y ~ x, data = df[ch[1:ncl], ], pch = 16, col = 'blue', cex = 0.5)
title(main = 'Cholesky pivots')
# ~~~~~~~~~~
# timings
# distance matrix computation time
dist_time
# svd computation time
svd_time
# cur1 (optimal) computation time
cur1_time
# cur2 (random) computation time
cur2_time
# pivoted cholesky decomposition time
chol_time
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment