Last active
January 3, 2016 05:49
-
-
Save goldingn/8418141 to your computer and use it in GitHub Desktop.
Playing with CUR decomposition (versus k-means) as a method for picking inducing points in sparse Gaussian processes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# clear the workspace | |
rm(list = ls()) | |
# load the relevant libraries | |
# install.packages(rCUR) | |
library(rCUR) # for CUR decomposition | |
# install.packages(irlba) | |
library(irlba) # for fast svd | |
# ~~~~~~~~~~~~~~~~~ | |
# generate data | |
# number of records | |
n <- 2000 | |
# number of clusters/imputing points (20%) | |
ncl <- n * 0.2 | |
# dataframe of datapoints (in covariate/feature space) | |
df <- data.frame(x = rnorm(n), | |
y = rnorm(n)) | |
# compute k-means cluster centres (& time it) | |
kmn_time <- system.time(kmn <- kmeans(df, ncl)) | |
# CUR - pre-calculations | |
# compute distance matrix & time it | |
dist_time <- system.time(d <- as.matrix(dist(df))) | |
# fast svd decomposition | |
svd_time <- system.time(svd <- irlba(d)) | |
# compute the first CUR (optimal points) & time it | |
cur1_time <- system.time(cur1 <- CUR(d, ncl, | |
method = "ortho.top.scores", | |
sv = svd, alpha = 2)) | |
# compute the second CUR (random weighted points) & time it | |
cur2_time <- system.time(cur2 <- CUR(d, ncl, | |
method = "exact.num.random", | |
sv = svd, alpha = 2)) | |
# compute Cholesky pivots | |
chol_time <- system.time(ch <- attr(chol(exp(-0.5 * d), | |
pivot = TRUE), | |
'pivot')) | |
# set up plotting | |
par(mfrow = c(2, 2), | |
mar = c(1, 1, 4, 1), | |
pty = 's') | |
# compute and plot the k-means solution | |
# plot all data | |
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.6), cex = 0.5) | |
# plot the centres | |
points(kmn$centers, pch = 16, col = 'blue', cex = 0.5) | |
title(main = 'kmeans') | |
# compute and plot the CUR solution with optimal selection of columns/rows | |
# plot all data | |
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5) | |
# plot inducing points | |
points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5) | |
title(main = 'CUR, ortho.top.scores') | |
# compute and plot the CUR solution with weighted random selection of columns/rows | |
# plot all data | |
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5) | |
# plot inducing points | |
points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5) | |
title(main = 'CUR, exact.num.random') | |
# compute and plot the CUR solution with weighted random selection of columns/rows | |
# plot all data | |
plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5) | |
# plot inducing points | |
points(y ~ x, data = df[ch[1:ncl], ], pch = 16, col = 'blue', cex = 0.5) | |
title(main = 'Cholesky pivots') | |
# ~~~~~~~~~~ | |
# timings | |
# distance matrix computation time | |
dist_time | |
# svd computation time | |
svd_time | |
# cur1 (optimal) computation time | |
cur1_time | |
# cur2 (random) computation time | |
cur2_time | |
# pivoted cholesky decomposition time | |
chol_time |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment