goldingn · January 3, 2016 05:49
diff --git a/CUR4FIC b/CUR4FIC
 # clear the workspace
 rm(list = ls())

 # load the relevant libraries

 # install.packages(rCUR)
 library(rCUR)  # for CUR decomposition
 # install.packages(irlba)
 library(irlba)  # for fast svd

 # ~~~~~~~~~~~~~~~~~
 # generate data
 # number of records
 n <- 2000

 # number of clusters/imputing points (20%)
 ncl <- n * 0.2

 # dataframe of datapoints (in covariate/feature space)
 df <- data.frame(x = rnorm(n),
                 y = rnorm(n))



 # compute k-means cluster centres (& time it)
 kmn_time <- system.time(kmn <- kmeans(df, ncl))

 # CUR - pre-calculations

 # compute distance matrix & time it
 dist_time <- system.time(d <- as.matrix(dist(df)))

 # fast svd decomposition
 svd_time <- system.time(svd <- irlba(d))

 # compute the first CUR (optimal points) & time it
 cur1_time <- system.time(cur1 <- CUR(d, ncl,
                                     method = "ortho.top.scores",
                                     sv = svd, alpha = 2))

 # compute the second CUR (random weighted points) & time it
 cur2_time <- system.time(cur2 <- CUR(d, ncl,
                                     method = "exact.num.random",
                                     sv = svd, alpha = 2))

 # compute Cholesky pivots
 chol_time <- system.time(ch <- attr(chol(exp(-0.5 * d),
                                         pivot = TRUE),
                                    'pivot'))

 # set up plotting
 par(mfrow = c(2, 2),
    mar = c(1, 1, 4, 1),
    pty = 's')

 # compute and plot the k-means solution
 # plot all data
 plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.6), cex = 0.5)
 # plot the centres
 points(kmn$centers, pch = 16, col = 'blue', cex = 0.5)
 title(main = 'kmeans')

 # compute and plot the CUR solution with optimal selection of columns/rows
 # plot all data
 plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
 # plot inducing points
 points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5)
 title(main = 'CUR, ortho.top.scores')

 # compute and plot the CUR solution with weighted random selection of columns/rows
 # plot all data
 plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
 # plot inducing points
 points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5)
 title(main = 'CUR, exact.num.random')

 # compute and plot the CUR solution with weighted random selection of columns/rows
 # plot all data
 plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
 # plot inducing points
 points(y ~ x, data = df[ch[1:ncl], ], pch = 16, col = 'blue', cex = 0.5)
 title(main = 'Cholesky pivots')


 # ~~~~~~~~~~
 # timings
 # distance matrix computation time
 dist_time

 # svd computation time
 svd_time

 # cur1 (optimal) computation time
 cur1_time

 # cur2 (random) computation time
 cur2_time

 # pivoted cholesky decomposition time
 chol_time
	# clear the workspace
	rm(list = ls())

	# load the relevant libraries

	# install.packages(rCUR)
	library(rCUR) # for CUR decomposition
	# install.packages(irlba)
	library(irlba) # for fast svd

	# ~~~~~~~~~~~~~~~~~
	# generate data
	# number of records
	n <- 2000

	# number of clusters/imputing points (20%)
	ncl <- n * 0.2

	# dataframe of datapoints (in covariate/feature space)
	df <- data.frame(x = rnorm(n),
	y = rnorm(n))



	# compute k-means cluster centres (& time it)
	kmn_time <- system.time(kmn <- kmeans(df, ncl))

	# CUR - pre-calculations

	# compute distance matrix & time it
	dist_time <- system.time(d <- as.matrix(dist(df)))

	# fast svd decomposition
	svd_time <- system.time(svd <- irlba(d))

	# compute the first CUR (optimal points) & time it
	cur1_time <- system.time(cur1 <- CUR(d, ncl,
	method = "ortho.top.scores",
	sv = svd, alpha = 2))

	# compute the second CUR (random weighted points) & time it
	cur2_time <- system.time(cur2 <- CUR(d, ncl,
	method = "exact.num.random",
	sv = svd, alpha = 2))

	# compute Cholesky pivots
	chol_time <- system.time(ch <- attr(chol(exp(-0.5 * d),
	pivot = TRUE),
	'pivot'))

	# set up plotting
	par(mfrow = c(2, 2),
	mar = c(1, 1, 4, 1),
	pty = 's')

	# compute and plot the k-means solution
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.6), cex = 0.5)
	# plot the centres
	points(kmn$centers, pch = 16, col = 'blue', cex = 0.5)
	title(main = 'kmeans')

	# compute and plot the CUR solution with optimal selection of columns/rows
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
	# plot inducing points
	points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5)
	title(main = 'CUR, ortho.top.scores')

	# compute and plot the CUR solution with weighted random selection of columns/rows
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
	# plot inducing points
	points(y ~ x, data = df[[email protected], ], pch = 16, col = 'blue', cex = 0.5)
	title(main = 'CUR, exact.num.random')

	# compute and plot the CUR solution with weighted random selection of columns/rows
	# plot all data
	plot(y ~ x, data = df, pch = 16, col = rgb(0.4, 0.4, 0.4, 0.4), cex = 0.5)
	# plot inducing points
	points(y ~ x, data = df[ch[1:ncl], ], pch = 16, col = 'blue', cex = 0.5)
	title(main = 'Cholesky pivots')


	# ~~~~~~~~~~
	# timings
	# distance matrix computation time
	dist_time

	# svd computation time
	svd_time

	# cur1 (optimal) computation time
	cur1_time

	# cur2 (random) computation time
	cur2_time

	# pivoted cholesky decomposition time
	chol_time