Skip to content

Instantly share code, notes, and snippets.

@brezniczky
Last active July 16, 2016 14:33
Show Gist options
  • Save brezniczky/95edde389469e4d0ff8e2158701469fd to your computer and use it in GitHub Desktop.
xgboost random forests shown converging in parallel
# Here I will try to visualize how an xgboost random forest approximates the
# training set in case of a binary classification as the number of weak
# classifiers in the ensemble increases, and/or as the maximum depth of trees
# changes.
library(animation)
library(xgboost)
# Experiment parameters.
n.iters <- 400    # number of boosting rounds to train per model
video.speed <- 2  # frame-skip factor, should be an integer

# Generate training data: every point of a grid.width x grid.height grid,
# labelled TRUE inside two rectangles and FALSE elsewhere.
grid.width <- 60
grid.height <- 40
x <- rep(1:grid.width, times = grid.height)
y <- rep(1:grid.height, each = grid.width)

# The target value == TRUE areas are rectangles spanning over
# (5, 7) .. (7, 13) and (25, 20) .. (30, 30)
#
# also adding some (~ 2%) uniformly distributed noise
# (note: `>` binds tighter than `|`, so the noise term is the comparison)
v <-
  ((x >= 5) & (y >= 7) & (x <= 7) & (y <= 13)) |
  ((x >= 25) & (y >= 20) & (x <= 30) & (y <= 30)) |
  runif(length(x), 0, 1) > 0.98  # length(x) replaces the hard-coded 2400

# xgboost wants a numeric matrix; keep the (x, y) column names for clarity
mat <- as.matrix(cbind(x = as.double(x), y = as.double(y)))
train <- xgb.DMatrix(mat, label = v)
# Train an xgboost gradient-boosted tree ensemble.
#
# max.depth: maximum depth of the individual weak-learner trees.
# data:      an xgb.DMatrix to train on (defaults to the global `train`).
# nrounds:   number of boosting rounds (defaults to the global `n.iters`).
#
# Returns the fitted xgb.Booster. set.seed(0) keeps runs reproducible so
# the four models in the comparison differ only in max.depth.
train.model <- function(max.depth, data = train, nrounds = n.iters) {
  xgb.params <- list(
    booster = "gbtree",
    max.depth = max.depth)
  set.seed(0)
  # train and return the resulting model
  xgb.train(
    params = xgb.params,
    data = data,
    nrounds = nrounds,
    nthread = 1)
}
# Overlay the model's predictions onto the current plot, using only the
# first `frame.idx` trees of the ensemble (one animation frame per count).
draw.phase <- function(model, frame.idx) {
  prediction <- predict(model, train, ntreelimit = frame.idx)
  # predictions come back in row-major (x-fastest) order; reshape to the
  # 40 x 60 grid and transpose so image() sees z[x, y]
  z.grid <- t(matrix(data = prediction, nrow = 40, ncol = 60, byrow = TRUE))
  image(x = 1:60, y = 1:40, z = z.grid,
        axes = FALSE, xlab = "x", ylab = "y", add = TRUE)
}
# a movie of 2x2 plots allowing to compare
#
# left to right, top to bottom
#
# 1: shows what underfitting is about
# 2: nice enough
# 3: seems perfect at 36 iterations
# 4: visible overfitting almost from the start: "stars" appear
# observation:
# finding the right max_depth parameter seems to be about some sort of
# filtering (but not "blurring", at least in the given cases this sort of
# "RF-style filtering" maintains the edges mostly very sharply, except for the
# effect resulting from subsampling - which is possibly just another 'filtering'
# parameter)
# Fit one model per maximum tree depth setting (1 through 4).
models <- vector("list", 4)
for (max.depth in 1:4) {
  models[[max.depth]] <- train.model(max.depth = max.depth)
}
cat("generating comparison movie\n")
movie.name <- "random_forest_binary_comp.mpg"
# Render the animation: one 2x2 frame per sampled iteration count, each
# panel showing one of the four max-depth models at that ensemble size.
saveVideo({
  ani.options(nmax = n.iters %/% video.speed)
  par(mfrow = c(2, 2))
  for (frame.idx in seq(1, n.iters, by = video.speed)) {
    for (max.depth in 1:4) {
      # empty plot establishing the coordinate system for draw.phase()
      plot(x = c(), xlim = c(1, 60), ylim = c(1, 40),
           axes = FALSE, xlab = "", ylab = "",
           main = sprintf("xgboost at iter #%d depth %d",
                          frame.idx, max.depth))
      draw.phase(models[[max.depth]], frame.idx)
    }
  }
},
interval = 0.1, movie.name = movie.name,
ani.width = 600, ani.height = 400)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment