Skip to content

Instantly share code, notes, and snippets.

@brezniczky
Last active July 16, 2016 14:33
Show Gist options
  • Save brezniczky/95edde389469e4d0ff8e2158701469fd to your computer and use it in GitHub Desktop.
xgboost random forests shown converging in parallel
# Here I will try to visualize how an xgboost random forest approximates the
# training set in case of a binary classification as the number of weak
# classifiers in the ensemble increases, and/or as the maximum depth of trees
# changes.
library(animation)
library(xgboost)
# Experiment parameters.
n.iters <- 400    # number of boosting rounds to train per model
video.speed <- 2  # frame-skip factor, should be an integer

# Generate training data: every point of a grid.width x grid.height grid,
# labelled TRUE inside two rectangles and FALSE elsewhere.
grid.width <- 60
grid.height <- 40
x <- rep(1:grid.width, times = grid.height)
y <- rep(1:grid.height, each = grid.width)

# The target value == TRUE areas are rectangles spanning over
# (5, 7) .. (7, 13) and (25, 20) .. (30, 30)
#
# also adding some (~ 2%) uniformly distributed noise
# (note: `>` binds tighter than `|`, so the noise term is the comparison)
v <-
  ((x >= 5) & (y >= 7) & (x <= 7) & (y <= 13)) |
  ((x >= 25) & (y >= 20) & (x <= 30) & (y <= 30)) |
  runif(length(x), 0, 1) > 0.98  # length(x) replaces the hard-coded 2400

# xgboost wants a numeric matrix; keep the (x, y) column names for clarity
mat <- as.matrix(cbind(x = as.double(x), y = as.double(y)))
train <- xgb.DMatrix(mat, label = v)
# Train an xgboost gradient-boosted tree ensemble.
#
# max.depth: maximum depth of the individual weak-learner trees.
# data:      an xgb.DMatrix to train on (defaults to the global `train`).
# nrounds:   number of boosting rounds (defaults to the global `n.iters`).
#
# Returns the fitted xgb.Booster. set.seed(0) keeps runs reproducible so
# the four models in the comparison differ only in max.depth.
train.model <- function(max.depth, data = train, nrounds = n.iters) {
  xgb.params <- list(
    booster = "gbtree",
    max.depth = max.depth)
  set.seed(0)
  # train and return the resulting model
  xgb.train(
    params = xgb.params,
    data = data,
    nrounds = nrounds,
    nthread = 1)
}
# Overlay the model's predictions onto the current plot, using only the
# first `frame.idx` trees of the ensemble (one animation frame per count).
draw.phase <- function(model, frame.idx) {
  prediction <- predict(model, train, ntreelimit = frame.idx)
  # predictions come back in row-major (x-fastest) order; reshape to the
  # 40 x 60 grid and transpose so image() sees z[x, y]
  z.grid <- t(matrix(data = prediction, nrow = 40, ncol = 60, byrow = TRUE))
  image(x = 1:60, y = 1:40, z = z.grid,
        axes = FALSE, xlab = "x", ylab = "y", add = TRUE)
}
# a movie of 2x2 plots allowing to compare
#
# left to right, top to bottom
#
# 1: shows what underfitting is about
# 2: nice enough
# 3: seems perfect at 36 iterations
# 4: visible overfitting almost from the start: "stars" appear
# observation:
# finding the right max_depth parameter seems to be about some sort of
# filtering (but not "blurring", at least in the given cases this sort of
# "RF-style filtering" maintains the edges mostly very sharply, except for the
# effect resulting from subsampling - which is possibly just another 'filtering'
# parameter)
# Fit one model per maximum tree depth setting (1 through 4).
models <- vector("list", 4)
for (max.depth in 1:4) {
  models[[max.depth]] <- train.model(max.depth = max.depth)
}
cat("generating comparison movie\n")
movie.name <- "random_forest_binary_comp.mpg"
# Render the animation: one 2x2 frame per sampled iteration count, each
# panel showing one of the four max-depth models at that ensemble size.
saveVideo({
  ani.options(nmax = n.iters %/% video.speed)
  par(mfrow = c(2, 2))
  for (frame.idx in seq(1, n.iters, by = video.speed)) {
    for (max.depth in 1:4) {
      # empty plot establishing the coordinate system for draw.phase()
      plot(x = c(), xlim = c(1, 60), ylim = c(1, 40),
           axes = FALSE, xlab = "", ylab = "",
           main = sprintf("xgboost at iter #%d depth %d",
                          frame.idx, max.depth))
      draw.phase(models[[max.depth]], frame.idx)
    }
  }
},
interval = 0.1, movie.name = movie.name,
ani.width = 600, ani.height = 400)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment