Last active
July 16, 2016 14:33
-
-
Save brezniczky/95edde389469e4d0ff8e2158701469fd to your computer and use it in GitHub Desktop.
xgboost random forests shown converging in parallel
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Here I will try to visualize how an xgboost random forest approximates the
# training set in case of a binary classification as the number of weak
# classifiers in the ensemble increase, and/or as the maximum depth of trees
# change.
library(animation)
library(xgboost)

# number of boosting rounds to train and animate
n.iters <- 400
# playback speed factor, should be an integer: every video.speed-th boosting
# round becomes one movie frame
video.speed <- 2

# generate a sparse matrix describing a large rectangle of 0's, with two
# rectangles of 1's inside
# the input is within a 60 x 40 (wd x ht) rectangle
x <- rep(1:60, times = 40)
y <- rep(1:40, each = 60)

# the target value == TRUE areas are rectangles spanning over
# (5, 7) .. (7, 13) and (25, 20) .. (30, 30)
#
# also adding some (~ 2%) uniformly distributed noise; length(x) keeps the
# noise vector in sync with the grid size (previously a hard-coded
# 2400 = 60 * 40)
v <-
  ((x >= 5) & (y >= 7) & (x <= 7) & (y <= 13)) |
  ((x >= 25) & (y >= 20) & (x <= 30) & (y <= 30)) |
  runif(length(x), 0, 1) > 0.98

# cbind() of two vectors already yields a matrix, no as.matrix() needed
mat <- cbind(x = as.double(x), y = as.double(y))
train <- xgb.DMatrix(mat, label = v)
# Train a gbtree ensemble of the given maximum depth on the global `train`
# DMatrix for `n.iters` rounds.
#
# max.depth: maximum tree depth, forwarded to the booster parameters.
# Returns the fitted xgb.Booster.
#
# NOTE(review): reads the globals `train` and `n.iters`, and resets the
# global RNG via set.seed(0) so the four models (and re-runs) start from
# identical random state and stay comparable.
train.model <- function(max.depth) {
  xgb.params <- list(
    booster = "gbtree",
    max.depth = max.depth)
  set.seed(0)
  # train and return the resulting model
  xgb.train(
    params = xgb.params,
    data = train,
    nrounds = n.iters,
    nthread = 1)
}
# Overlay onto the current plot an image of the model's predictions using
# only the first `frame.idx` trees of the ensemble, over the 60 x 40 grid.
# Reads the global `train` DMatrix for the prediction inputs.
draw.phase <- function(model, frame.idx) {
  phase.pred <- predict(model, train, ntreelimit = frame.idx)
  # predictions are laid out x-fastest (x = rep(1:60, times = 40)), so
  # filling a 60-row matrix column-wise yields z[x, y] directly -- the same
  # result as transposing a 40 x 60 byrow matrix
  z <- matrix(phase.pred, nrow = 60, ncol = 40)
  image(x = 1:60, y = 1:40, z = z, axes = FALSE,
        xlab = "x", ylab = "y", add = TRUE)
}
# a movie of 2x2 plots allowing to compare
#
# left to right, top to bottom
#
# 1: shows what underfitting is about
# 2: nice enough
# 3: seems perfect at 36 iterations
# 4: visible overfitting almost from the start: "stars" appear
#
# observation:
# finding the right max_depth parameter seems to be about some sort of
# filtering (but not "blurring", at least in the given cases this sort of
# "RF-style filtering" maintains the edges mostly very sharply, except for the
# effect resulting from subsampling - which is possibly just another 'filtering'
# parameter)

# fit one model per candidate depth; lapply builds the list directly instead
# of growing it inside a for loop
models <- lapply(1:4, function(max.depth) train.model(max.depth = max.depth))
cat("generating comparison movie\n")
movie.name <- "random_forest_binary_comp.mpg"

# Render one frame per `video.speed` boosting rounds; each frame is a 2x2
# panel showing the four models' prediction images at that round.
saveVideo({
  ani.options(nmax = n.iters %/% video.speed)
  par(mfrow = c(2, 2))
  for (frame.idx in seq(1, n.iters, by = video.speed)) {
    for (max.depth in 1:4) {
      # empty plot establishes the coordinate system that draw.phase's
      # image(..., add = TRUE) overlays onto
      plot(x = c(), xlim = c(1, 60), ylim = c(1, 40),
           axes = FALSE, xlab = "", ylab = "",
           main = paste0("xgboost at iter #", frame.idx, " depth ", max.depth))
      draw.phase(models[[max.depth]], frame.idx)
    }
  }},
  interval = 0.1, movie.name = movie.name,
  ani.width = 600, ani.height = 400)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment