# Learn Gradient Boosting Model by coding
# Good references (a slide deck and the gbm package manual):
# http://www.ccs.neu.edu/home/vip/teach/MLcourse/4_boosting/slides/gradient_boosting.pdf
# http://cran.r-project.org/web/packages/gbm/gbm.pdf
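# The idea in one line: start from a constant model, then repeatedly fit a
# small tree to the negative gradient of the loss (for squared loss, simply
# the residuals) and add a shrunken copy of that tree to the model.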
# 1. Gradient Boosting for Regression
# generate data
generate_func = function(n=1000,size=0.5){
  # n: number of data points
  # size: fraction of the data held out as the training set (default 50%)
  set.seed(1)
  x = seq(0,2*pi,length=n)  # generate x
  noise = rnorm(n,0,0.3)
  y = sin(x) + noise        # generate y = sin(x) plus Gaussian noise
  select_index = sample(1:n,size=size*n,replace=FALSE)
  # split data into train and test
  train_x = x[select_index]
  train_y = y[select_index]
  test_x = x[-select_index]
  test_y = y[-select_index]
  data = list('train_x'=train_x,
              'train_y'=train_y,
              'test_x'=test_x,
              'test_y'=test_y)
  return(data)
}
data = generate_func()
objects(data)
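# A quick optional look at the generated data (added; base graphics):
# the points follow sin(x) plus Gaussian noise with sd 0.3.
plot(data$train_x, data$train_y, pch=20, col='grey60',
     xlab='x', ylab='y', main='training data')
curve(sin(x), add=TRUE, col='red', lwd=2)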
# train a boosted regression tree
GBR = function(x,y,rate=1,iter=100){
  # iter: number of boosting iterations; more fits the training data better,
  #       but be careful about overfitting.
  # rate: learning rate; lower generalizes better but needs more iterations,
  #       which takes longer.
  library(rpart)
  max_depth = 1  # tree stumps as weak learners
  mu = mean(y)   # start with an initial constant model
  dy = y - mu    # residuals: the part the current model cannot explain yet
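  # Why residuals (added note): for squared loss L(y,F) = (y-F)^2/2 the
  # negative gradient -dL/dF equals y - F, so fitting each tree to the
  # residuals is gradient descent in function space.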
  learner = list()  # holder for the weak learners
  for (i in 1:iter) {
    # fit a weak model to the current residuals
    learner[[i]] = rpart(dy~x,  # the residual is the target variable
                         control=rpart.control(maxdepth=max_depth,cp=0))  # cp=0: accept any split regardless of complexity cost
    # update the residuals for the next iteration
    dy = dy - rate*predict(learner[[i]])
  }
  result = list('learner'=learner,
                'rate'=rate,
                'iter'=iter,
                'mu'=mu)  # store the training baseline for prediction
  return(result)
}
model = GBR(data$train_x,data$train_y,iter=1000)
objects(model)
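# For reference, the gbm package (see the pdf linked above) fits the same
# kind of model. This is just a sketch, assuming gbm is installed; it is not
# used in the rest of the walkthrough. interaction.depth=1 gives stumps,
# shrinkage is the learning rate, and bag.fraction=1 disables subsampling so
# it matches the deterministic loop above.
if (requireNamespace('gbm', quietly=TRUE)) {
  gbm_fit = gbm::gbm(y~x, distribution='gaussian',
                     data=data.frame(x=data$train_x, y=data$train_y),
                     n.trees=1000, shrinkage=1, interaction.depth=1,
                     bag.fraction=1)
}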
# predict function
predict_GBR = function(x,y,model){
  predict_y = list()  # holds the staged predictions
  mu = model$mu       # baseline from the training data (using mean(y) here would leak information when y is the test set)
  n = length(y)
  iter = model$iter
  rate = model$rate
  predict_y[[1]] = rep(mu,n)
  learner = model$learner
  # predict_y[[i]] is the prediction of the model after i-1 trees
  for (i in 2:iter) {
    # add the next tree's prediction to the running total
    predict_y[[i]] = predict_y[[i-1]] + rate * predict(learner[[i-1]],newdata=data.frame(x))
  }
  # mean squared error at each stage
  mse = sapply(predict_y,function(pre_y) round(mean((y-pre_y)^2),3))
  result = list('predict_y'=predict_y,
                'mse'= mse)
  return(result)
}
# predict on the training data
predict_train = predict_GBR(data$train_x,data$train_y,model=model)
objects(predict_train)
# on the training data, more trees always give a lower error
plot(predict_train$mse,type='l')
# plot the effect of the boosted trees after `num` stages
plotfunc = function(x,y,predict,num){
  library(ggplot2)
  pre = predict$predict_y[[num]]
  plotdf = data.frame(x=x,y=y,pre=pre)
  mse = round(mean((y-pre)^2),3)
  p = ggplot(plotdf,aes(x,y)) +
    geom_point(alpha=0.5)
  p = p + geom_line(aes(x,pre)) + xlab(paste0('mse=',mse))
  plot(p)
}
# stage 1: predict with just the mean of y
plotfunc(data$train_x,data$train_y,predict_train,num=1)
# add one tree to the model
plotfunc(data$train_x,data$train_y,predict_train,num=2)
# adding more trees gives a better fit
plotfunc(data$train_x,data$train_y,predict_train,num=10)
plotfunc(data$train_x,data$train_y,predict_train,num=100)
# predict on the test data
predict_test = predict_GBR(data$test_x,data$test_y,model=model)
plot(predict_test$mse,type='l')
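# Unlike the training curve, the test curve need not keep falling: if it
# bottoms out and then rises, the additional trees are overfitting.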
plotfunc(data$test_x,data$test_y,predict_test,10)
plotfunc(data$test_x,data$test_y,predict_test,100)
# compare different parameters
# create 2 models with different learning rates
model_1 = GBR(data$train_x,data$train_y,rate=1,iter=500)
model_2 = GBR(data$train_x,data$train_y,rate=0.1,iter=500)
# with train and test data we get 4 sets of predictions
predict_train_1 = predict_GBR(data$train_x,data$train_y,model=model_1)
predict_train_2 = predict_GBR(data$train_x,data$train_y,model=model_2)
predict_test_1 = predict_GBR(data$test_x,data$test_y,model=model_1)
predict_test_2 = predict_GBR(data$test_x,data$test_y,model=model_2)
# extract the mse curves of these 4 results
train_error_1 = predict_train_1$mse
train_error_2 = predict_train_2$mse
test_error_1 = predict_test_1$mse
test_error_2 = predict_test_2$mse
iter = 1:model_1$iter
# compare the mse curves (solid: rate=1, dashed: rate=0.1; blue: train, red: test)
plotdf = data.frame(iter,train_error_1,test_error_1,train_error_2,test_error_2)
p = ggplot(plotdf)+
  geom_line(aes(x=iter,y=train_error_1),color='blue')+
  geom_line(aes(x=iter,y=test_error_1),color='red') +
  geom_line(aes(x=iter,y=train_error_2),linetype=2,color='blue')+
  geom_line(aes(x=iter,y=test_error_2),linetype=2,color='red')
print(p)
# test error is the better measure of model performance
# a lower rate generalizes better but needs more iterations
which.min(train_error_1)
which.min(train_error_2)
which.min(test_error_1)
which.min(test_error_2)
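# A quick numeric check (added): the minimum test errors themselves show
# whether the slower learner ends up with the better test mse, as claimed above.
min(test_error_1)
min(test_error_2)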
# 2. Gradient Boosting for Classification
# prepare data: two iris species, two features
data = subset(iris,Species!='virginica',select=c(1,2,5))
data$y = ifelse(data$Species=='setosa',1,-1)  # labels coded as {1,-1}
data$Species = NULL
names(data)[1:2] = c('x1','x2')
head(data)
p = ggplot(data,aes(x1,x2,color=factor(y)))+
  geom_point()
print(p)
# train a boosted tree for classification
GBC = function(data,rate=0.1,iter=100){
  library(rpart)
  max_depth = 1  # tree stumps as weak learners
  learner = list()
  mu = mean(data$y==1)  # empirical p(y=1)
  # start with the best constant model: for log loss
  # that is the log-odds, f0 = log(mu/(1-mu))
  f0 = log(mu/(1-mu))
  f = f0
  data$dy = data$y/(1+exp(data$y*f))  # dy is the negative gradient of the log loss function
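  # Derivation (added note): with labels y in {-1,1} and score f, the log
  # loss is L(y,f) = log(1+exp(-y*f)). Minimizing it over a constant f gives
  # the log-odds f0 above, and the negative gradient is
  #   -dL/df = y*exp(-y*f)/(1+exp(-y*f)) = y/(1+exp(y*f)),
  # which is exactly the dy computed above.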
  for (i in 1:iter) {
    # fit a weak model to the current negative gradient
    learner[[i]] = rpart(dy~x1+x2,data=data,
                         control=rpart.control(maxdepth=max_depth,cp=0))
    # improve the model
    f = f + rate*predict(learner[[i]])
    # update dy for the next iteration
    data$dy = data$y/(1+exp(data$y*f))
  }
  result = list('learner'=learner,
                'rate'=rate,
                'iter'=iter,
                'f0'=f0)  # store the initial log-odds for prediction
  return(result)
}
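# For reference (a sketch, assuming the gbm package is available; not used
# later): gbm fits the same kind of classifier with distribution='bernoulli',
# which expects labels coded as {0,1} rather than {-1,1}.
if (requireNamespace('gbm', quietly=TRUE)) {
  data01 = transform(data, y=as.integer(y==1))  # recode labels to {0,1}
  gbc_fit = gbm::gbm(y~x1+x2, distribution='bernoulli', data=data01,
                     n.trees=1000, shrinkage=0.1, interaction.depth=1,
                     bag.fraction=1)
}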
model = GBC(data,rate=0.1,iter=1000)
objects(model)
# predict function
predict_GBC = function(data,model){
  predict_y = list()
  f = model$f0  # initial log-odds learned from the training data
  n = nrow(data)
  iter = model$iter
  rate = model$rate
  predict_y[[1]] = rep(f,n)
  learner = model$learner
  # predict_y[[i]] is the score of the model after i-1 trees
  for (i in 2:iter) {
    predict_y[[i]] = predict_y[[i-1]] + rate*predict(learner[[i-1]],newdata=data)
  }
  # 'mse' here actually holds the total logistic loss at each stage
  mse = sapply(predict_y,function(pre_y) sum(log(1+exp(-data$y*pre_y))))
  result = list('predict_y'=predict_y,
                'mse'= mse)
  return(result)
}
# predict on the training data
predict_train = predict_GBC(data,model=model)
objects(predict_train)
plot(predict_train$mse,type='l')
final = predict_train$predict_y[[1000]]
y_p = 1/(1+exp(-final))  # convert scores to probabilities
y_pred = ifelse(y_p>0.5,1,-1)
table(data$y, y_pred)
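# overall training accuracy (added as a quick summary of the table above)
mean(y_pred == data$y)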
# plot the decision boundary built up from the stump splits
plotfunc2 = function(data,num,rate=0.1){
  library(ggplot2)
  model = GBC(data,rate=rate,iter=num)
  predict_train = predict_GBC(data,model=model)
  final = predict_train$predict_y[[num]]
  y_p = 1/(1+exp(-final))
  data$y_pre = ifelse(y_p>0.5,1,-1)
  # which variable each stump split on
  tree_f = sapply(model$learner,function(x){
    temp = x$splits
    return(row.names(temp)[1])
  })
  # the split point each stump used
  tree_v = sapply(model$learner,function(x){
    temp = x$splits
    return(temp[1,'index'])
  })
  x1value = tree_v[tree_f=='x1']
  x2value = tree_v[tree_f=='x2']
  p = ggplot(data,aes(x=x1,y=x2))
  p = p + geom_point(aes(color=factor(y_pre)),size=5) +  # outer ring: predicted class
    geom_point(aes(color=factor(y)),size=3)+             # inner dot: true class
    geom_vline(xintercept=x1value,alpha=0.4) +           # stump splits on x1
    geom_hline(yintercept=x2value,alpha=0.4) +           # stump splits on x2
    scale_colour_brewer(type="seq",palette='Set1') +
    theme_bw()
  print(p)
}
plotfunc2(data,num=10)
plotfunc2(data,num=20)
plotfunc2(data,num=60)
plotfunc2(data,num=100)
plotfunc2(data,num=1000)