# XGBoost model selection and fine-tuning, from BUDT758T
library(xgboost)

airbnb_train <- read.csv()  # TODO: path to the training data
airbnb_test <- read.csv()   # TODO: path to the test data
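# Note (not in the original gist): everything below assumes high_booking_rate
# is coded as numeric 0/1. If it loads as a factor such as "YES"/"NO" (the
# level names here are an assumption about the data), a conversion along
# these lines would be needed first:
# airbnb_train$high_booking_rate <- as.numeric(airbnb_train$high_booking_rate == "YES")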
# Split a data frame into train/test partitions by random row sampling
train_test_split <- function(df, split_ = 0.7, seed_ = NULL) {
  if (!is.null(seed_)) {
    set.seed(seed_)
  }
  train <- sample(nrow(df), split_ * nrow(df))
  df_train <- df[train, ]
  df_test <- df[-train, ]
  return(list(train = df_train, test = df_test))
}
s <- train_test_split(airbnb_train, seed_ = 458)
train <- s[["train"]]
valid <- s[["test"]]
# XGBOOST: build design matrices (one-hot encode factors, no intercept)
xgb_train <- model.matrix(high_booking_rate ~ . - 1, train)
xgb_valid <- model.matrix(high_booking_rate ~ . - 1, valid)
train_xgb_matrix <- xgb.DMatrix(xgb_train, label = train$high_booking_rate)
val_xgb <- list(train = xgb.DMatrix(data = xgb_train, label = train$high_booking_rate),
                test = xgb.DMatrix(data = xgb_valid, label = valid$high_booking_rate))
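# Sanity check (not in the original gist): model.matrix silently drops rows
# containing NAs, which would misalign the design matrices with the label
# vectors passed to xgb.DMatrix above. A quick guard:
stopifnot(nrow(xgb_train) == nrow(train),
          nrow(xgb_valid) == nrow(valid))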
# Draw one random hyperparameter configuration, run 5-fold CV,
# and return its best CV accuracy together with the sampled parameters.
random_grid_xgb <- function(data, label) {
  # param data: model matrix or xgb.DMatrix; all columns must be numeric
  # param label: true labels for data
  param <- list(booster = "gbtree",
                objective = "binary:logistic",
                eval_metric = "error",
                max_depth = sample(6:10, 1),
                num_parallel_tree = sample(c(50, 100, 150), 1),
                eta = runif(1, .01, .3),  # learning rate, default: 0.3
                subsample = runif(1, .6, .9),
                colsample_bytree = runif(1, .5, .8),
                min_child_weight = sample(1:20, 1),
                max_delta_step = sample(1:10, 1),
                lambda = sample(seq(0.1, 1, 0.01), 1),
                nthread = 16)
  seed.number <- sample.int(10000, 1)  # record the seed used for this CV run
  set.seed(seed.number)
  mdcv <- xgb.cv(data = data,
                 label = label,
                 params = param,
                 nfold = 5,
                 nrounds = 3,  # each round fits num_parallel_tree trees (boosted random forest)
                 prediction = TRUE,
                 verbose = 1,
                 early_stopping_rounds = 8,
                 maximize = FALSE)  # "error" is minimized
  max_acc_index <- mdcv$best_iteration
  max_acc <- 1 - mdcv$evaluation_log[mdcv$best_iteration]$test_error_mean
  return(list(acc = max_acc, acc_index = max_acc_index, seed = seed.number, param = param))
}
# randomly search for the best parameters
best_acc <- 0
for (iter in 1:100) {
  result <- random_grid_xgb(xgb_train, train$high_booking_rate)
  if (result[["acc"]] > best_acc) {
    best_acc <- result[["acc"]]
    best_acc_index <- result[["acc_index"]]
    best_seed <- result[["seed"]]
    best_param <- result[["param"]]
  }
}
print(best_acc)
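# A small addition (not in the original gist): also print the winning
# parameters and seed, so they can be copied into the hard-coded block below.
print(best_param)
print(best_seed)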
# hard-code the best parameters printed by the search, so the model
# can be refit without rerunning the 100 iterations above:
best_param <- list(booster = "gbtree",
                   objective = "binary:logistic",
                   eval_metric = "error",
                   max_depth = 10,
                   num_parallel_tree = 50,
                   eta = 0.22,
                   subsample = 0.6918,
                   colsample_bytree = 0.6722,
                   min_child_weight = 14,
                   max_delta_step = 10,
                   lambda = 0.64,
                   nthread = 8)  # nthread only affects training speed, not the fit
# fit the model with the best parameters
best_seed <- 7576
set.seed(best_seed)
xgb_mod1 <- xgb.train(params = best_param,
                      data = train_xgb_matrix,
                      watchlist = val_xgb,
                      early_stopping_rounds = 8,
                      lambda_bias = 0.01,  # note: lambda_bias is a gblinear parameter; gbtree ignores it
                      maximize = FALSE,
                      nrounds = 100)
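# Optional follow-up (not in the original gist): inspect which features the
# model relies on, using xgboost's importance utilities.
importance <- xgb.importance(feature_names = colnames(xgb_train), model = xgb_mod1)
xgb.plot.importance(head(importance, 20))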
# check accuracy on all training data (optimistic; see validation check below)
xgb_pred <- predict(xgb_mod1, xgb_train)
acc <- mean((xgb_pred > 0.5) == train$high_booking_rate)
print(acc)
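# Added check (not in the original gist): accuracy on the held-out
# validation split gives a fairer estimate than training accuracy.
xgb_val_pred <- predict(xgb_mod1, xgb_valid)
val_acc <- mean((xgb_val_pred > 0.5) == valid$high_booking_rate)
print(val_acc)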
# XGBOOST output: build the test design matrix
xgb_test <- model.matrix(~ . - 1, airbnb_test)
# add zero-filled columns for factor levels seen in train but absent in test
for (col_name in colnames(xgb_train)) {
  if (!col_name %in% colnames(xgb_test)) {
    xgb_test <- cbind(xgb_test, rep(0, nrow(xgb_test)))
    colnames(xgb_test)[ncol(xgb_test)] <- col_name
  }
}
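# Guard (not in the original gist): confirm every feature the model expects
# now exists in the test matrix before reordering.
stopifnot(all(xgb_mod1[["feature_names"]] %in% colnames(xgb_test)))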
# align column order with the model's feature names
xgb_test <- xgb_test[, xgb_mod1[["feature_names"]]]
xgb_pred <- predict(xgb_mod1, xgb_test)
test_preds <- ifelse(xgb_pred > 0.5, 1, 0)
test_preds <- data.frame(index = airbnb_test$X, high_booking_rate = test_preds)
# fill in any index missing from the test set with a default prediction of 0
for (i in 1:max(airbnb_test$X)) {
  if (sum(test_preds$index == i) == 0) {
    test_preds[nrow(test_preds) + 1, ] <- c(i, 0)
  }
}
test_preds <- test_preds[order(test_preds$index), ]
write.csv(test_preds, "OUTPUT_PATH/FILENAME.csv", row.names = FALSE)
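# Quick sanity check (not in the original gist) of the submission frame:
print(head(test_preds))
print(nrow(test_preds))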