Created
March 29, 2020 14:16
-
-
Save Laurae2/13d6b55848b84fcb16f5b83f8cb3a070 to your computer and use it in GitHub Desktop.
Benchmark XGBoost and LightGBM using the HIGGS dataset in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(xgboost) | |
library(lightgbm) | |
library(data.table) | |
# Directory that holds HIGGS.csv. Defaults to the location used for the
# original benchmark; set the GBM_PERF_DIR environment variable to run the
# script elsewhere. The file is addressed with file.path() so the working
# directory of the R session is left untouched.
data_dir <- Sys.getenv(
  "GBM_PERF_DIR",
  unset = "/home/laurae/Documents/R/GBM-perf"
)

# Thread count handed to LightGBM and XGBoost further down.
n_threads <- 16

# Column V1 is split off as the label vector; every remaining column is
# kept as a feature and converted to a plain matrix for both libraries.
data <- fread(file.path(data_dir, "HIGGS.csv"))
labels <- data$V1
data[, V1 := NULL]
data <- as.matrix(data)
invisible(gc(verbose = FALSE))

# Fixed split: the first n_train rows train the models, the next n_valid
# rows validate them. Fail early with a clear message if the file is too
# short, rather than producing NA labels or an opaque subscript error.
n_train <- 10000000L
n_valid <- 1000000L
stopifnot(
  "HIGGS.csv has fewer rows than the train/validation split needs" =
    nrow(data) >= n_train + n_valid
)
train_rows <- seq_len(n_train)
valid_rows <- (n_train + 1L):(n_train + n_valid)

# drop = FALSE guarantees matrices even for degenerate shapes.
data_train <- data[train_rows, , drop = FALSE]
data_valid <- data[valid_rows, , drop = FALSE]
labels_train <- labels[train_rows]
labels_valid <- labels[valid_rows]
# Area under the ROC curve, computed from the rank sum of the positive
# class (the Mann-Whitney U statistic divided by n_pos * n_neg).
#
# Arguments:
#   preds  - vector of predicted scores, coerced with as.numeric().
#   labels - vector of the same length as `preds`; entries equal to 1 are
#            taken as positives, entries equal to 0 as negatives.
#
# Returns a single numeric value. The result is NaN when either class is
# absent, because the statistic is undefined in that case.
auc <- function(preds, labels) {
  stopifnot(length(preds) == length(labels))

  pos <- as.numeric(preds[labels == 1])
  neg <- as.numeric(preds[labels == 0])

  # Counts are held as doubles so n_pos * n_neg cannot overflow R's
  # 32-bit integers on multi-million-row inputs.
  n_pos <- as.numeric(length(pos))
  n_neg <- as.numeric(length(neg))

  # Positives come first in the pooled vector, so their ranks occupy the
  # first n_pos slots. rank() assigns average ranks to ties by default.
  ranks <- rank(c(pos, neg))

  # seq_len() instead of 1:n_pos: with no positives, 1:0 is c(1, 0) and
  # would pick up the first negative's rank instead of selecting nothing.
  pos_rank_sum <- sum(ranks[seq_len(n_pos)])

  (pos_rank_sum - n_pos * (n_pos + 1) / 2) / (n_pos * n_neg)
}
# ---- LightGBM: fixed 500 rounds, no evaluation metric ----
invisible(gc(verbose = FALSE))

# 0.005 seconds
# NOTE(review): this is probably so small because lgb.Dataset() only creates
# a lazy handle and the actual binning happens on first use - confirm.
system.time({
  dtrain_lgb <- lgb.Dataset(data_train, label = labels_train)
  dvalid_lgb <- lgb.Dataset.create.valid(
    dtrain_lgb,
    data_valid,
    label = labels_valid
  )
})
invisible(gc(verbose = FALSE))

valids_lgb <- list(valid = dvalid_lgb)

# num_threads lives in the parameter list (rather than being forwarded
# through lgb.train()'s `...`) so the script does not depend on `...`
# forwarding, which newer lightgbm releases no longer accept.
params_lgb <- list(
  max_depth = 0,
  num_leaves = 255,
  learning_rate = 0.1,
  min_data_in_leaf = 1,
  min_sum_hessian_in_leaf = 100,
  lambda_l1 = 0,
  lambda_l2 = 0,
  min_gain_to_split = 0,
  max_bin = 255,
  force_row_wise = TRUE,
  boosting = "gbdt",
  objective = "regression",
  metric = "na",
  num_threads = n_threads
)

# Recorded timings (presumably "[CPU seconds] elapsed seconds" - confirm):
# [2646.234] 74.762 seconds (36 threads) / [1245.910] 79.184 seconds (16 threads)
system.time({
  model_lgb <- lgb.train(
    params = params_lgb,
    data = dtrain_lgb,
    nrounds = 500,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))

# [58.849] 1.939 seconds (36 threads) / [61.698] 3.985 seconds (16 threads)
# NOTE(review): newer lightgbm releases expect prediction-time parameters
# via `params = list(...)`; check how the installed version treats the
# `num_threads` argument below.
system.time({
  predict_lgb <- predict(model_lgb, data_valid, num_threads = n_threads)
})
auc(predict_lgb, labels_valid) # 0.8422343
invisible(gc(verbose = FALSE))
# ---- LightGBM: AUC-monitored training with early stopping ----
# Same settings as the fixed-round run except that `metric` is "auc", so
# the validation set can drive early stopping. num_threads is part of the
# parameter list (rather than forwarded through lgb.train()'s `...`) so the
# script does not depend on `...` forwarding, which newer lightgbm releases
# no longer accept.
params_lgb <- list(
  max_depth = 0,
  num_leaves = 255,
  learning_rate = 0.1,
  min_data_in_leaf = 1,
  min_sum_hessian_in_leaf = 100,
  lambda_l1 = 0,
  lambda_l2 = 0,
  min_gain_to_split = 0,
  max_bin = 255,
  force_row_wise = TRUE,
  boosting = "gbdt",
  objective = "regression",
  metric = "auc",
  num_threads = n_threads
)

# nrounds is only an upper bound here; training is expected to end through
# early_stopping_rounds = 10 on the validation AUC.
# Recorded timings (presumably "[CPU seconds] elapsed seconds" - confirm):
# [15073.675] 427.270 seconds (36 threads) / [6979.774] 444.395 seconds (16 threads)
system.time({
  model_lgb <- lgb.train(
    params = params_lgb,
    data = dtrain_lgb,
    nrounds = 1000000,
    valids = valids_lgb,
    early_stopping_rounds = 10,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))

# [327.103] 9.868 seconds (36 threads) / [321.714] 20.912 seconds (16 threads)
# NOTE(review): newer lightgbm releases expect prediction-time parameters
# via `params = list(...)`; check how the installed version treats the
# `num_threads` argument below.
system.time({
  predict_lgb <- predict(model_lgb, data_valid, num_threads = n_threads)
})
# The rows scored here are the same ones used for early stopping above.
auc(predict_lgb, labels_valid) # 0.8525457
invisible(gc(verbose = FALSE))
# ---- XGBoost: fixed 500 rounds, no evaluation metric ----
# 6.109 seconds
system.time({
  dtrain_xgb <- xgb.DMatrix(data_train, label = labels_train)
  dvalid_xgb <- xgb.DMatrix(data_valid, label = labels_valid)
})
invisible(gc(verbose = FALSE))

valids_xgb <- list(test = dvalid_xgb)

# Leaf-wise ("lossguide") histogram trees capped at 255 leaves with
# max_depth = 0. nthread is set in the parameter list, its documented home,
# instead of being forwarded through xgb.train()'s `...`.
params_xgb <- list(
  max_depth = 0,
  max_leaves = 255,
  eta = 0.1,
  min_child_weight = 100,
  alpha = 0,
  lambda = 0,
  gamma = 0,
  max_bin = 255,
  tree_method = "hist",
  grow_policy = "lossguide",
  objective = "reg:squarederror",
  disable_default_eval_metric = 1,
  nthread = n_threads
)

# Recorded timings (presumably "[CPU seconds] elapsed seconds" - confirm):
# [3751.056] 105.180 seconds (36 threads) / [1826.430] 115.885 seconds (16 threads)
system.time({
  model_xgb <- xgb.train(
    params = params_xgb,
    data = dtrain_xgb,
    nrounds = 500,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))

# [40.590] 1.822 seconds (36 threads) / [39.515] 3.041 seconds (16 threads)
system.time({
  predict_xgb <- predict(model_xgb, data_valid)
})
auc(predict_xgb, labels_valid) # 0.8425165
invisible(gc(verbose = FALSE))
# ---- XGBoost: AUC-monitored training with early stopping ----
# Same settings as the fixed-round run except that eval_metric = "auc"
# replaces disable_default_eval_metric, so the watchlist can drive early
# stopping. nthread is set in the parameter list, its documented home,
# instead of being forwarded through xgb.train()'s `...`.
params_xgb <- list(
  max_depth = 0,
  max_leaves = 255,
  eta = 0.1,
  min_child_weight = 100,
  alpha = 0,
  lambda = 0,
  gamma = 0,
  max_bin = 255,
  tree_method = "hist",
  grow_policy = "lossguide",
  objective = "reg:squarederror",
  eval_metric = "auc",
  nthread = n_threads
)

# nrounds is only an upper bound here; training is expected to end through
# early_stopping_rounds = 10 on the watchlist AUC.
# NOTE(review): recent xgboost releases rename `watchlist` to `evals`;
# confirm against the installed version before changing it.
# Recorded timings (presumably "[CPU seconds] elapsed seconds" - confirm):
# [21136.858] 627.479 seconds (36 threads) / [9528.641] 613.953 seconds (16 threads)
system.time({
  model_xgb <- xgb.train(
    params = params_xgb,
    data = dtrain_xgb,
    nrounds = 1000000,
    watchlist = valids_xgb,
    early_stopping_rounds = 10,
    verbose = 2
  )
})
invisible(gc(verbose = FALSE))

# [214.984] 7.067 seconds (36 threads) / [228.591] 15.334 seconds (16 threads)
system.time({
  predict_xgb <- predict(model_xgb, data_valid)
})
# The rows scored here are the same ones used for early stopping above.
auc(predict_xgb, labels_valid) # 0.8525526
invisible(gc(verbose = FALSE))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment