Last active
September 25, 2020 20:38
-
-
Save Laurae2/7195cebe65887907a06e9118a3ec7f96 to your computer and use it in GitHub Desktop.
Install LightGBM with CUDA on R (walk on thin ice)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Walk on thin ice in R for gcc-7/g++-7 + CUDA 10.0.130 | |
Note: OpenCL can run on Single Precision (gpu_use_dp = FALSE) or Double Precision (gpu_use_dp = TRUE) whereas CUDA is strictly Double Precision in LightGBM. | |
-- | |
STEP 1: HACK IN SOME FILES | |
Hack in the following: | |
``` | |
cmake_args <- c(cmake_args, "-DUSE_CUDA=ON", "-DCUDAToolkit_ROOT=/usr/lib/cuda", "-DCMAKE_C_COMPILER=/usr/bin/gcc-7", "-DCMAKE_CXX_COMPILER=/usr/bin/g++-7", "-DCMAKE_CUDA_COMPILER=/usr/lib/cuda/bin/nvcc", "-DCUDA_TOOLKIT_ROOT_DIR=/usr/lib/cuda") | |
``` | |
In this line: https://github.com/microsoft/LightGBM/blob/571cad7e5512ab6dafc410ac490a0e34f0a98113/R-package/src/install.libs.R#L171 | |
-- | |
STEP 2: HACK IN THE COMPILATION | |
I use the root user to install R packages for everyone (no one can uninstall them except root): | |
``` | |
CUDAHOSTC=/usr/bin/gcc-7 CUDAHOSTCXX=/usr/bin/g++-7 sudo -E Rscript build_r.R | |
``` | |
-- | |
STEP 3: TRY THE INSTALLATION | |
Run the following: | |
``` | |
# Minimal smoke test: train LightGBM on the bundled agaricus (mushroom)
# data with device = "cuda" to confirm the CUDA build actually works.
library(lightgbm)

# Training data: sparse feature matrix + 0/1 labels shipped with the package.
data(agaricus.train, package = "lightgbm")
train <- agaricus.train
dtrain <- lgb.Dataset(train$data, label = train$label)

# Validation data, built from the training Dataset so feature binning matches.
data(agaricus.test, package = "lightgbm")
test <- agaricus.test
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)

# device = "cuda" selects the CUDA trainer (double precision only in LightGBM).
params <- list(objective = "regression", metric = "l2", device = "cuda")
valids <- list(test = dtest)

# Named arguments for readability; min_data = 1 with learning_rate = 1
# deliberately overfits so the run finishes (and early-stops) immediately.
model <- lgb.train(
  params = params,
  data = dtrain,
  nrounds = 100L,
  valids = valids,
  min_data = 1L,
  learning_rate = 1,
  early_stopping_rounds = 10L
)
``` | |
You will get something like this, which is good: | |
``` | |
[LightGBM] [Warning] CUDA currently requires double precision calculations. | |
[LightGBM] [Warning] CUDA currently requires double precision calculations. | |
[LightGBM] [Warning] CUDA currently requires double precision calculations. | |
[LightGBM] [Info] LightGBM using CUDA trainer with DP float!! | |
[LightGBM] [Info] Total Bins 232 | |
[LightGBM] [Info] Number of data points in the train set: 6513, number of used features: 116 | |
[LightGBM] [Info] Start training from score 0.482113 | |
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf | |
[1]: test's l2:6.44165e-17 | |
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf | |
[2]: test's l2:1.97215e-31 | |
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf | |
[3]: test's l2:0 | |
``` | |
However, it sometimes complains about CUDA double precision during CPU training... I still have to find out why (maybe the GPU dataset needs to be destroyed first). | |
Some stuff tested: | |
* Interrupting training on GPU: pass | |
* Multi GPU training: pass | |
* Switching back and forth CPU and GPU training in an interactive session: pass | |
* Switching back and forth GPU and multi GPU training in an interactive session: pass | |
* Releasing GPU memory: FAIL | |
-- | |
STEP 4: DO SOME BENCHMARKS | |
Using HIGGS (10m) and 255 leaves... | |
Hardware setup: | |
- Dual Xeon 6154 (2x 18C/36T @3.7 GHz fixed non AVX-256/512 turbo, 205W) | |
- 768GB RAM DDR4 @2666 MHz | |
- Quad Quadro P1000 (4x 640 CUDA cores @1544 MHz, 4GB RAM @2504 MHz, 47W, 3@16x PCI-e) | |
Benchmark results on 1 run: | |
36T Dual Xeon 6154 CPU: 85.054s / 0.8422343 AUC | |
1x Quadro P1000 GPU: 253.607s / 0.8422343 AUC / 435 MB GPU (457 MB GPU total) | |
2x Quadro P1000 GPU: 175.719s / 0.8422343 AUC / 301 MB GPU (323 MB GPU total) | |
3x Quadro P1000 GPU: 150.415s / 0.8422343 AUC / 253 MB GPU (275 MB GPU total) | |
4x Quadro P1000 GPU: 143.037s / 0.8422343 AUC / 235 MB GPU (257 MB GPU total) | |
OpenCL Quadro P1000 sp: 105.843s / 0.8426538 AUC / 479 MB GPU (501 MB GPU total) | |
OpenCL Quadro P1000 dp: 161.937s / 0.8422342 AUC / 487 MB GPU (509 MB GPU total) | |
(more CPU threads during GPU training should make metric/loss compute faster) | |
(around 50% GPU usage, whatever the number of GPU) | |
Here is the code: | |
``` | |
library(lightgbm)
library(data.table)

# HIGGS: 11M rows; the first column (V1) is the 0/1 label.
# fread() also forces data.table to spawn all requested threads up front.
data <- fread("/home/laurae/Documents/R/GBM-perf/HIGGS.csv")
labels <- data$V1
data[, V1 := NULL]  # drop the label column in place before matrix conversion
data <- as.matrix(data)

# First 10M rows for training, the next 1M rows for validation.
data_train <- data[1:10000000, ]
data_valid <- data[10000001:11000000, ]
labels_train <- labels[1:10000000]
labels_valid <- labels[10000001:11000000]

# Validation Dataset is derived from the training one so binning matches.
dtrain_lgb <- lgb.Dataset(data_train, label = labels_train)
dvalid_lgb <- lgb.Dataset.create.valid(dtrain_lgb, data_valid, label = labels_valid)
valids_lgb <- list(valid = dvalid_lgb)
#' Area under the ROC curve via the rank-sum (Mann-Whitney U) identity:
#' AUC = (sum of positive-class ranks - n1 * (n1 + 1) / 2) / (n1 * n2).
#'
#' @param preds Numeric vector of predicted scores.
#' @param labels Vector of 0/1 ground-truth labels, same length as `preds`.
#' @return AUC as a single numeric in [0, 1]; tied scores get average rank.
auc <- function(preds, labels) {
  pos <- as.numeric(preds[labels == 1])
  neg <- as.numeric(preds[labels == 0])
  # Cast counts to double: n1 * n2 overflows integer arithmetic for large n.
  n1 <- as.numeric(length(pos))
  n2 <- as.numeric(length(neg))
  r <- rank(c(pos, neg))  # average ranks handle ties
  (sum(r[seq_len(n1)]) - n1 * (n1 + 1) / 2) / (n1 * n2)
}
# --- Benchmark: single-GPU (CUDA) training, 500 rounds, 255 leaves ---
speed <- system.time({
  model <- lgb.train(
    params = list(max_depth = 0,            # 0 = no depth limit
                  num_leaves = 255,
                  learning_rate = 0.1,
                  min_data_in_leaf = 1,
                  min_sum_hessian_in_leaf = 100,
                  lambda_l1 = 0,
                  lambda_l2 = 0,
                  min_gain_to_split = 0,
                  max_bin = 255,
                  force_row_wise = TRUE,
                  boosting = "gbdt",
                  objective = "regression",
                  metric = "auc",
                  device = "cuda"),
    data = dtrain_lgb,
    valids = valids_lgb,
    nrounds = 500,
    verbose = 1,
    num_thread = 36
  )
})
model_predict <- predict(model, data_valid)
auc(model_predict, labels_valid)  # AUC on the held-out 1M rows
rm(model)
invisible(gc(verbose = FALSE))    # NOTE(review): gc() did not reliably release GPU memory
speed                             # elapsed training time
# --- Benchmark: 4-GPU (CUDA) training, same parameters plus num_gpu = 4 ---
speed <- system.time({
  model <- lgb.train(
    params = list(max_depth = 0,            # 0 = no depth limit
                  num_leaves = 255,
                  learning_rate = 0.1,
                  min_data_in_leaf = 1,
                  min_sum_hessian_in_leaf = 100,
                  lambda_l1 = 0,
                  lambda_l2 = 0,
                  min_gain_to_split = 0,
                  max_bin = 255,
                  force_row_wise = TRUE,
                  boosting = "gbdt",
                  objective = "regression",
                  metric = "auc",
                  device = "cuda",
                  num_gpu = 4),             # spread training across 4 GPUs
    data = dtrain_lgb,
    valids = valids_lgb,
    nrounds = 500,
    verbose = 1,
    num_thread = 36
  )
})
model_predict <- predict(model, data_valid)
auc(model_predict, labels_valid)  # AUC on the held-out 1M rows
rm(model)
invisible(gc(verbose = FALSE))    # NOTE(review): gc() did not reliably release GPU memory
speed                             # elapsed training time
# --- Benchmark: CPU-only training, 36 threads, same parameters ---
speed <- system.time({
  model <- lgb.train(
    params = list(max_depth = 0,            # 0 = no depth limit
                  num_leaves = 255,
                  learning_rate = 0.1,
                  min_data_in_leaf = 1,
                  min_sum_hessian_in_leaf = 100,
                  lambda_l1 = 0,
                  lambda_l2 = 0,
                  min_gain_to_split = 0,
                  max_bin = 255,
                  force_row_wise = TRUE,
                  boosting = "gbdt",
                  objective = "regression",
                  metric = "auc",
                  device = "cpu"),          # baseline: no GPU
    data = dtrain_lgb,
    valids = valids_lgb,
    nrounds = 500,
    verbose = 1,
    num_thread = 36
  )
})
model_predict <- predict(model, data_valid)
auc(model_predict, labels_valid)  # AUC on the held-out 1M rows
rm(model)
invisible(gc(verbose = FALSE))
speed                             # elapsed training time
``` | |
On Airline dataset + OHE, see for ref: https://github.com/szilard/GBM-perf/issues/12 | |
Benchmark results on 1 run: | |
18T Dual Xeon 6154 CPU: 15.872s / 0.7745457 AUC | |
1x Quadro P1000 GPU: 43.767s / 0.7736450 AUC | |
2x Quadro P1000 GPU: 32.291s / 0.7736450 AUC / 215 MB GPU (237 MB GPU total) | |
3x Quadro P1000 GPU: 29.732s / 0.7736450 AUC / 219-229 MB GPU (197-207 MB GPU total) | |
4x Quadro P1000 GPU: 29.515s / 0.7736450 AUC / 209-219 MB GPU (187-197 MB GPU total) | |
OpenCL Quadro P1000 sp: 25.810s / 0.7760418 AUC / 329 MB GPU (351 MB GPU total) | |
OpenCL Quadro P1000 dp: 40.080s / 0.7747921 AUC / 337 MB GPU (359 MB GPU total) | |
Copy-pasta spaghetti code and edit yourself: | |
``` | |
# --- Airline dataset (10M) + one-hot encoding via sparse.model.matrix ---
library(data.table)
library(lightgbm)
library(Matrix)
library(ROCR)

set.seed(123)
d_train <- fread("/home/laurae/Documents/R/GBM-perf/train-10m.csv", showProgress = FALSE)
d_test <- fread("/home/laurae/Documents/R/GBM-perf/test.csv", showProgress = FALSE)

# One-hot encode train + test together so both share identical columns.
X_train_test <- sparse.model.matrix(dep_delayed_15min ~ . - 1, data = rbind(d_train, d_test))
n1 <- nrow(d_train)
n2 <- nrow(d_test)
X_train <- X_train_test[1:n1, ]
X_test <- X_train_test[(n1 + 1):(n1 + n2), ]
labels <- as.numeric(d_train$dep_delayed_15min == "Y")

dlgb_train <- lgb.Dataset(data = X_train, label = labels, nthread = 18, device = "cuda")
# Print Dataset construction time, then training time, then test AUC.
cat(system.time({lgb.Dataset.construct(dlgb_train)})[[3]], " ", sep = "")
cat(system.time({
  md <- lgb.train(data = dlgb_train,
                  objective = "binary",
                  nrounds = 100, num_leaves = 512, learning_rate = 0.1,
                  device = "cuda", num_gpu = 1,
                  nthread = 18,
                  verbose = 0)
})[[3]], " ", sep = "")
phat <- predict(md, data = X_test)
rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
cat(performance(rocr_pred, "auc")@y.values[[1]], "\n")
invisible(gc(verbose = FALSE))
rm(md, dlgb_train, phat, rocr_pred)
gc(verbose = FALSE)
``` | |
Airline dataset + Categoricals: | |
Benchmark results on 1 run: | |
18T Dual Xeon 6154 CPU: 18.281s / 0.7922730 AUC | |
1x Quadro P1000 GPU: 53.890s / 0.7922730 AUC / 245 MB GPU (267 MB GPU total) | |
2x Quadro P1000 GPU: 39.789s / 0.7922730 AUC / 207 MB GPU (229 MB GPU total) | |
3x Quadro P1000 GPU: 38.705s / 0.7922730 AUC / 197 MB GPU (219 MB GPU total) | |
4x Quadro P1000 GPU: 36.903s / 0.7922730 AUC / 187 MB GPU (209 MB GPU total) | |
OpenCL Quadro P1000 sp: 23.896s / 0.7924575 AUC / 329 MB GPU (351 MB GPU total) | |
OpenCL Quadro P1000 dp: 35.693s / 0.7920217 AUC / 337 MB GPU (359 MB GPU total) | |
(around 50% GPU usage, rising to around 75% when using only 1 GPU) | |
Copy-pasta spaghetti code and edit yourself: | |
``` | |
# --- Airline dataset (10M) using native categorical features ---
library(data.table)
library(lightgbm)
library(Matrix)
library(ROCR)

set.seed(123)
d_train <- fread("/home/laurae/Documents/R/GBM-perf/train-10m.csv", showProgress = FALSE)
d_test <- fread("/home/laurae/Documents/R/GBM-perf/test.csv", showProgress = FALSE)

# Encode train + test together so categorical integer codes are consistent.
d_all <- rbind(d_train, d_test)
d_all$dep_delayed_15min <- ifelse(d_all$dep_delayed_15min == "Y", 1, 0)
d_all_wrules <- lgb.convert_with_rules(d_all)  # strings -> integer codes + rules
d_all <- d_all_wrules$data
cols_cats <- names(d_all_wrules$rules)         # columns to treat as categorical

# Split back into train/test by row position.
d_train <- d_all[1:nrow(d_train)]
d_test <- d_all[(nrow(d_train) + 1):(nrow(d_train) + nrow(d_test))]
p <- ncol(d_all) - 1                           # label is the last column

dlgb_train <- lgb.Dataset(data = as.matrix(d_train[, 1:p]), label = d_train$dep_delayed_15min, device = "cuda")
# Print training time, then test AUC.
cat(system.time({
  md <- lgb.train(data = dlgb_train,
                  objective = "binary",
                  nrounds = 100, num_leaves = 512, learning_rate = 0.1,
                  categorical_feature = cols_cats,
                  device = "cuda", num_gpu = 1,
                  nthread = 18,
                  verbose = 0)
})[[3]], " ", sep = "")
phat <- predict(md, data = as.matrix(d_test[, 1:p]))
rocr_pred <- prediction(phat, d_test$dep_delayed_15min)
cat(performance(rocr_pred, "auc")@y.values[[1]], "\n")
rm(md, dlgb_train, phat, rocr_pred)
gc(verbose = FALSE)
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment