Created
July 24, 2017 16:40
-
-
Save ledell/71e0b8861d4fa35b59dde2af282815a5 to your computer and use it in GitHub Desktop.
Demo of how to use grid search on H2O's XGBoost: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/xgboost.html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the h2o package and start (or connect to) a local H2O cluster.
library(h2o)
h2o.init()

# Load the HIGGS dataset (10k training rows, 5k test rows) from S3
train <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")

# Response column and predictor columns (everything except the response)
y <- "response"
x <- setdiff(names(train), y)
family <- "binomial"  # NOTE(review): unused below — h2o.grid() infers the task from the factor response; kept for callers outside this view

# For binary classification, the response must be a factor
train[, y] <- as.factor(train[, y])
test[, y] <- as.factor(test[, y])
# Some XGBoost/GBM hyperparameter ranges to sample from.
# These are plain base-R lists; h2o.grid() samples combinations from them.
hyper_params <- list(ntrees = seq(10, 1000, 1),
                     learn_rate = seq(0.0001, 0.2, 0.0001),
                     max_depth = seq(1, 20, 1),
                     sample_rate = seq(0.5, 1.0, 0.0001),
                     col_sample_rate = seq(0.2, 1.0, 0.0001))

# Random discrete search: sample at most 10 models, reproducibly (seed = 1)
search_criteria <- list(strategy = "RandomDiscrete",
                        max_models = 10,
                        seed = 1)
# Train the grid: each sampled hyperparameter combination is evaluated
# with 5-fold cross-validation.
xgb_grid <- h2o.grid(algorithm = "xgboost",
                     x = x, y = y,
                     training_frame = train,
                     nfolds = 5,
                     seed = 1,
                     hyper_params = hyper_params,
                     search_criteria = search_criteria)

# Sort the grid by cross-validated AUC, best model first
grid <- h2o.getGrid(grid_id = xgb_grid@grid_id, sort_by = "AUC", decreasing = TRUE)

# Model id of the top-ranked model
grid_top_model <- grid@summary_table[1, "model_ids"]
Hi Steviey, xgboost from H2o is not available for windows as of now. check the limitations section below
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/xgboost.html
I am in the same boat as you. I did two things: one is Ubuntu via VirtualBox, and the other is an Ubuntu machine on AWS. XGBoost works there. Hope this helps.
Thanks rknimmakayala,
that's a little bit too much for me. I do it natively in R via caret grid search. Works like a charm.
# Fix the RNG state so caret's resampling is reproducible.
set.seed(42)

# xgboost via caret: train on as.matrix + grid search
# https://www.r-bloggers.com/r-setup-a-grid-search-for-xgboost/
# NOTE(review): bootControl is not referenced by the train() call in this
# snippet (which uses xgb_trcontrol_1); presumably used elsewhere — confirm.
bootControl <- trainControl(number = 4, verboseIter = TRUE)
# Set up the hyper-parameter search grid:
# 3 learning rates x 7 tree depths = 21 candidate models
# (the remaining parameters are held fixed).
xgb_grid_1 <- expand.grid(
  eta = c(0.01, 0.001, 0.0001),
  max_depth = c(2, 4, 6, 8, 10, 15, 45),
  gamma = 1,
  nrounds = 1400,
  colsample_bytree = 1,  # default = 1
  min_child_weight = 1,  # default = 1
  subsample = 1
)
# Pack the training control parameters: 15-fold CV repeated 5 times.
xgb_trcontrol_1 <- trainControl(
  method = "repeatedcv",             # was "repeatedCV"; caret method names are lowercase
  number = 15,
  repeats = 5,
  verboseIter = TRUE,
  returnData = FALSE,
  returnResamp = "all",              # save losses across all models
  classProbs = TRUE,                 # required: twoClassSummary needs class probabilities
  summaryFunction = twoClassSummary, # reports ROC / Sens / Spec for 2-class problems
  allowParallel = TRUE,
  savePredictions = "final"          # full argument name (was partial match "savePred")
)
# Train one model per parameter combination in the grid, using the
# repeated-CV control above, and keep the best by `metric`.
# NOTE(review): `f` (model formula), `training` (data.frame), and `metric`
# (should be "ROC" when twoClassSummary is used) are defined outside this
# snippet — confirm against the surrounding script.
modFit <- train(
  f,
  data = training,
  trControl = xgb_trcontrol_1,
  tuneGrid = xgb_grid_1,
  method = "xgbTree",
  metric = metric
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello,
it does not work under Win7 on a 12-core machine. Tried H2O 3.16 and 3.17. Any hint available?