-
-
Save Libardo1/770b8baaa3b7bd006fad3d54d45a14ad to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Lending Club example using cleaned up dataset & h2o.ensemble ### | |
library(h2o) | |
h2o.init(nthreads = -1, max_mem_size = "8G") | |
loan_csv <- "https://raw.githubusercontent.com/h2oai/app-consumer-loan/master/data/loan.csv" | |
data <- h2o.importFile(loan_csv) # 163994 x 15 | |
data$bad_loan <- as.factor(data$bad_loan) | |
rand <- h2o.runif(data, seed = 1) | |
train <- data[rand$rnd <= 0.8, ] | |
valid <- data[rand$rnd > 0.8, ] | |
y <- "bad_loan" | |
x <- setdiff(names(data), c(y, "int_rate")) | |
library(h2oEnsemble) | |
# Specify the base learner library & the metalearner | |
learner <- c("h2o.glm.wrapper", "h2o.randomForest.wrapper", | |
"h2o.gbm.wrapper", "h2o.deeplearning.wrapper") | |
metalearner <- "h2o.deeplearning.wrapper" | |
family <- "binomial" | |
# Train the ensemble using 5-fold CV to generate level-one data | |
# More CV folds will take longer to train, but should increase performance | |
fit <- h2o.ensemble(x = x, y = y, | |
training_frame = train, | |
validation_frame = NULL, | |
family = family, | |
learner = learner, | |
metalearner = metalearner, | |
cvControl = list(V = 5, shuffle = TRUE)) | |
# Generate predictions on the test set | |
pred <- predict.h2o.ensemble(fit, valid) | |
predictions <- as.data.frame(pred$pred)[,3] #third column, p1 is P(Y==1) | |
labels <- as.data.frame(valid[,c(y)])[,1] | |
# Ensemble test AUC | |
cvAUC::AUC(predictions = predictions , labels = labels) | |
# 0.6802715 | |
# Base learner test AUC (for comparison) | |
learner <- names(fit$basefits) | |
L <- length(learner) | |
auc <- sapply(seq(L), function(l) cvAUC::AUC(predictions = as.data.frame(pred$basepred)[,l], labels = labels)) | |
data.frame(learner, auc) | |
#learner auc | |
#1 h2o.glm.wrapper 0.6721662 | |
#2 h2o.randomForest.wrapper 0.6673966 | |
#3 h2o.gbm.wrapper 0.6737319 | |
#4 h2o.deeplearning.wrapper 0.6696115 | |
# Now let's try again with a more extensive set of base learners | |
# Here is an example of how to generate a custom learner wrappers: | |
h2o.glm.1 <- function(..., alpha = 0.0) h2o.glm.wrapper(..., alpha = alpha) | |
h2o.glm.2 <- function(..., alpha = 0.5) h2o.glm.wrapper(..., alpha = alpha) | |
h2o.glm.3 <- function(..., alpha = 1.0) h2o.glm.wrapper(..., alpha = alpha) | |
h2o.randomForest.1 <- function(..., ntrees = 200, nbins = 50, seed = 1) h2o.randomForest.wrapper(..., ntrees = ntrees, nbins = nbins, seed = seed) | |
h2o.randomForest.2 <- function(..., ntrees = 200, sample_rate = 0.75, seed = 1) h2o.randomForest.wrapper(..., ntrees = ntrees, sample_rate = sample_rate, seed = seed) | |
h2o.randomForest.3 <- function(..., ntrees = 200, sample_rate = 0.85, seed = 1) h2o.randomForest.wrapper(..., ntrees = ntrees, sample_rate = sample_rate, seed = seed) | |
h2o.gbm.1 <- function(..., ntrees = 100, nbins = 100, seed = 1) h2o.gbm.wrapper(..., ntrees = ntrees, nbins = nbins, seed = seed) | |
h2o.gbm.2 <- function(..., ntrees = 200, nbins = 50, seed = 1) h2o.gbm.wrapper(..., ntrees = ntrees, nbins = nbins, seed = seed) | |
h2o.gbm.3 <- function(..., ntrees = 100, max_depth = 10, seed = 1) h2o.gbm.wrapper(..., ntrees = ntrees, max_depth = max_depth, seed = seed) | |
h2o.gbm.4 <- function(..., ntrees = 100, col_sample_rate = 0.8, seed = 1) h2o.gbm.wrapper(..., ntrees = ntrees, col_sample_rate = col_sample_rate, seed = seed) | |
h2o.gbm.5 <- function(..., ntrees = 200, col_sample_rate = 0.8, seed = 1) h2o.gbm.wrapper(..., ntrees = ntrees, col_sample_rate = col_sample_rate, seed = seed) | |
h2o.gbm.6 <- function(..., ntrees = 200, col_sample_rate = 0.7, seed = 1) h2o.gbm.wrapper(..., ntrees = ntrees, col_sample_rate = col_sample_rate, seed = seed) | |
h2o.deeplearning.1 <- function(..., hidden = c(500,500), activation = "Rectifier", seed = 1) h2o.deeplearning.wrapper(..., hidden = hidden, activation = activation, seed = seed) | |
h2o.deeplearning.2 <- function(..., hidden = c(200,200,200), activation = "Tanh", seed = 1) h2o.deeplearning.wrapper(..., hidden = hidden, activation = activation, seed = seed) | |
h2o.deeplearning.3 <- function(..., hidden = c(500,500), activation = "RectifierWithDropout", seed = 1) h2o.deeplearning.wrapper(..., hidden = hidden, activation = activation, seed = seed) | |
learner <- c("h2o.glm.1", "h2o.glm.2", "h2o.glm.3", | |
"h2o.randomForest.1", "h2o.randomForest.2", "h2o.randomForest.3", | |
"h2o.gbm.1", "h2o.gbm.2", "h2o.gbm.3", "h2o.gbm.4", "h2o.gbm.5", "h2o.gbm.6", | |
"h2o.deeplearning.1", "h2o.deeplearning.2", "h2o.deeplearning.3") | |
metalearner <- "h2o.deeplearning.wrapper" | |
family <- "binomial" | |
# Train the ensemble using 5-fold CV to generate level-one data | |
# More CV folds will take longer to train, but should increase performance | |
fit <- h2o.ensemble(x = x, y = y, | |
training_frame = train, | |
validation_frame = NULL, | |
family = family, | |
learner = learner, | |
metalearner = metalearner, | |
cvControl = list(V = 5, shuffle = TRUE)) | |
# Generate predictions on the test set | |
pred <- predict.h2o.ensemble(fit, valid) | |
predictions <- as.data.frame(pred$pred)[,3] #third column, p1 is P(Y==1) | |
labels <- as.data.frame(valid[,c(y)])[,1] | |
# Ensemble test AUC | |
cvAUC::AUC(predictions = predictions , labels = labels) | |
# 0.6832164 | |
# Base learner test AUC (for comparison) | |
L <- length(learner) | |
auc <- sapply(seq(L), function(l) cvAUC::AUC(predictions = as.data.frame(pred$basepred)[,l], labels = labels)) | |
data.frame(learner, auc) | |
# learner auc | |
# 1 h2o.glm.1 0.6742993 | |
# 2 h2o.glm.2 0.6743306 | |
# 3 h2o.glm.3 0.6743672 | |
# 4 h2o.randomForest.1 0.6702020 | |
# 5 h2o.randomForest.2 0.6698333 | |
# 6 h2o.randomForest.3 0.6698533 | |
# 7 h2o.gbm.1 0.6801405 | |
# 8 h2o.gbm.2 0.6785209 | |
# 9 h2o.gbm.3 0.6699785 | |
# 10 h2o.gbm.4 0.6805663 | |
# 11 h2o.gbm.5 0.6790248 | |
# 12 h2o.gbm.6 0.6790248 | |
# 13 h2o.deeplearning.1 0.6772474 | |
# 14 h2o.deeplearning.2 0.6696069 | |
# 15 h2o.deeplearning.3 0.6802065 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment