Last active
March 3, 2022 03:02
-
-
Save ledell/4d4cd24b6a993a47069c511ba86b05bd to your computer and use it in GitHub Desktop.
KaggleDays SF: H2O AutoML solution
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Kaggle Days SF: Hackathon submission (8th place)
# I used the latest version of H2O (3.24.0.1)
# Latest stable always here: http://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# H2O 3.24.0.1: http://h2o-release.s3.amazonaws.com/h2o/rel-yates/1/index.html
# If you are a Python user, you can use the demo Python code available on the H2O AutoML User Guide
# instead: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html
# Unfortunately it was a private competition, so the data is not publicly available!
# Load the H2O R client and initialise the H2O cluster connection,
# allowing the cluster up to 40 GB of memory.
library(h2o)
h2o.init(max_mem_size = "40G")
# Import training data
train_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/input/train.csv"
train <- h2o.importFile(train_file)

# Identify predictors and response: every column other than the response
# and the "id" column is used as a predictor
y <- "target"
x <- setdiff(names(train), c(y, "id"))

# For binary classification, response should be a factor
train[, y] <- as.factor(train[, y])
# Train H2O AutoML for 6000 seconds (100 mins)
# If you want the same result that I had (43 models + 2 ensembles),
# set max_models = 43, max_runtime_secs = 9999999 (unlimited).
# Since you may have more/fewer cores than I do on my machine,
# running for 100 mins on your hardware may produce more/fewer
# models in the same amount of time.
aml <- h2o.automl(
  x = x,
  y = y,
  training_frame = train,
  seed = 1,
  max_runtime_secs = 6000
)

# When it finishes, check out the leaderboard
print(aml@leaderboard)
# Import test data
test_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/input/test.csv"
test <- h2o.importFile(test_file)
test_id <- test[, "id"]  # save for submission file
test <- test[, x]        # keep only the predictor columns used in training

# Predict on the test set
pred <- predict(aml, test)
# Create submission file: the saved test ids alongside the "p1" prediction
# column (presumably the predicted probability of the positive class --
# confirm against the H2O prediction frame), exported as "id" and "target"
submission <- h2o.cbind(test_id, pred[, "p1"])
names(submission) <- c("id", "target")
sub_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/submissions/h2o_automl_6000.csv"
h2o.exportFile(submission, path = sub_file)
# Here is a snapshot (head) of the leaderboard:
# > aml@leaderboard
#                                              model_id       auc    logloss
# 1 StackedEnsemble_BestOfFamily_AutoML_20190411_152415 0.6476812 0.03137691
# 2    StackedEnsemble_AllModels_AutoML_20190411_152415 0.6447939 0.03134611
# 3       XGBoost_grid_1_AutoML_20190411_152415_model_5 0.6439845 0.03133052
# 4                    XGBoost_1_AutoML_20190411_152415 0.6430735 0.03134268
# 5          GBM_grid_1_AutoML_20190411_152415_model_10 0.6428834 0.03140852
# 6       XGBoost_grid_1_AutoML_20190411_152415_model_1 0.6413726 0.03128838
#   mean_per_class_error       rmse         mse
# 1            0.4699024 0.07126896 0.005079264
# 2            0.4801189 0.07128310 0.005081280
# 3            0.4835419 0.07120733 0.005070484
# 4            0.4780277 0.07121024 0.005070898
# 5            0.4859112 0.07120488 0.005070135
# 6            0.4867847 0.07120804 0.005070586
#
# [45 rows x 6 columns]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment