ledell · March 3, 2022 03:02
diff --git a/kaggledays-sf_h2o_automl_6000.R b/kaggledays-sf_h2o_automl_6000.R
 ### Kaggle Days SF: Hackathon submission (8th place) 

 # I used H2O 3.24.0.1, which was the latest version at the time
 # Latest stable always here: http://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
 # H2O 3.24.0.1: http://h2o-release.s3.amazonaws.com/h2o/rel-yates/1/index.html
 # If you are a Python user, you can use the demo Python code available on the H2O AutoML User Guide 
 # instead: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

 # Unfortunately it was a private competition, so the data is not publicly available! 

 library(h2o)

 # Initialise H2O with a 40G memory cap
 h2o.init(max_mem_size = "40G")

 # Load the training set into H2O
 train_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/input/train.csv"
 train <- h2o.importFile(path = train_file)

 # The response is "target"; every remaining column except "id" is a predictor
 y <- "target"
 non_predictors <- c(y, "id")
 x <- setdiff(names(train), non_predictors)

 # Binary classification needs a factor response, so convert that column
 train[, y] <- as.factor(train[, y])

 # Run H2O AutoML with a time budget of 6000 seconds (100 mins).
 # The original run produced 43 models + 2 ensembles; to get that same
 # result, set max_models = 43, max_runtime_secs = 9999999 (unlimited).
 # With a fixed time budget, a machine with more/fewer cores than the
 # author's may produce more/fewer models in the same amount of time.
 runtime_budget_secs <- 6000
 aml <- h2o.automl(
   x = x,
   y = y,
   training_frame = train,
   max_runtime_secs = runtime_budget_secs,
   seed = 1
 )

 # Once training has finished, show the leaderboard
 leaderboard <- aml@leaderboard
 print(leaderboard)

 # Load the test set, set the "id" column aside for the submission file,
 # then keep only the predictor columns
 test_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/input/test.csv"
 test <- h2o.importFile(path = test_file)
 test_id <- test[, "id"]
 test <- test[, x]

 # Score the test set with the AutoML object
 pred <- predict(aml, newdata = test)

 # Build the submission (ids next to the "p1" prediction column) and export it
 sub_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/submissions/h2o_automl_6000.csv"
 submission <- h2o.cbind(test_id, pred[, "p1"])
 names(submission) <- c("id", "target")
 h2o.exportFile(submission, path = sub_file)


 # Here is a snapshot (head) of the leaderboard:

 # > aml@leaderboard
 # model_id       auc    logloss
 # 1 StackedEnsemble_BestOfFamily_AutoML_20190411_152415 0.6476812 0.03137691
 # 2    StackedEnsemble_AllModels_AutoML_20190411_152415 0.6447939 0.03134611
 # 3       XGBoost_grid_1_AutoML_20190411_152415_model_5 0.6439845 0.03133052
 # 4                    XGBoost_1_AutoML_20190411_152415 0.6430735 0.03134268
 # 5          GBM_grid_1_AutoML_20190411_152415_model_10 0.6428834 0.03140852
 # 6       XGBoost_grid_1_AutoML_20190411_152415_model_1 0.6413726 0.03128838
 # mean_per_class_error       rmse         mse
 # 1            0.4699024 0.07126896 0.005079264
 # 2            0.4801189 0.07128310 0.005081280
 # 3            0.4835419 0.07120733 0.005070484
 # 4            0.4780277 0.07121024 0.005070898
 # 5            0.4859112 0.07120488 0.005070135
 # 6            0.4867847 0.07120804 0.005070586
 #
 # [45 rows x 6 columns]
	### Kaggle Days SF: Hackathon submission (8th place)

	# I used H2O 3.24.0.1, which was the latest version at the time
	# Latest stable always here: http://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
	# H2O 3.24.0.1: http://h2o-release.s3.amazonaws.com/h2o/rel-yates/1/index.html
	# If you are a Python user, you can use the demo Python code available on the H2O AutoML User Guide
	# instead: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/automl.html

	# Unfortunately it was a private competition, so the data is not publicly available!

	library(h2o)

	# Initialise H2O with a 40G memory cap
	h2o.init(max_mem_size = "40G")

	# Load the training set into H2O
	train_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/input/train.csv"
	train <- h2o.importFile(path = train_file)

	# The response is "target"; every remaining column except "id" is a predictor
	y <- "target"
	non_predictors <- c(y, "id")
	x <- setdiff(names(train), non_predictors)

	# Binary classification needs a factor response, so convert that column
	train[, y] <- as.factor(train[, y])

	# Run H2O AutoML with a time budget of 6000 seconds (100 mins).
	# The original run produced 43 models + 2 ensembles; to get that same
	# result, set max_models = 43, max_runtime_secs = 9999999 (unlimited).
	# With a fixed time budget, a machine with more/fewer cores than the
	# author's may produce more/fewer models in the same amount of time.
	runtime_budget_secs <- 6000
	aml <- h2o.automl(
	  x = x,
	  y = y,
	  training_frame = train,
	  max_runtime_secs = runtime_budget_secs,
	  seed = 1
	)

	# Once training has finished, show the leaderboard
	leaderboard <- aml@leaderboard
	print(leaderboard)

	# Load the test set, set the "id" column aside for the submission file,
	# then keep only the predictor columns
	test_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/input/test.csv"
	test <- h2o.importFile(path = test_file)
	test_id <- test[, "id"]
	test <- test[, x]

	# Score the test set with the AutoML object
	pred <- predict(aml, newdata = test)

	# Build the submission (ids next to the "p1" prediction column) and export it
	sub_file <- "/home/ledell/code/kaggle/kaggledays-sf-hackathon/submissions/h2o_automl_6000.csv"
	submission <- h2o.cbind(test_id, pred[, "p1"])
	names(submission) <- c("id", "target")
	h2o.exportFile(submission, path = sub_file)


	# Here is a snapshot (head) of the leaderboard:

	# > aml@leaderboard
	# model_id auc logloss
	# 1 StackedEnsemble_BestOfFamily_AutoML_20190411_152415 0.6476812 0.03137691
	# 2 StackedEnsemble_AllModels_AutoML_20190411_152415 0.6447939 0.03134611
	# 3 XGBoost_grid_1_AutoML_20190411_152415_model_5 0.6439845 0.03133052
	# 4 XGBoost_1_AutoML_20190411_152415 0.6430735 0.03134268
	# 5 GBM_grid_1_AutoML_20190411_152415_model_10 0.6428834 0.03140852
	# 6 XGBoost_grid_1_AutoML_20190411_152415_model_1 0.6413726 0.03128838
	# mean_per_class_error rmse mse
	# 1 0.4699024 0.07126896 0.005079264
	# 2 0.4801189 0.07128310 0.005081280
	# 3 0.4835419 0.07120733 0.005070484
	# 4 0.4780277 0.07121024 0.005070898
	# 5 0.4859112 0.07120488 0.005070135
	# 6 0.4867847 0.07120804 0.005070586
	#
	# [45 rows x 6 columns]