Created
October 23, 2019 01:36
-
-
Save kperry2215/b5a704564182b7562de36795f0593350 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def run_h2o_automl(dataframe, variable_to_predict,
                   max_number_models, seed=1):
    """
    Train H2O AutoML classifiers on a dataframe and print the results.

    This function initiates an h2o cluster, converts the dataframe to an
    h2o frame, casts the target column to a factor, splits the rows into
    train/test/validation frames and runs the autoML function to generate
    a list of candidate models. The models are displayed via a leaderboard.

    Arguments:
        dataframe: Pandas dataframe.
        variable_to_predict: String. Name of the column that we're predicting.
        max_number_models: Int. Total number of models to run.
        seed: Int. Seed passed to both the frame split and AutoML so that
            repeated runs use the same partitions. Defaults to 1.
    Outputs:
        Leader board of best performing models in the console, plus
        performance of the best fit model on the test data, including
        confusion matrix
    Returns:
        The H2OAutoML object after training, so callers can reuse
        `aml.leader` or `aml.leaderboard`.
    Raises:
        ValueError: if `dataframe` exposes a `columns` attribute and
            `variable_to_predict` is not one of those columns.
    """
    # Fail fast, before a cluster is started, when the target column is
    # missing. Names are compared as strings so that non-string column
    # labels are not rejected here; inputs without a `columns` attribute
    # skip the check entirely.
    columns = getattr(dataframe, "columns", None)
    if columns is not None and \
            variable_to_predict not in [str(name) for name in columns]:
        raise ValueError(
            "variable_to_predict %r is not a column of the dataframe"
            % (variable_to_predict,)
        )

    h2o.init()

    # Convert the dataframe to an h2o dataframe
    frame = h2o.H2OFrame(dataframe)

    # Convert the variable we're predicting to a factor; otherwise this
    # will run as a regression problem
    frame[variable_to_predict] = frame[variable_to_predict].asfactor()

    # x-variables are the predictor columns, and the y-variable is the
    # column we wish to predict
    y = variable_to_predict
    x = [name for name in frame.columns if name != y]

    # Split the rows three ways: 75% train, 12.5% test, and the remaining
    # 12.5% validation. The split is seeded so it is reproducible.
    train, test, validate = frame.split_frame(ratios=[.75, .125], seed=seed)

    # Run AutoML, capped at max_number_models models.
    # NOTE(review): the H2O AutoML docs state that validation_frame is
    # ignored unless nfolds == 0 -- confirm whether it has any effect here.
    aml = H2OAutoML(max_models=max_number_models, seed=seed)
    aml.train(x=x, y=y, training_frame=train, validation_frame=validate)

    # View the full AutoML leaderboard (every row, not just the default head)
    lb = aml.leaderboard
    print(lb.head(rows=lb.nrows))

    # Get performance of the leading model on the held-out test data
    performance = aml.leader.model_performance(test)
    print(performance)

    return aml
# -----------------------------------------------------------------------------
# Main: run run_h2o_automl() on `df`, predicting the 'Deg_Malig' column with
# at most 10 models. `df` is not defined in this snippet -- presumably it is
# loaded earlier in the script; confirm before running.
run_h2o_automl(
    dataframe=df,
    variable_to_predict='Deg_Malig',
    max_number_models=10,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment