kperry2215 · October 23, 2019 01:36
diff --git a/h2o_automl.py b/h2o_automl.py
 def run_h2o_automl(dataframe, variable_to_predict,
                    max_number_models):
     """
     Train H2O AutoML classifiers on a pandas dataframe and print the results.

     This function initiates an h2o cluster, converts the dataframe to an
     h2o dataframe, casts the target column to a factor, splits the rows
     into train/test/validation frames, and runs AutoML. The resulting
     models are displayed via a leaderboard, followed by the leader
     model's performance on the held-out test frame.

     Arguments:
         dataframe: Pandas dataframe.
         variable_to_predict: String. Name of the column that we're predicting.
         max_number_models: Int. Passed to H2OAutoML as max_models.
     Outputs:
         Nothing is returned. The leaderboard of all trained models and the
         performance report of the best model on the test data are printed
         to the console.
     """
     h2o.init()
     #Convert the dataframe to an h2o dataframe
     frame = h2o.H2OFrame(dataframe)
     #Convert the variable we're predicting to a factor; otherwise this
     #will run as a regression problem
     frame[variable_to_predict] = frame[variable_to_predict].asfactor()
     #Declare the x- and y- variables.
     #x-variables are predictor variables (every column except the target),
     #and the y-variable is what we wish to predict
     y = variable_to_predict
     x = list(frame.columns)
     x.remove(y)
     #Three-way split: roughly 75% train, 12.5% test and the remaining
     #12.5% validation. The split is seeded (same seed as AutoML below) so
     #that repeated runs score the models on the same rows.
     train, test, validate = frame.split_frame(ratios=[.75, .125], seed=1)
     #Run AutoML. max_models caps how many models are built; no runtime
     #limit is passed here, so the installed H2O version's default applies.
     aml = H2OAutoML(max_models=max_number_models, seed=1)
     aml.train(x=x, y=y, training_frame=train, validation_frame=validate)
     #View the full AutoML leaderboard (all rows, not just the default head)
     lb = aml.leaderboard
     print(lb.head(rows=lb.nrows))
     #Get performance of the best model on the test data
     performance = aml.leader.model_performance(test)
     print(performance)

 #################################################################################################
 ### Script entry: run AutoML on `df` (defined outside this excerpt),
 ### predicting the 'Deg_Malig' column with at most 10 models.
 run_h2o_automl(
     dataframe=df,
     variable_to_predict='Deg_Malig',
     max_number_models=10,
 )
	def run_h2o_automl(dataframe, variable_to_predict,
	                   max_number_models):
	    """
	    Train H2O AutoML classifiers on a pandas dataframe and print the results.

	    This function initiates an h2o cluster, converts the dataframe to an
	    h2o dataframe, casts the target column to a factor, splits the rows
	    into train/test/validation frames, and runs AutoML. The resulting
	    models are displayed via a leaderboard, followed by the leader
	    model's performance on the held-out test frame.

	    Arguments:
	        dataframe: Pandas dataframe.
	        variable_to_predict: String. Name of the column that we're predicting.
	        max_number_models: Int. Passed to H2OAutoML as max_models.
	    Outputs:
	        Nothing is returned. The leaderboard of all trained models and the
	        performance report of the best model on the test data are printed
	        to the console.
	    """
	    h2o.init()
	    #Convert the dataframe to an h2o dataframe
	    frame = h2o.H2OFrame(dataframe)
	    #Convert the variable we're predicting to a factor; otherwise this
	    #will run as a regression problem
	    frame[variable_to_predict] = frame[variable_to_predict].asfactor()
	    #Declare the x- and y- variables.
	    #x-variables are predictor variables (every column except the target),
	    #and the y-variable is what we wish to predict
	    y = variable_to_predict
	    x = list(frame.columns)
	    x.remove(y)
	    #Three-way split: roughly 75% train, 12.5% test and the remaining
	    #12.5% validation. The split is seeded (same seed as AutoML below) so
	    #that repeated runs score the models on the same rows.
	    train, test, validate = frame.split_frame(ratios=[.75, .125], seed=1)
	    #Run AutoML. max_models caps how many models are built; no runtime
	    #limit is passed here, so the installed H2O version's default applies.
	    aml = H2OAutoML(max_models=max_number_models, seed=1)
	    aml.train(x=x, y=y, training_frame=train, validation_frame=validate)
	    #View the full AutoML leaderboard (all rows, not just the default head)
	    lb = aml.leaderboard
	    print(lb.head(rows=lb.nrows))
	    #Get performance of the best model on the test data
	    performance = aml.leader.model_performance(test)
	    print(performance)

	#################################################################################################
	### Script entry: run AutoML on `df` (defined outside this excerpt),
	### predicting the 'Deg_Malig' column with at most 10 models.
	run_h2o_automl(
	    dataframe=df,
	    variable_to_predict='Deg_Malig',
	    max_number_models=10,
	)