daveebbelaar · June 29, 2024 09:17
diff --git a/optuna_xgboost.py b/optuna_xgboost.py
 import optuna
 import xgboost as xgb
 from sklearn.metrics import mean_squared_error # or any other metric
 from sklearn.model_selection import train_test_split

 # Load the dataset
 X, y = ... # load your  own
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

 # Define the objective function for Optuna
 def objective(trial):
    # Define the search space for hyperparameters
    param = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': trial.suggest_float('eta', 0.01, 0.3),
        'num_boost_round': 100000, # Fix the boosting round and use early stopping
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 10.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
        'lambda': trial.suggest_float('lambda', 0.1, 10.0),
        'alpha': trial.suggest_float('alpha', 0.0, 10.0),
    }
    
    # Split the data into further training and validation sets (three sets are preferable)
    train_data, valid_data, train_target, valid_target = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    
    # Convert the data into DMatrix format
    dtrain = xgb.DMatrix(train_data, label=train_target)
    dvalid = xgb.DMatrix(valid_data, label=valid_target)
    
    # Define the pruning callback for early stopping
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-rmse')
    
    # Train the model with early stopping
    model = xgb.train(param, dtrain, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])
    
    # Make predictions on the test set
    dtest = xgb.DMatrix(valid_data)
    y_pred = model.predict(dtest)
    
    # Calculate the root mean squared error
    rmse = mean_squared_error(valid_test, y_pred, squared=False)
    
    return rmse

 # Create an Optuna study and optimize the objective function
 study = optuna.create_study(direction='minimize')
 study.optimize(objective, n_trials=100) # Control the number of trials

 # Print the best hyperparameters and the best RMSE
 best_params = study.best_params
 best_rmse = study.best_value
 print("Best Hyperparameters: ", best_params)
 print("Best RMSE: ", best_rmse)

 #---------------------------------------------------------------------#
 # You can also tune for multiple metrics. See here: https://stackoverflow.com/questions/69071684/how-to-optimize-for-multiple-metrics-in-optuna
	import optuna
	import xgboost as xgb
	from sklearn.metrics import mean_squared_error # or any other metric
	from sklearn.model_selection import train_test_split

	# Load the dataset
	X, y = ... # load your own
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

	# Define the objective function for Optuna
	def objective(trial):
	# Define the search space for hyperparameters
	param = {
	'objective': 'reg:squarederror',
	'eval_metric': 'rmse',
	'eta': trial.suggest_float('eta', 0.01, 0.3),
	'num_boost_round': 100000, # Fix the boosting round and use early stopping
	'max_depth': trial.suggest_int('max_depth', 3, 10),
	'subsample': trial.suggest_float('subsample', 0.5, 1.0),
	'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
	'gamma': trial.suggest_float('gamma', 0.0, 10.0),
	'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10.0),
	'lambda': trial.suggest_float('lambda', 0.1, 10.0),
	'alpha': trial.suggest_float('alpha', 0.0, 10.0),
	}

	# Split the data into further training and validation sets (three sets are preferable)
	train_data, valid_data, train_target, valid_target = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

	# Convert the data into DMatrix format
	dtrain = xgb.DMatrix(train_data, label=train_target)
	dvalid = xgb.DMatrix(valid_data, label=valid_target)

	# Define the pruning callback for early stopping
	pruning_callback = optuna.integration.XGBoostPruningCallback(trial, 'validation-rmse')

	# Train the model with early stopping
	model = xgb.train(param, dtrain, evals=[(dvalid, 'validation')], early_stopping_rounds=100, callbacks=[pruning_callback])

	# Make predictions on the test set
	dtest = xgb.DMatrix(valid_data)
	y_pred = model.predict(dtest)

	# Calculate the root mean squared error
	rmse = mean_squared_error(valid_test, y_pred, squared=False)

	return rmse

	# Create an Optuna study and optimize the objective function
	study = optuna.create_study(direction='minimize')
	study.optimize(objective, n_trials=100) # Control the number of trials

	# Print the best hyperparameters and the best RMSE
	best_params = study.best_params
	best_rmse = study.best_value
	print("Best Hyperparameters: ", best_params)
	print("Best RMSE: ", best_rmse)

	#---------------------------------------------------------------------#
	# You can also tune for multiple metrics. See here: https://stackoverflow.com/questions/69071684/how-to-optimize-for-multiple-metrics-in-optuna