Skip to content

Instantly share code, notes, and snippets.

@Eligijus112
Created March 26, 2022 09:19
Show Gist options
  • Save Eligijus112/c2e7587458926821556374f8d1d09948 to your computer and use it in GitHub Desktop.
Save Eligijus112/c2e7587458926821556374f8d1d09948 to your computer and use it in GitHub Desktop.
Gradient boosting for regression implementation
# The base class with the weak learner
from regression.tree import Tree
# Data wrangling
import pandas as pd
# Python infinity
from math import inf
class RegressionGB:
    """
    Gradient boosting for regression built from `Tree` weak learners.

    The model starts from the mean of the target and repeatedly fits a
    `Tree` to the current residuals (observed y minus the running
    prediction). Each tree's output is scaled by `learning_rate` and added
    to the running prediction.

    Parameters
    ----------
    d : pd.DataFrame
        Training data; must contain the `y_var` column and all `x_vars`.
    y_var : str
        Name of the target column.
    x_vars : list
        Names of the feature columns.
    max_depth : int
        Passed through to every `Tree` weak learner.
    min_sample_leaf : int
        Passed through to every `Tree` weak learner.
    learning_rate : float
        Multiplier applied to each weak learner's prediction.
    """

    def __init__(
        self,
        d: pd.DataFrame,
        y_var: str,
        x_vars: list,
        max_depth: int = 4,
        min_sample_leaf: int = 2,
        learning_rate: float = 0.4,
    ):
        # Saving the names of y variable and X features
        self.y_var = y_var
        self.features = x_vars

        # Keeping a private copy of the relevant columns; fit() adds a
        # residual column to it, so the caller's frame is never modified
        self.d = d[[y_var] + x_vars].copy()

        # Saving the target values as a plain python list
        self.Y = d[y_var].values.tolist()

        # Saving the number of observations in data
        self.n = len(d)

        # Saving the tree hyper parameters
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf

        # Saving the learning rate
        self.learning_rate = learning_rate

        # Weak learner list
        self.weak_learners = []

        # Number of boosting rounds completed so far
        self.cur_m = 0

        # Saving the mean of y; this is the initial (round 0) prediction
        self.y_mean = self.get_mean(self.Y)

        # Running in-sample prediction, one value per training row
        self._predictions = [self.y_mean] * self.n

    @staticmethod
    def get_mean(x: list) -> float:
        """
        Calculates the mean over a list of float elements.

        Returns `inf` when `x` is empty.
        """
        # Inferring the length of the list
        n = len(x)
        if n == 0:
            return inf

        # Returning the mean
        return sum(x) / n

    def fit(
        self,
        m: int = 10
    ) -> None:
        """
        Runs `m` additional boosting rounds.

        Every round fits a new `Tree` on the current residuals and adds its
        scaled predictions to the running in-sample prediction. Calling
        `fit` again continues from the rounds already completed.

        Raises
        ------
        ValueError
            If `m` is negative. (A negative value would otherwise push
            `cur_m` below the number of stored weak learners.)
        """
        if m < 0:
            raise ValueError(f"m must be non-negative, got {m}")

        # Converting the X to suitable inputs (one dict per training row)
        inputs = self.d[self.features].to_dict('records')

        # Saving the gamma list to memory
        self.gamma = []

        # Name of the scratch column holding the residuals. It must not
        # collide with the target or a feature, otherwise assigning the
        # residuals below would overwrite real training data.
        reserved = set(self.features) | {self.y_var}
        residual_col = "residuals"
        while residual_col in reserved:
            residual_col = "_" + residual_col

        # Iterating over the number of estimators
        for _ in range(m):
            # Calculating the residuals
            residuals = [
                y - yhat for y, yhat in zip(self.Y, self._predictions)
            ]

            # Saving the current iteration's residuals to the dataframe
            self.d[residual_col] = residuals

            # Creating a weak learner
            weak_learner = Tree(
                d=self.d.copy(),
                y_var=residual_col,
                x_vars=self.features,
                max_depth=self.max_depth,
                min_sample_leaf=self.min_sample_leaf,
            )

            # Growing the tree on the residuals
            weak_learner.fit()

            # Getting the weak learner predictions
            predictions_wl = [weak_learner.predict(row) for row in inputs]

            # Committing the round only after it fully succeeded, so the
            # learner list, the running predictions and the round counter
            # can never disagree if the tree raises part-way through
            self.weak_learners.append(weak_learner)
            self._predictions = [
                yhat + self.learning_rate * step
                for yhat, step in zip(self._predictions, predictions_wl)
            ]
            self.cur_m += 1

    def predict(self, x: dict) -> float:
        """
        Given the dictionary, predict the value of the y variable.

        The prediction is the training mean of y plus the learning-rate
        scaled output of the first `cur_m` weak learners.
        """
        # Starting from the mean
        yhat = self.y_mean

        for weak_learner in self.weak_learners[:self.cur_m]:
            yhat += self.learning_rate * weak_learner.predict(x)

        return yhat
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment