Gradient boosting for regression implementation
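The class below implements gradient boosting for regression with trees as the weak learners: starting from the constant prediction F0(x) = mean(y), each boosting iteration fits a regression tree h_m to the current residuals y_i - F_{m-1}(x_i), which for squared-error loss are the negative gradients, and updates the ensemble as F_m(x) = F_{m-1}(x) + learning_rate * h_m(x). The Tree weak learner is imported from the companion regression/tree.py module, which is not part of this file.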
# The base class with the weak learner
from regression.tree import Tree

# Data wrangling
import pandas as pd

# Python infinity
from math import inf

class RegressionGB:
    """
    Class that implements the regression gradient boosting algorithm
    """
    def __init__(
        self,
        d: pd.DataFrame,
        y_var: str,
        x_vars: list,
        max_depth: int = 4,
        min_sample_leaf: int = 2,
        learning_rate: float = 0.4,
    ):
        # Saving the names of the y variable and the X features
        self.y_var = y_var
        self.features = x_vars

        # Saving the modelling data to memory
        self.d = d[[y_var] + x_vars].copy()

        # Saving the y values as a list
        self.Y = d[y_var].values.tolist()

        # Saving the number of observations in the data
        self.n = len(d)

        # Saving the tree hyperparameters
        self.max_depth = max_depth
        self.min_sample_leaf = min_sample_leaf

        # Saving the learning rate
        self.learning_rate = learning_rate

        # List that will hold the fitted weak learners
        self.weak_learners = []

        # Setting the current boosting iteration m to 0
        self.cur_m = 0

        # Saving the mean of y, which serves as the initial prediction F0
        self.y_mean = self.get_mean(self.Y)

        # Using the mean of y as the most recent prediction for every observation
        self._predictions = [self.y_mean] * self.n

    @staticmethod
    def get_mean(x: list) -> float:
        """
        Calculates the mean of a list of numeric elements
        """
        # Initiating the sum counter
        _sum = 0

        # Inferring the length of the list; an empty list yields an infinite mean
        _n = len(x)
        if _n == 0:
            return inf

        # Iterating through the values and accumulating the sum
        for _x in x:
            _sum += _x

        # Returning the mean
        return _sum / _n

    def fit(
        self,
        m: int = 10
    ):
        """
        Applies the iterative boosting algorithm, adding m weak learners to the ensemble
        """
        # Converting X to a list of feature dictionaries, the input format Tree.predict() expects
        _inputs = self.d[self.features].to_dict('records')

        # Placeholder for the gamma values (not used further in this implementation)
        self.gamma = []

        # Iterating over the number of new estimators
        for _ in range(self.cur_m, self.cur_m + m):
            # Calculating the residuals; for squared-error loss these are the negative gradients
            _residuals = [self.Y[i] - self._predictions[i] for i in range(self.n)]

            # Saving the current iteration's residuals to the dataframe so the tree can fit on them
            _r_name = "residuals"
            self.d[_r_name] = _residuals

            # Creating a weak learner
            _weak_learner = Tree(
                d=self.d.copy(),
                y_var=_r_name,
                x_vars=self.features,
                max_depth=self.max_depth,
                min_sample_leaf=self.min_sample_leaf,
            )

            # Growing the tree on the residuals
            _weak_learner.fit()

            # Appending the weak learner to the list
            self.weak_learners.append(_weak_learner)

            # Getting the weak learner's predictions
            _predictions_wl = [_weak_learner.predict(_x) for _x in _inputs]

            # Updating the current predictions
            self._predictions = [
                self._predictions[i] + self.learning_rate * _predictions_wl[i]
                for i in range(self.n)
            ]

        # Incrementing the current iteration count
        self.cur_m += m

    def predict(self, x: dict) -> float:
        """
        Given a dictionary of feature values, predicts the value of the y variable
        """
        # Starting from the mean prediction
        yhat = self.y_mean

        # Adding each weak learner's contribution, scaled by the learning rate
        for _m in range(self.cur_m):
            yhat += self.learning_rate * self.weak_learners[_m].predict(x)

        return yhat
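
A minimal usage sketch, assuming this file is importable and that the companion regression/tree.py provides the Tree class with the constructor signature used above plus fit() and predict(dict) methods; the dataset and feature names below are made up for illustration:

import pandas as pd

# Toy dataset; the values are illustrative only
d = pd.DataFrame({
    "x1": [1, 2, 3, 4, 5, 6, 7, 8],
    "x2": [0, 1, 0, 1, 0, 1, 0, 1],
    "y": [1.1, 1.8, 3.2, 4.1, 4.9, 6.2, 6.8, 8.1],
})

# Initiating the booster with shallow trees and a small learning rate
model = RegressionGB(
    d=d,
    y_var="y",
    x_vars=["x1", "x2"],
    max_depth=2,
    min_sample_leaf=1,
    learning_rate=0.1,
)

# Fitting 20 boosting iterations, then 10 more on top
model.fit(m=20)
model.fit(m=10)

# Predicting for a new observation passed as a feature dictionary
print(model.predict({"x1": 4.5, "x2": 1}))

Because fit() resumes from the stored predictions and increments cur_m, repeated calls warm-start the ensemble with additional weak learners instead of retraining it from scratch.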