Last active
October 8, 2022 19:08
-
-
Save Eligijus112/41bf9d74b94f50978f9ab3bb742156b2 to your computer and use it in GitHub Desktop.
Data generator for NYC data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| import numpy as np | |
| import keras | |
| # Defining the class for the batches creation | |
class DataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that builds (x, y) batches from a chunked CSV reader.

    Each call to ``__getitem__`` pulls the next chunk from ``csv_generator``,
    runs the feature-engineering helpers on it, one hot encodes the
    categorical features, min max scales the target and returns the final
    feature matrix together with the scaled target.

    NOTE: batches are read sequentially from ``csv_generator``; the index
    passed to ``__getitem__`` is not used to pick a chunk. Shuffling,
    multiple workers or a second pass over an already consumed reader will
    therefore not behave like random access -- recreate the reader (and the
    generator) when the data has to be read again.
    """

    def __init__(
        self,
        csv_generator: pd.io.parsers.readers.TextFileReader,
        n_batches: int,
        dummy_features: list,
        cat_encoders: dict,
        y_var: str,
        y_min: float,
        y_max: float,
        final_features: list
    ):
        """
        Method to create an iterator in memory

        Arguments
        ---------
        csv_generator: pd.io.parsers.readers.TextFileReader
            The csv generator from pandas
        n_batches: int
            The number of batches that are available in the csv_generator
        dummy_features: list
            The list of categorical features that need to be one hot encoded
        cat_encoders: dict
            The dictionary of the one hot encoders for the categorical
            features used for transformation (keyed by feature name)
        y_var: str
            The name of the target variable
        y_min: float
            The minimum value of the target variable (used in min max scaling)
        y_max: float
            The maximum value of the target variable (used in min max scaling)
        final_features: list
            The list of the final features that are used for training
        """
        # Let the Keras base class set up its own state; newer Keras versions
        # expect this call, on older ones it is a harmless no-op
        super().__init__()

        self.csv_generator = csv_generator
        self.n_batches = n_batches
        self.dummy_features = dummy_features
        self.cat_encoders = cat_encoders
        self.y_var = y_var
        # NOTE(review): y_min == y_max makes the scaling in __getitem__
        # divide by zero -- confirm callers never pass a constant target
        self.y_min = y_min
        self.y_max = y_max
        self.final_features = final_features

    def __len__(self):
        """
        The total length of the iterator (number of batches per epoch)
        """
        return self.n_batches

    def __getitem__(self, idx):
        """
        The batch generator

        Arguments
        ---------
        idx: int
            The batch index requested by Keras. It is not used: the next
            chunk of the csv_generator is returned regardless of its value.

        Returns
        -------
        x
            The values of the final_features columns of the processed chunk
        y
            The min max scaled values of the target variable
        """
        # Getting the next batch; the reader is consumed sequentially
        chunk = self.csv_generator.get_chunk()

        # Resetting the index so it starts from 0 in every chunk
        # (the encoded columns are concatenated by index further below)
        chunk = chunk.reset_index(drop=True)

        # Creating the date variables
        chunk = create_date_vars(chunk)

        # Creating the distance variable
        chunk = distance_calculation(chunk)

        # Creating the dummy variables
        for cat_feature in self.dummy_features:
            # Transforming the raw values with the encoder of this feature;
            # presumably the helper returns a frame of one hot columns
            # aligned with the chunk's index -- defined outside this class
            encoded = custom_transform(
                self.cat_encoders[cat_feature],
                chunk[cat_feature].values,
                cat_feature,
            )

            # Appending the encoded columns to the chunk
            chunk = pd.concat([chunk, encoded], axis=1)

        # Getting the target var and min max transforming it
        y = chunk[self.y_var].values
        y = (y - self.y_min) / (self.y_max - self.y_min)

        # If any of the final features are missing we fill them with 0
        # (e.g. a category level that does not occur in this chunk)
        present_cols = set(chunk.columns)
        for col in self.final_features:
            if col not in present_cols:
                chunk[col] = 0

        # Extracting the final features in the order given by final_features
        x = chunk[self.final_features].values

        # Returning x and y
        return x, y
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment