Skip to content

Instantly share code, notes, and snippets.

@Eligijus112
Last active October 8, 2022 19:08
Show Gist options
  • Save Eligijus112/41bf9d74b94f50978f9ab3bb742156b2 to your computer and use it in GitHub Desktop.
Save Eligijus112/41bf9d74b94f50978f9ab3bb742156b2 to your computer and use it in GitHub Desktop.
Data generator for NYC data
import pandas as pd
import numpy as np
import keras
# Keras Sequence that builds preprocessed (x, y) batches from a chunked CSV reader
class DataGenerator(keras.utils.Sequence):
    """
    Keras Sequence that turns chunks of a pandas csv reader into (x, y) batches

    Every call to ``__getitem__`` pulls the next chunk from ``csv_generator``,
    adds the date and distance variables, one hot encodes the categorical
    features, min max scales the target and returns the final feature matrix
    together with the scaled target.

    Notes
    -----
    * Chunks are read sequentially from ``csv_generator``; the ``idx`` passed
      to ``__getitem__`` is not used to seek. Batches therefore always come
      back in file order, and requesting indices out of order or from several
      workers cannot select particular batches.
    * The reader is consumed while batches are served. Once it is exhausted
      pandas' ``get_chunk`` raises ``StopIteration``, so a new generator built
      on a fresh reader is needed for every pass over the data.
    """

    def __init__(
        self,
        csv_generator: pd.io.parsers.readers.TextFileReader,
        n_batches: int,
        dummy_features: list,
        cat_encoders: dict,
        y_var: str,
        y_min: float,
        y_max: float,
        final_features: list
    ):
        """
        Method to create an iterator in memory

        Arguments
        ---------
        csv_generator: pd.io.parsers.readers.TextFileReader
            The csv generator from pandas
        n_batches: int
            The number of batches that are available in the csv_generator
        dummy_features: list
            The list of categorical features that need to be one hot encoded
        cat_encoders: dict
            The dictionary of the one hot encoders for the categorical features used for transformation
        y_var: str
            The name of the target variable
        y_min: float
            The minimum value of the target variable (used in min max scaling)
        y_max: float
            The maximum value of the target variable (used in min max scaling)
        final_features: list
            The list of the final features that are used for training

        Raises
        ------
        ValueError
            If y_min equals y_max, because the min max scaling would divide by zero
        """
        # Newer Keras versions expect the base constructor to be called and
        # warn otherwise; on older versions this is a harmless no-op
        super().__init__()

        # A zero range would silently turn every target into nan / inf
        if y_max == y_min:
            raise ValueError(
                "y_min and y_max must differ for the min max scaling of the target"
            )

        self.csv_generator = csv_generator
        self.n_batches = n_batches
        self.dummy_features = dummy_features
        self.cat_encoders = cat_encoders
        self.y_var = y_var
        self.y_min = y_min
        self.y_max = y_max
        self.final_features = final_features

    def __len__(self):
        """
        The total length of the iterator (the number of batches)
        """
        return self.n_batches

    def __getitem__(self, idx):
        """
        The batch generator

        Arguments
        ---------
        idx: int
            The batch index requested by keras; it is not used because the
            chunks are read sequentially from the csv generator

        Returns
        -------
        x: np.ndarray
            The values of the final features for the chunk
        y: np.ndarray
            The min max scaled values of the target variable for the chunk
        """
        # Getting the next batch; always the next chunk in file order
        chunk = self.csv_generator.get_chunk()

        # Resetting the index
        # NOTE(review): presumably needed so that the encoded frames, which are
        # concatenated column wise below, align with the chunk rows - confirm
        # against custom_transform
        chunk = chunk.reset_index(drop=True)

        # Creating the date variables
        chunk = create_date_vars(chunk)

        # Creating the distance variable
        chunk = distance_calculation(chunk)

        # Creating the dummy variables; the encoded frames are collected first
        # and concatenated once so the chunk is not copied for every feature
        encoded = [
            custom_transform(
                self.cat_encoders[cat_feature],
                chunk[cat_feature].values,
                cat_feature
            )
            for cat_feature in self.dummy_features
        ]
        if encoded:
            chunk = pd.concat([chunk, *encoded], axis=1)

        # Getting the target var and min max transforming it
        y = (chunk[self.y_var].values - self.y_min) / (self.y_max - self.y_min)

        # If any of the final features are missing we fill them with 0
        for col in set(self.final_features) - set(chunk.columns):
            chunk[col] = 0

        # Extracting the final features in the order given by final_features
        x = chunk[self.final_features].values

        # Returning x and y
        return x, y
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment