Eligijus112 · October 8, 2022 19:08
diff --git a/DataGeneratorNYC.py b/DataGeneratorNYC.py
 import pandas as pd 
 import numpy as np 
 import keras

 # Defining the class that creates the training batches from a chunked csv reader
 class DataGenerator(keras.utils.Sequence):
    """
    Batch iterator that feeds a Keras model from a chunked pandas csv reader

    Every call to ``__getitem__`` pulls the next chunk from the reader,
    creates the features and returns the ``(x, y)`` pair for that batch.

    Notes
    -----
    The requested batch index is ignored: the chunks are always consumed in
    file order through ``get_chunk()``. Shuffled or parallel batch requests
    would therefore not receive the batch they asked for.
    NOTE(review): the reader is presumably exhausted after ``n_batches``
    calls, so a fresh generator is likely needed for every epoch - confirm
    against the training loop.
    """

    def __init__(
        self,
        csv_generator: pd.io.parsers.readers.TextFileReader,
        n_batches: int,
        dummy_features: list,
        cat_encoders: dict,
        y_var: str,
        y_min: float,
        y_max: float,
        final_features: list,
    ) -> None:
        """
        Method to create an iterator in memory

        Arguments
        ---------
        csv_generator: pd.io.parsers.readers.TextFileReader
            The csv generator from pandas
        n_batches: int
            The number of batches that are available in the csv_generator
        dummy_features: list
            The list of categorical features that need to be one hot encoded
        cat_encoders: dict
            The dictionary of the one hot encoders for the categorical features used for transformation
        y_var: str
            The name of the target variable
        y_min: float
            The minimum value of the target variable (used in min max scaling)
        y_max: float
            The maximum value of the target variable (used in min max scaling)
        final_features: list
            The list of the final features that are used for training

        Raises
        ------
        ValueError
            If y_min equals y_max, because the min max scaling would divide by zero
        """
        # Letting the Keras base class initialise its own state
        super().__init__()

        # A zero width target range would make the min max scaling divide by zero
        if y_max == y_min:
            raise ValueError("y_min and y_max must differ for min max scaling")

        self.csv_generator = csv_generator
        self.n_batches = n_batches
        self.dummy_features = dummy_features
        self.cat_encoders = cat_encoders
        self.y_var = y_var
        self.y_min = y_min
        self.y_max = y_max
        self.final_features = final_features

    def __len__(self) -> int:
        """
        The total length of the iterator (the number of batches)
        """
        return self.n_batches

    def __getitem__(self, idx):
        """
        The batch generator

        Arguments
        ---------
        idx: int
            The batch index requested by the caller; it is not used because
            the chunks are read sequentially from the csv_generator

        Returns
        -------
        x
            The values of the final features for the batch
        y
            The min max scaled values of the target variable for the batch
        """
        # Getting the next batch and resetting its index; presumably this lets
        # the encoded frames line up row by row in the concatenation below
        chunk = self.csv_generator.get_chunk().reset_index(drop=True)

        # Creating the date variables
        chunk = create_date_vars(chunk)

        # Creating the distance variable
        chunk = distance_calculation(chunk)

        # Creating the dummy variables for every categorical feature first and
        # concatenating once, which avoids copying the chunk on every iteration
        dummies = [
            custom_transform(
                self.cat_encoders[cat_feature],
                chunk[cat_feature].values,
                cat_feature,
            )
            for cat_feature in self.dummy_features
        ]
        if dummies:
            chunk = pd.concat([chunk, *dummies], axis=1)

        # Getting the target var and min max transforming it
        y = chunk[self.y_var].values
        y = (y - self.y_min) / (self.y_max - self.y_min)

        # If any of the final features are missing we fill them with 0
        missing_cols = [c for c in self.final_features if c not in chunk.columns]
        for c in missing_cols:
            chunk[c] = 0

        # Extracting the final features in the order given by final_features
        x = chunk[self.final_features].values

        # Returning x and y
        return x, y
	import pandas as pd
	import numpy as np
	import keras

	# Defining the class that creates the training batches from a chunked csv reader
	class DataGenerator(keras.utils.Sequence):
	    """
	    Batch iterator that feeds a Keras model from a chunked pandas csv reader

	    Every call to ``__getitem__`` pulls the next chunk from the reader,
	    creates the features and returns the ``(x, y)`` pair for that batch.

	    Notes
	    -----
	    The requested batch index is ignored: the chunks are always consumed in
	    file order through ``get_chunk()``. Shuffled or parallel batch requests
	    would therefore not receive the batch they asked for.
	    NOTE(review): the reader is presumably exhausted after ``n_batches``
	    calls, so a fresh generator is likely needed for every epoch - confirm
	    against the training loop.
	    """

	    def __init__(
	        self,
	        csv_generator: pd.io.parsers.readers.TextFileReader,
	        n_batches: int,
	        dummy_features: list,
	        cat_encoders: dict,
	        y_var: str,
	        y_min: float,
	        y_max: float,
	        final_features: list,
	    ) -> None:
	        """
	        Method to create an iterator in memory

	        Arguments
	        ---------
	        csv_generator: pd.io.parsers.readers.TextFileReader
	            The csv generator from pandas
	        n_batches: int
	            The number of batches that are available in the csv_generator
	        dummy_features: list
	            The list of categorical features that need to be one hot encoded
	        cat_encoders: dict
	            The dictionary of the one hot encoders for the categorical features used for transformation
	        y_var: str
	            The name of the target variable
	        y_min: float
	            The minimum value of the target variable (used in min max scaling)
	        y_max: float
	            The maximum value of the target variable (used in min max scaling)
	        final_features: list
	            The list of the final features that are used for training

	        Raises
	        ------
	        ValueError
	            If y_min equals y_max, because the min max scaling would divide by zero
	        """
	        # Letting the Keras base class initialise its own state
	        super().__init__()

	        # A zero width target range would make the min max scaling divide by zero
	        if y_max == y_min:
	            raise ValueError("y_min and y_max must differ for min max scaling")

	        self.csv_generator = csv_generator
	        self.n_batches = n_batches
	        self.dummy_features = dummy_features
	        self.cat_encoders = cat_encoders
	        self.y_var = y_var
	        self.y_min = y_min
	        self.y_max = y_max
	        self.final_features = final_features

	    def __len__(self) -> int:
	        """
	        The total length of the iterator (the number of batches)
	        """
	        return self.n_batches

	    def __getitem__(self, idx):
	        """
	        The batch generator

	        Arguments
	        ---------
	        idx: int
	            The batch index requested by the caller; it is not used because
	            the chunks are read sequentially from the csv_generator

	        Returns
	        -------
	        x
	            The values of the final features for the batch
	        y
	            The min max scaled values of the target variable for the batch
	        """
	        # Getting the next batch and resetting its index; presumably this lets
	        # the encoded frames line up row by row in the concatenation below
	        chunk = self.csv_generator.get_chunk().reset_index(drop=True)

	        # Creating the date variables
	        chunk = create_date_vars(chunk)

	        # Creating the distance variable
	        chunk = distance_calculation(chunk)

	        # Creating the dummy variables for every categorical feature first and
	        # concatenating once, which avoids copying the chunk on every iteration
	        dummies = [
	            custom_transform(
	                self.cat_encoders[cat_feature],
	                chunk[cat_feature].values,
	                cat_feature,
	            )
	            for cat_feature in self.dummy_features
	        ]
	        if dummies:
	            chunk = pd.concat([chunk, *dummies], axis=1)

	        # Getting the target var and min max transforming it
	        y = chunk[self.y_var].values
	        y = (y - self.y_min) / (self.y_max - self.y_min)

	        # If any of the final features are missing we fill them with 0
	        missing_cols = [c for c in self.final_features if c not in chunk.columns]
	        for c in missing_cols:
	            chunk[c] = 0

	        # Extracting the final features in the order given by final_features
	        x = chunk[self.final_features].values

	        # Returning x and y
	        return x, y