@Eligijus112
Last active October 8, 2022 16:52
Trains a deep learning model using iterators in Python
# Data wrangling
import pandas as pd
# Deep learning
import tensorflow as tf
import keras
# Import feature engineering functions
from utils import create_date_vars, distance_calculation, custom_transform
# Iteration tracking
from tqdm import tqdm
# Array math
import numpy as np
# Object RAM tracking
import sys
from memory_profiler import profile
# One hot encoder
from sklearn.preprocessing import OneHotEncoder
# Argument parsing
import argparse
# Model creation
from model import create_model
@profile
def train_generator(
    path_to_csv: str,
    n_batches: int,
    final_features: list,
    dummy_features: list,
    cat_encoders: dict,
    y_var: str,
    y_min: float,
    y_max: float,
    epochs: int = 10,
    batch_size: int = 128
):
    # Defining a simple feed forward network
    model = create_model(len(final_features), 128, 'adam', 0.001)
    for _ in range(epochs):
        # Creating the generator; the CSV iterator is exhausted after one
        # pass, so it is recreated at the start of every epoch
        d = pd.read_csv(path_to_csv, chunksize=batch_size, iterator=True)
        generator = DataGenerator(
            csv_generator=d,
            n_batches=n_batches,
            dummy_features=dummy_features,
            cat_encoders=cat_encoders,
            y_var=y_var,
            y_min=y_min,
            y_max=y_max,
            final_features=final_features
        )
        # Fitting the model for one epoch; batch_size is omitted because the
        # generator already yields whole batches and Keras rejects the
        # argument for generator inputs
        model.fit(generator, epochs=1, verbose=1)
    # Returning the trained model
    return model
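
# ---------------------------------------------------------------------------
# NOTE: the DataGenerator class is referenced above but was not included in
# the original gist. The sketch below is a minimal, assumed implementation:
# a keras.utils.Sequence whose __getitem__ pulls the next CSV chunk. The
# exact feature engineering calls (create_date_vars, distance_calculation)
# and the min-max scaling of y are assumptions inferred from how the class
# is constructed in train_generator.
# ---------------------------------------------------------------------------
class DataGenerator(keras.utils.Sequence):
    def __init__(
        self,
        csv_generator,
        n_batches: int,
        final_features: list,
        dummy_features: list,
        cat_encoders: dict,
        y_var: str,
        y_min: float,
        y_max: float
    ):
        # Storing the CSV iterator and the metadata needed to build batches
        self.csv_generator = csv_generator
        self.n_batches = n_batches
        self.final_features = final_features
        self.dummy_features = dummy_features
        self.cat_encoders = cat_encoders
        self.y_var = y_var
        self.y_min = y_min
        self.y_max = y_max

    def __len__(self):
        # The number of batches Keras will request per epoch
        return self.n_batches

    def __getitem__(self, idx):
        # A CSV iterator can only be read sequentially, so the requested
        # index is ignored and the next chunk is pulled instead
        chunk = next(self.csv_generator)
        # Applying the same feature engineering as in the scanning pass
        chunk = create_date_vars(chunk, verbose=False)
        chunk = distance_calculation(chunk)
        # One hot encoding the categorical features with the fitted encoders
        for cat in self.dummy_features:
            encoder = self.cat_encoders[cat]
            encoded = encoder.transform(chunk[[cat]]).toarray()
            names = [
                f"{cat}_{v.split('_')[-1]}"
                for v in encoder.get_feature_names_out()
            ]
            for j, name in enumerate(names):
                chunk[name] = encoded[:, j]
        # Building the input matrix and the min-max scaled target vector
        X = chunk[self.final_features].values.astype('float32')
        y = (chunk[self.y_var].values - self.y_min) / (self.y_max - self.y_min)
        return X, y.astype('float32')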
if __name__ == '__main__':
    # Parsing the number of rows to use
    parser = argparse.ArgumentParser()
    parser.add_argument('--rows', type=int, default=None)
    args = parser.parse_args()

    # Defining the hyperparameters
    batch_size = 512
    epochs = 10

    # Creating an iterator over the data; the chunks are only read on demand
    d = pd.read_csv('data/train.csv', chunksize=batch_size, iterator=True)

    # Defining a list of dummy features
    dummy_features = [
        'pickup_dayofweek',
    ]

    # Defining the target name
    target = 'fare_amount'

    # Getting the size of the iterator object in memory; because the rows
    # are read lazily, this is tiny compared to loading the full dataframe
    print(f"The CSV iterator takes: {sys.getsizeof(d) / 10**6} MB in memory")

    # Iterating over the chunks to get the final number of batches
    n_batches = 0
    predefined_batch_size = False

    # If the number of rows is given, derive the batch count from it
    if args.rows is not None:
        # Calculating the number of chunks needed to cover the requested rows
        n_batches = args.rows // batch_size
        predefined_batch_size = True

    # Tracking the min and max of y for scaling; named y_min and y_max to
    # avoid shadowing the built-in min() and max() functions
    y_min = np.inf
    y_max = -np.inf

    # Creating a dictionary for the categorical features that will store unique values
    cat_dict = {}
    for i, chunk in tqdm(enumerate(d)):
        # Searching for the min and max values of y
        if chunk[target].min() < y_min:
            y_min = chunk[target].min()
        if chunk[target].max() > y_max:
            y_max = chunk[target].max()
        # Creating the date variables
        chunk = create_date_vars(chunk, verbose=False)
        # Iterating over the categorical features and collecting their unique values
        for cat in dummy_features:
            if cat not in cat_dict:
                cat_dict[cat] = list(set(chunk[cat].unique()))
            else:
                # Extracting the unique values in the current chunk
                current_unique = list(set(chunk[cat].unique()))
                # Keeping only the values that have not been seen so far
                new_unique = list(set(current_unique) - set(cat_dict[cat]))
                # Adding the new unique values to the dictionary
                cat_dict[cat].extend(new_unique)
        if predefined_batch_size:
            if i == n_batches:
                break
        else:
            n_batches += 1
    print(f"The number of batches is: {n_batches}")
    # Creating a one hot encoder for each categorical feature
    cat_encoders = {}
    for cat_feature in cat_dict:
        # Creating the one hot encoder
        one_hot = OneHotEncoder(categories='auto')
        # Fitting the one hot encoder on the collected unique values
        one_hot.fit(np.array(cat_dict[cat_feature]).reshape(-1, 1))
        # Saving the encoder to the dictionary
        cat_encoders[cat_feature] = one_hot
    # Defining the final feature list
    final_features = [
        'distance',
        'passenger_count',
        'pickup_hour_sin',
        'pickup_hour_cos',
        'pickup_dayofyear_sin',
        'pickup_dayofyear_cos',
    ]

    # Adding the one hot encoded feature names
    for cat_feature in cat_encoders:
        # Getting the encoder's output names (of the form 'x0_<category>')
        out_values = cat_encoders[cat_feature].get_feature_names_out().tolist()
        # Replacing the 'x0' placeholder with the original feature name
        new_features = [f"{cat_feature}_{value.split('_')[-1]}" for value in out_values]
        # Adding the new features to the list
        final_features.extend(new_features)
    # Training the model; n_batches is reduced by one so that the CSV
    # iterator is never exhausted mid-epoch
    model = train_generator(
        path_to_csv='data/train.csv',
        n_batches=n_batches - 1,
        final_features=final_features,
        dummy_features=dummy_features,
        cat_encoders=cat_encoders,
        y_var=target,
        y_min=y_min,
        y_max=y_max,
        epochs=epochs,
        batch_size=batch_size
    )
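
For reference, create_model is imported from model.py, which is not part of this gist. The sketch below is one plausible implementation matching the call create_model(len(final_features), 128, 'adam', 0.001); the layer layout and the reading of the second argument as the hidden width are assumptions.

# model.py (sketch, not in the original gist)
import tensorflow as tf

def create_model(n_features: int, n_hidden: int, optimizer: str, learning_rate: float):
    # A simple feed forward regression network
    model = tf.keras.Sequential([
        tf.keras.layers.InputLayer(input_shape=(n_features,)),
        tf.keras.layers.Dense(n_hidden, activation='relu'),
        tf.keras.layers.Dense(n_hidden // 2, activation='relu'),
        tf.keras.layers.Dense(1, activation='linear')
    ])
    # Resolving the optimizer by name and setting the learning rate
    opt = tf.keras.optimizers.get(optimizer)
    opt.learning_rate = learning_rate
    # Mean squared error is a standard choice for a regression target
    model.compile(optimizer=opt, loss='mse')
    return model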
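Likewise, create_date_vars comes from utils.py and is not shown. Given the feature names used above (pickup_hour_sin, pickup_hour_cos, pickup_dayofyear_sin, pickup_dayofyear_cos, pickup_dayofweek), it presumably parses the pickup timestamp and adds cyclical sine/cosine encodings. A sketch under that assumption, with the pickup_datetime column name assumed from the NYC taxi fare dataset:

# utils.py (sketch, not in the original gist)
import numpy as np
import pandas as pd

def create_date_vars(d: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    # Parsing the raw timestamp column
    dt = pd.to_datetime(d['pickup_datetime'])
    if verbose:
        print(f"Created date features for {len(d)} rows")
    # Plain calendar feature used for one hot encoding
    d['pickup_dayofweek'] = dt.dt.dayofweek
    # Cyclical encodings: hour of day (period 24) and day of year (period 365)
    d['pickup_hour_sin'] = np.sin(2 * np.pi * dt.dt.hour / 24)
    d['pickup_hour_cos'] = np.cos(2 * np.pi * dt.dt.hour / 24)
    d['pickup_dayofyear_sin'] = np.sin(2 * np.pi * dt.dt.dayofyear / 365)
    d['pickup_dayofyear_cos'] = np.cos(2 * np.pi * dt.dt.dayofyear / 365)
    return d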