Last active
October 8, 2022 16:52
-
-
Save Eligijus112/b68d858735d6cab2a36da6a828e366ff to your computer and use it in GitHub Desktop.
Trains a deep learning model using iterators in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data wrangling | |
import pandas as pd | |
# Deep learning | |
import tensorflow as tf | |
import keras | |
# Import feature engineering functions | |
from utils import create_date_vars, distance_calculation, custom_transform | |
# Iteration tracking | |
from tqdm import tqdm | |
# Array math | |
import numpy as np | |
# Object RAM tracking | |
import sys | |
from memory_profiler import profile | |
# One hot encoder | |
from sklearn.preprocessing import OneHotEncoder | |
# Argument parsing | |
import argparse | |
# Model creation | |
from model import create_model | |
@profile
def train_generator(
    path_to_csv,
    n_batches,
    final_features,
    dummy_features,
    cat_encoders,
    y_var,
    y_min,
    y_max,
    epochs: int = 10,
    batch_size: int = 128
):
    """Train a model by streaming a CSV file chunk by chunk.

    A fresh chunked reader over ``path_to_csv`` is opened for every epoch,
    wrapped in a ``DataGenerator`` and passed to ``model.fit`` for a single
    Keras epoch, so the whole file is never read into one dataframe here.

    Args:
        path_to_csv: Path of the CSV file to stream.
        n_batches: Forwarded to ``DataGenerator``; presumably the number of
            batches the generator serves per epoch -- confirm against the
            ``DataGenerator`` definition, which is not visible in this file.
        final_features: Names of the model input columns; its length is the
            first argument given to ``create_model``.
        dummy_features: Forwarded to ``DataGenerator``.
        cat_encoders: Forwarded to ``DataGenerator``.
        y_var: Forwarded to ``DataGenerator`` as the target column name.
        y_min: Forwarded to ``DataGenerator``; presumably used to scale the
            target -- confirm.
        y_max: Forwarded to ``DataGenerator``; presumably used to scale the
            target -- confirm.
        epochs: Number of passes over the CSV file.
        batch_size: Number of CSV rows per chunk read from disk.

    Returns:
        The fitted model object returned by ``create_model``.
    """
    # Defining a simple feed forward network
    # NOTE(review): 128, 'adam' and 0.001 look like layer width, optimizer
    # and learning rate -- confirm against model.create_model.
    model = create_model(len(final_features), 128, 'adam', 0.001)

    for _ in range(epochs):
        # A chunked reader can only be consumed once, so a new one is opened
        # for each epoch; the with-block closes its file handle afterwards
        # instead of leaking one open handle per epoch.
        with pd.read_csv(
            path_to_csv, chunksize=batch_size, iterator=True
        ) as reader:
            # Creating the generator
            generator = DataGenerator(
                csv_generator=reader,
                n_batches=n_batches,
                dummy_features=dummy_features,
                cat_encoders=cat_encoders,
                y_var=y_var,
                y_min=y_min,
                y_max=y_max,
                final_features=final_features
            )

            # Fitting the model. `batch_size` is deliberately not passed:
            # the generator already yields batches and Keras documents that
            # `batch_size` must not be given for generator inputs.
            model.fit(generator, epochs=1, verbose=1)

    # Returning the model
    return model
if __name__ == '__main__':
    # Script entry point: one streaming pass over the training CSV collects
    # the target min/max, the unique values of the categorical features and
    # the number of batches; the model is then trained with train_generator.

    # Parsing the number of rows to use
    parser = argparse.ArgumentParser()
    parser.add_argument('--rows', type=int, default=None)
    args = parser.parse_args()

    # Defining the hps
    batch_size = 512
    epochs = 10

    # Defining a list of dummy features
    dummy_features = [
        'pickup_dayofweek',
    ]

    # Defining the target name
    target = 'fare_amount'

    # When --rows is given the number of batches is fixed up front;
    # otherwise it is counted while iterating over the chunks.
    n_batches = 0
    predefined_batch_size = args.rows is not None
    if predefined_batch_size:
        # Calculating the amount of chunks to call to cover the rows
        n_batches = int(args.rows / batch_size)

    # Creating the min-max constants for y (named so that the builtins
    # `min` and `max` are not shadowed)
    y_min = np.inf
    y_max = -np.inf

    # Creating a dictionary for the categorical features that will store
    # the set of unique values seen so far
    cat_dict = {}

    # Number of chunks actually read from the file
    chunks_seen = 0

    # Reading the data lazily; the with-block closes the file handle once
    # the scan is finished
    with pd.read_csv(
        'data/train.csv', chunksize=batch_size, iterator=True
    ) as d:
        # Getting the size of the object in memory. This is the shallow size
        # of the reader object, not of the data on disk.
        print(f"The main dataframe takes: {sys.getsizeof(d) / 10**6} MB in memory")

        for i, chunk in tqdm(enumerate(d)):
            chunks_seen += 1

            # Searching for the min and max values of y
            chunk_min = chunk[target].min()
            chunk_max = chunk[target].max()
            if chunk_min < y_min:
                y_min = chunk_min
            if chunk_max > y_max:
                y_max = chunk_max

            # Creating the date variables
            chunk = create_date_vars(chunk, verbose=False)

            # Accumulating the unique values of every categorical feature
            for cat in dummy_features:
                cat_dict.setdefault(cat, set()).update(chunk[cat].unique())

            if predefined_batch_size:
                # Chunk number `n_batches` is still scanned so that the
                # requested rows are fully covered when --rows is not a
                # multiple of the batch size.
                if i == n_batches:
                    break
            else:
                n_batches += 1

    # If the file holds fewer chunks than --rows asked for, fall back to the
    # number of chunks that actually exist; this is the value the counting
    # branch above would have produced.
    if predefined_batch_size:
        n_batches = min(n_batches, chunks_seen)

    print(f"The number of batches is: {n_batches}")

    # Creating a one hot encoder for the categorical features
    cat_encoders = {}
    for cat_feature, unique_values in cat_dict.items():
        # Creating the one hot encoder
        one_hot = OneHotEncoder(categories='auto')

        # Fitting the one hot encoder on a single column of unique values
        one_hot.fit(np.array(list(unique_values)).reshape(-1, 1))

        # Saving the encoder to the dictionary
        cat_encoders[cat_feature] = one_hot

    # Defining the final feature list
    final_features = [
        'distance',
        'passenger_count',
        'pickup_hour_sin',
        'pickup_hour_cos',
        'pickup_dayofyear_sin',
        'pickup_dayofyear_cos',
    ]

    # Adding the final features from the one hot encoders
    for cat_feature, encoder in cat_encoders.items():
        # Getting the transformed values
        out_values = encoder.get_feature_names_out().tolist()

        # Replacing the encoder's generic prefix with the feature name.
        # NOTE(review): only the text after the last underscore is kept, so
        # this assumes category values contain no underscore -- confirm.
        final_features.extend(
            f"{cat_feature}_{value.split('_')[-1]}" for value in out_values
        )

    # Training the model
    # NOTE(review): one batch fewer than counted is passed on; presumably to
    # skip the last (possibly partial) chunk -- confirm against DataGenerator.
    model = train_generator(
        path_to_csv='data/train.csv',
        n_batches=n_batches - 1,
        final_features=final_features,
        dummy_features=dummy_features,
        cat_encoders=cat_encoders,
        y_var=target,
        y_min=y_min,
        y_max=y_max,
        epochs=epochs,
        batch_size=batch_size
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment