Skip to content

Instantly share code, notes, and snippets.

@Eligijus112
Created October 6, 2022 19:48
Show Gist options
  • Save Eligijus112/d1b502c94acfaae9849953df8932d782 to your computer and use it in GitHub Desktop.
Save Eligijus112/d1b502c94acfaae9849953df8932d782 to your computer and use it in GitHub Desktop.
Function to create dummy variables for the NYC dataset
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import numpy as np
# Defining the function for dummy creation
def create_dummy(df: pd.DataFrame, dummy_var_list: list) -> tuple:
    """
    Creates dummy variables for the variables in dummy_var_list

    Every listed column is replaced by its dummy columns, which are named
    "<var>_<level>". The first level of each variable is dropped
    (drop_first=True), so it acts as the reference category. The dataframe
    passed in is left untouched; a new dataframe is returned.

    Returns a tuple of the following
    * df - The dataframe with the dummy variables
    * added_features - The list of dummy column names that were created
    """
    # Placeholder for the names of the created dummy columns
    added_features = []
    for var in dummy_var_list:
        dummy = pd.get_dummies(df[var], prefix=var, drop_first=True)
        # Adding the new features to list
        added_features.extend(dummy.columns)
        # Swapping the original column for its dummies; drop() returns a
        # new frame, so the caller's dataframe is never mutated
        df = pd.concat([df.drop(columns=var), dummy], axis=1)
    # Returning the dataframe and the created column names
    return df, added_features
# Defining a custom one-hot encoding transform function
def custom_transform(
    enc: OneHotEncoder,
    x: np.ndarray,
    prefix: str
) -> pd.DataFrame:
    """
    Applies an already fitted one-hot encoder to a single feature and
    returns the created dummies as a new dataframe

    Arguments
    * enc - The fitted encoder; it is used on one input column
    * x - The values of the feature; reshaped to a single column
    * prefix - The name put in front of every created column

    Returns a dataframe of uint8 columns named "<prefix>_<category>" with
    a default integer index
    """
    # Transforming the data
    encoded = enc.transform(x.reshape(-1, 1))
    # Depending on how the encoder was configured the result is either a
    # sparse matrix or already dense; only the former has .toarray()
    if hasattr(encoded, "toarray"):
        values = encoded.toarray()
    else:
        values = np.asarray(encoded)

    # The encoder names its outputs "<input feature>_<category>", where
    # the input feature is feature_names_in_[0] when it was fitted on
    # named data and "x0" otherwise
    fitted_names = getattr(enc, "feature_names_in_", None)
    input_name = "x0" if fitted_names is None else str(fitted_names[0])
    stem = f"{input_name}_"

    columns = []
    for name in enc.get_feature_names_out().tolist():
        if name.startswith(stem):
            # Removing only the stem keeps categories that themselves
            # contain underscores intact (e.g. "new_york")
            category = name[len(stem):]
        else:
            # Unexpected naming scheme: keep the last "_" separated part
            category = name.split('_')[-1]
        # Adding the name of the feature as a prefix
        columns.append(f"{prefix}_{category}")

    # Converting to a dataframe with the uint8 datatype
    return pd.DataFrame(values, columns=columns).astype('uint8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment