Created
October 6, 2022 19:48
-
-
Save Eligijus112/d1b502c94acfaae9849953df8932d782 to your computer and use it in GitHub Desktop.
Function to create dummy variables for the NYC dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.preprocessing import OneHotEncoder | |
import numpy as np | |
# Defining the function for dummy creation
def create_dummy(df: pd.DataFrame, dummy_var_list: list) -> tuple:
    """
    Replaces the variables in dummy_var_list with dummy variables.

    Each listed column is encoded with ``pd.get_dummies`` using the column
    name as the prefix and ``drop_first=True``, so the first level of every
    variable gets no column of its own. The original column is removed from
    the result. The dataframe passed in is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe holding the columns to encode.
    dummy_var_list : list
        Names of the columns in df to turn into dummy variables.

    Returns a tuple of the following
    * df - The dataframe with the dummy variables in place of the originals
    * added_features - The list of names of the dummy columns created
    """
    # Placeholder for the names of the dummy variables
    added_features = []

    for var in dummy_var_list:
        dummy = pd.get_dummies(df[var], prefix=var, drop_first=True)

        # Adding the new features to list
        added_features.extend(dummy.columns)

        # Swapping the original column for its dummies; drop() returns a new
        # frame, so the caller's dataframe is never touched
        df = pd.concat([df.drop(columns=var), dummy], axis=1)

    # Returning the dataframe and the names of the created columns
    return df, added_features
# Defining a custom one-hot encoding function
def custom_transform(
    enc: "OneHotEncoder",
    x: np.ndarray,
    prefix: str
) -> pd.DataFrame:
    """
    Applies an already fitted one-hot encoder to a single feature and
    returns the created dummies as a dataframe.

    Parameters
    ----------
    enc : OneHotEncoder
        Encoder that exposes ``transform`` and ``get_feature_names_out``;
        presumably fitted on one column only - verify against the caller.
    x : np.ndarray
        The values of the feature; any array-like is accepted and reshaped
        to a single column before being passed to the encoder.
    prefix : str
        Text placed in front of every category to form the column names.

    Returns
    -------
    pd.DataFrame
        One uint8 column per encoder output, named ``<prefix>_<category>``.
    """
    # Transforming the data; asarray lets callers pass e.g. a pandas Series
    out = enc.transform(np.asarray(x).reshape(-1, 1))

    # The encoder returns a sparse matrix by default but a plain ndarray when
    # it was built with dense output, so only densify when needed
    if hasattr(out, "toarray"):
        out = out.toarray()

    # Getting the names of the transformed values
    out_values = enc.get_feature_names_out().tolist()

    # The encoder names its outputs "<input feature>_<category>", where the
    # input feature is the fitted column name or "x0" when it was fitted on
    # an unnamed array
    fitted_names = getattr(enc, "feature_names_in_", None)
    source = str(fitted_names[0]) if fitted_names is not None else "x0"
    stem = f"{source}_"

    # Adding the names of the feature as a prefix. Only the leading stem is
    # removed so that categories containing underscores stay intact; names
    # that do not follow the expected pattern fall back to the last token
    columns = []
    for value in out_values:
        if value.startswith(stem):
            category = value[len(stem):]
        else:
            category = value.split('_')[-1]
        columns.append(f"{prefix}_{category}")

    # Converting to a dataframe and changing the datatype to uint8
    return pd.DataFrame(out, columns=columns).astype('uint8')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment