Skip to content

Instantly share code, notes, and snippets.

@datavudeja
Forked from simonespa/one_hot_encoding.py
Created October 6, 2025 13:14
Show Gist options
  • Save datavudeja/f032b2acc3b4ead9a71efcd3637d4961 to your computer and use it in GitHub Desktop.
Save datavudeja/f032b2acc3b4ead9a71efcd3637d4961 to your computer and use it in GitHub Desktop.
Pandas and HashingEncoder
# Split every "|"-delimited value into numbered columns (missing parts become ''),
# then one-hot encode all of the resulting columns as uint8 flags.
temp_df = pd.get_dummies(
    df[column].str.split("|", expand=True).fillna('')
).astype('uint8')
# Strip the "N_" prefix from each column name so that duplicates become visible,
# then merge the duplicate columns into one.
temp_df = merge_columns(remove_prefixes(temp_df))
# Within a row, a group of duplicate columns should be all zeros or contain a single 1.
# A merged value above 1 means several duplicates were set at once, i.e. the
# split/expansion and one-hot encoding went wrong for that row. Such rows are
# repaired by clamping the merged column back to 1.
error_detected = (df[column_control] > 1).sum().copy()
if error_detected > 0:
    print(f"Detected {error_detected} rows with duplicates for {column} column. Fixing it now.")
    # Anything above zero becomes 1; everything else becomes 0.
    df[column_control] = np.where(df[column_control].gt(0), 1, 0)
from category_encoders import HashingEncoder
# One-hot encode every "|"-delimited column and append the results to temp_df.
# The per-column frames are collected first and concatenated in one call:
# calling pd.concat inside the loop builds a brand-new frame (copying everything
# accumulated so far) on every iteration.
encoded_frames = []
# A plain loop (rather than a comprehension) keeps `column` bound to the last
# processed name after the loop, exactly as before.
for column in columns:
    encoded_frames.append(
        pd.get_dummies(
            df[column].str.split("|", expand=True).fillna('')
        ).astype('uint8')
    )
# With no columns to encode, temp_df is left untouched.
if encoded_frames:
    temp_df = pd.concat([temp_df, *encoded_frames], axis='columns')
from category_encoders import HashingEncoder
# Hash-encode `column`, sizing the encoder output from the value stored in `unique`.
# presumably unique[column] is the number of distinct values in `column` — confirm
# against the code that builds `unique`.
N = unique[column]
encoder = HashingEncoder(
    cols=[column],
    # ceil(log2(N)) is the number of bits required to encode N elements.
    # The clamps keep the result at >= 1: log2(1) == 0 would request zero output
    # components, and log2(0) raises ValueError.
    # NOTE(review): confirm that HashingEncoder treats n_components as a bit width
    # rather than as a number of hash buckets.
    n_components=max(1, math.ceil(math.log2(max(N, 2)))),
    hash_method='sha256'  # https://docs.python.org/3/library/hashlib.html#constructors
)
# Replaces df with the encoder's transformed output.
df = encoder.fit_transform(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment