simonespa · October 6, 2025 13:15
diff --git a/one_hot_encoding.py b/one_hot_encoding.py
 # Split each pipe-delimited value into one positional column per token;
 # cells past a row's last token are padded with ''.
 temp_df = df[column].str.split("|", expand=True).fillna('')

 # One-hot encode every positional column. get_dummies names the results
 # "<position>_<token>", so the same token can appear under several prefixes.
 # NOTE(review): the '' padding added above is encoded as a category too --
 # confirm that remove_prefixes/merge_columns discard or tolerate it.
 temp_df = pd.get_dummies(temp_df).astype('uint8')

 # Remove the "N_" prefix from each column name to expose duplicates
 temp_df = remove_prefixes(temp_df)

 # Merge the duplicate columns
 temp_df = merge_columns(temp_df)

 # For each row, the duplicate columns must be either all zeros or have 1 set in only one of them.
 # If more than one column has 1, the sum will be greater than 1, indicating an error in the
 # split/expansion and hot-encoding process. If this happens, it is fixed by setting the resulting
 # column to 1.
 # The count below is a freshly computed scalar (assuming column_control is a
 # single column label, as the `if` requires), so no defensive .copy() is needed.
 error_detected = df[column_control].gt(1).sum()
 if error_detected > 0:
    print(f"Detected {error_detected} rows with duplicates for {column} column. Fixing it now.")
    # Clamp: everything greater than zero becomes 1, everything else stays 0
    df[column_control] = np.where(df[column_control] > 0, 1, 0)
diff --git a/snippet.py b/snippet.py
 from category_encoders import HashingEncoder

 # Gather the one-hot frames first and concatenate once at the end: calling
 # pd.concat inside the loop re-copies the accumulated frame on every iteration.
 frames = [temp_df]
 for column in columns:
  # Split on "|" into positional columns ('' pads short rows), then one-hot encode
  frames.append(
    pd.get_dummies(
      df[column].str.split("|", expand=True).fillna('')
    ).astype('uint8')
  )
 temp_df = pd.concat(frames, axis='columns')
diff --git a/snippet2.py b/snippet2.py
 from category_encoders import HashingEncoder

 # presumably the number of distinct values in `column` -- verify where `unique` is built
 N = unique[column]
 # Number of bits required to tell N elements apart. Clamp to at least 1:
 # log2(1) == 0 would request zero output columns, and log2(0) raises a
 # math domain error.
 n_bits = math.ceil(math.log2(N)) if N > 1 else 1
 # NOTE(review): HashingEncoder appears to treat n_components as the number of
 # hash buckets (output columns), not binary digits, so N values would share
 # only n_bits buckets -- confirm this collision rate is intended.
 encoder = HashingEncoder(
  cols=[column],
  n_components=n_bits,
  hash_method='sha256' # https://docs.python.org/3/library/hashlib.html#constructors
 )
 # Fit on df and replace it with the encoded frame
 df = encoder.fit_transform(df)
diff --git a/util.py b/util.py
	# Split each pipe-delimited value into one positional column per token;
	# cells past a row's last token are padded with ''.
	# The pattern is a raw string: "\|" in a normal literal is an invalid escape
	# sequence (a warning today, an error in future Python versions).
	temp_df = df[column].str.split(r"\|", expand=True).fillna('')

	# One-hot encode every positional column. get_dummies names the results
	# "<position>_<token>", so the same token can appear under several prefixes.
	# NOTE(review): the '' padding added above is encoded as a category too --
	# confirm that remove_prefixes/merge_columns discard or tolerate it.
	temp_df = pd.get_dummies(temp_df).astype('uint8')

	# Remove the "N_" prefix from each column name to expose duplicates
	temp_df = remove_prefixes(temp_df)

	# Merge the duplicate columns
	temp_df = merge_columns(temp_df)

	# For each row, the duplicate columns must be either all zeros or have 1 set in only one of them.
	# If more than one column has 1, the sum will be greater than 1, indicating an error in the
	# split/expansion and hot-encoding process. If this happens, it is fixed by setting the resulting
	# column to 1.
	# The count below is a freshly computed scalar (assuming column_control is a
	# single column label, as the `if` requires), so no defensive .copy() is needed.
	error_detected = df[column_control].gt(1).sum()
	if error_detected > 0:
		print(f"Detected {error_detected} rows with duplicates for {column} column. Fixing it now.")
		# Clamp: everything greater than zero becomes 1, everything else stays 0
		df[column_control] = np.where(df[column_control] > 0, 1, 0)
	from category_encoders import HashingEncoder

	# Gather the one-hot frames first and concatenate once at the end: calling
	# pd.concat inside the loop re-copies the accumulated frame on every iteration.
	frames = [temp_df]
	for column in columns:
		# Split on a literal "|" (raw string avoids the invalid "\|" escape),
		# pad short rows with '', then one-hot encode the positional columns.
		frames.append(
			pd.get_dummies(
				df[column].str.split(r"\|", expand=True).fillna('')
			).astype('uint8')
		)
	temp_df = pd.concat(frames, axis='columns')
	from category_encoders import HashingEncoder

	# presumably the number of distinct values in `column` -- verify where `unique` is built
	N = unique[column]
	# Number of bits required to tell N elements apart. Clamp to at least 1:
	# log2(1) == 0 would request zero output columns, and log2(0) raises a
	# math domain error.
	n_bits = math.ceil(math.log2(N)) if N > 1 else 1
	# NOTE(review): HashingEncoder appears to treat n_components as the number of
	# hash buckets (output columns), not binary digits, so N values would share
	# only n_bits buckets -- confirm this collision rate is intended.
	encoder = HashingEncoder(
		cols=[column],
		n_components=n_bits,
		hash_method='sha256' # https://docs.python.org/3/library/hashlib.html#constructors
	)
	# Fit on df and replace it with the encoded frame
	df = encoder.fit_transform(df)