Last active
March 31, 2019 19:19
-
-
Save khuangaf/d01c6f4992705c44151dca2091e13a1b to your computer and use it in GitHub Desktop.
This class is intended for faster and simpler categorical/label encoding on large data, for which sklearn's LabelEncoder might be too slow.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Author: Kung-hsiang, Huang (Steeve)
Date: 2019/Mar/15
'''
class CategoricalEncoder:
    '''
    Lightweight label encoder mapping hashable labels to integer codes 0..n-1.

    Intended for large 1-d inputs where sklearn's LabelEncoder may be too slow.
    Only 1-d lists/arrays of hashable elements are supported.

    Attributes set by ``fit``:
        f_dict: forward mapping, label -> integer code
        r_dict: reverse mapping, integer code -> label
        n_elements: number of distinct labels seen by the last ``fit``
    '''

    def __init__(self):
        self.f_dict = {}
        self.r_dict = {}
        # Defined up front so the attribute exists before the first fit().
        self.n_elements = 0

    def fit(self, array):
        '''
        Learn the label <-> code mapping from ``array``.

        Codes are assigned in order of first appearance, so the mapping is
        reproducible across runs. Any mapping from a previous ``fit`` is
        discarded rather than merged.

        :param array: list or np array (1-d, hashable elements)
        :return: None
        '''
        # dict.fromkeys de-duplicates while preserving first-seen order;
        # iterating a set would give an arbitrary (and, for strings,
        # run-dependent) order.
        self.r_dict = dict(enumerate(dict.fromkeys(array)))
        self.f_dict = {label: code for code, label in self.r_dict.items()}
        self.n_elements = len(self.f_dict)

    @staticmethod
    def _finalize(values, to_np):
        '''Return ``values`` as a list, or as an np array when ``to_np`` is true.'''
        if to_np:
            # Imported lazily so numpy is only required when actually requested.
            import numpy as np
            return np.array(values)
        return values

    def reverse_transform(self, transformed_array, to_np=False):
        '''
        Map integer codes back to their original labels.

        :param transformed_array: list or np array of codes
        :param to_np: if True, return an np array instead of a list
        :return: list (or np array) with the same length as the input
        :raises KeyError: if a code was not produced by the last ``fit``
        '''
        labels = [self.r_dict[code] for code in transformed_array]
        return self._finalize(labels, to_np)

    def transform(self, array, to_np=False):
        '''
        Map labels to their integer codes.

        :param array: list or np array of labels
        :param to_np: if True, return an np array instead of a list
        :return: list (or np array) with the same length as the input
        :raises KeyError: if a label was not seen by the last ``fit``
        '''
        codes = [self.f_dict[label] for label in array]
        return self._finalize(codes, to_np)

    def fit_transform(self, array, to_np=False):
        '''
        Fit on ``array`` and return its encoded form in one call.

        :param array: list or np array of labels
        :param to_np: if True, return an np array instead of a list
        :return: list (or np array) with the same length as the input
        '''
        self.fit(array)
        return self.transform(array, to_np)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment