Last active
July 18, 2022 19:44
-
-
Save achinta/d56aa05a3185248cb9beff314dcbf1f6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.base import TransformerMixin | |
class CuCategoryEncoder(TransformerMixin):
    """Incremental label encoder for columns of a cudf DataFrame.

    Once ``fit`` has been called, ``sklearn.preprocessing.LabelEncoder``
    cannot encode new categories. This encoder can be fitted any number of
    times: each call appends values it has not seen before to the end of the
    category table, so the codes of existing categories never change.

    A value's code is its row position in the category table stored in
    ``cats`` (a dict mapping a category key to a one-column cudf DataFrame
    named ``'cats'``).

    NOTE(review): ``cudf`` is referenced throughout but no import of it is
    visible in this file; it must be in scope at module level.
    NOTE(review): the input frame must not already contain columns named
    ``'cats'`` or ``'index'`` that are also being encoded/decoded, since those
    names are used for the temporary join columns.
    """

    def __init__(self, cols, auto_fit=False, share_cats=False):
        """Configure the encoder.

        Args:
            cols: list of column names to encode.
            auto_fit: when true, ``transform`` calls ``fit`` on its input
                first, so unseen values are assigned codes on the fly.
            share_cats: when true, all columns share one category table
                (stored under the key ``'shared'``); otherwise each column
                gets its own table keyed by the column name.

        Raises:
            TypeError: if ``cols`` is not a list.
        """
        # Explicit check instead of ``assert``: asserts vanish under ``-O``.
        if not isinstance(cols, list):
            raise TypeError(
                f"cols must be a list of column names, got {type(cols).__name__}"
            )
        self.cols = cols
        self.auto_fit = auto_fit
        self.share_cats = share_cats
        # Category tables live on the instance. A class-level dict would be
        # shared (and mutated) by every encoder ever created.
        self.cats = {}

    def _cat_key(self, col):
        """Return the key in ``self.cats`` that holds the categories of *col*."""
        return 'shared' if self.share_cats else col

    def fit(self, df, y=None):
        """Add every not-yet-known value of the configured columns to ``cats``.

        Args:
            df: cudf DataFrame containing all columns listed in ``cols``.
            y: ignored; accepted so the scikit-learn ``fit(X, y)`` calling
                convention (used by ``fit_transform``) works.

        Returns:
            ``self``, to allow chaining.
        """
        for col in self.cols:
            cat_key = self._cat_key(col)
            known = self.cats.get(cat_key)
            if known is None:
                # Start with an empty table that has the column's dtype so
                # the join below compares like with like.
                known = cudf.DataFrame({'cats': []}, dtype=df[col].dtype)

            # Left-join only the column of interest against the known
            # categories; rows without a match carry a null in 'cats'.
            joined = df[[col]].merge(
                known, left_on=col, right_on='cats', how='left'
            )
            unseen = joined[joined['cats'].isnull()][col].unique()
            new_cats = cudf.DataFrame({'cats': unseen})

            # Append at the end so existing row positions (codes) are kept.
            self.cats[cat_key] = cudf.concat(
                [known, new_cats], ignore_index=True
            )
        return self

    def transform(self, df):
        """Replace each configured column by the integer codes of its values.

        If ``auto_fit`` is set, ``fit(df)`` is run first. Values that are not
        in the category table get a null code because a left join is used.

        Args:
            df: cudf DataFrame containing all columns listed in ``cols``.

        Returns:
            A new DataFrame in which every column of ``cols`` holds codes.
            NOTE(review): the result is produced by merges, so column order
            changes and row order is whatever cudf's merge yields - confirm
            if callers rely on ordering.

        Raises:
            KeyError: if a column's category table does not exist yet
                (``fit`` was never called and ``auto_fit`` is false).
        """
        if self.auto_fit:
            self.fit(df)
        for col in self.cols:
            # reset_index() exposes the row position as an 'index' column,
            # which is the code assigned to each category.
            lookup = self.cats[self._cat_key(col)].reset_index()
            df = (
                df.merge(lookup, left_on=col, right_on='cats', how='left')
                .drop([col, 'cats'], axis=1)
                .rename(columns={'index': col})
            )
        return df

    def inverse_transform(self, df):
        """Map the codes in each configured column back to original values.

        Args:
            df: cudf DataFrame whose ``cols`` columns hold codes produced by
                ``transform``.

        Returns:
            A new DataFrame in which every column of ``cols`` holds the
            category values again; codes with no table entry become null.

        Raises:
            KeyError: if a column's category table does not exist yet.
        """
        for col in self.cols:
            lookup = self.cats[self._cat_key(col)].reset_index()
            df = (
                df.merge(lookup, how='left', left_on=col, right_on='index')
                # axis=1 is required: without it drop() targets row labels.
                .drop(['index', col], axis=1)
                .rename(columns={'cats': col})
            )
        return df
# Testing it
# Build two small random integer columns and copy them to the GPU.
pdf = pd.DataFrame(
    {
        'sa': np.random.randint(1, 10, 3),
        'da': np.random.randint(10, 20, 3),
    }
)
df = cudf.from_pandas(pdf)

# Both columns share one category table; transform() refits before encoding.
encoder = CuCategoryEncoder(cols=['sa', 'da'], share_cats=True, auto_fit=True)

# keep repeating the following lines
encoder.fit(df)
print('df:\n ', df)
print('cats:\n ', encoder.cats)
transformed = encoder.transform(df)
print('trans:\n ', transformed.head())
print('inv:\n ', encoder.inverse_transform(transformed))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment