Last active
March 22, 2020 09:51
-
-
Save pjoter/9f897116322dbc8aa891032a0e1c5c00 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTENC.html
# SMOTENC (SMOTE) for pandas DataFrames
# - this code uses SMOTENC (imbalanced-learn library) to oversample imbalanced data
# - it preserves the DataFrame object, column names and dtypes
import pandas as pd
from imblearn.over_sampling import SMOTENC
def col_ins(ds, var):
    """Translate column names into their positional indices.

    Args:
        ds: Object exposing a ``columns`` index with a ``get_loc`` method
            (a pandas DataFrame in this file).
        var: Iterable of column names to look up.

    Returns:
        A list with one ``ds.columns.get_loc`` result per name, in the
        same order as ``var``.
    """
    return [ds.columns.get_loc(col) for col in var]
def smotenc(X, y, cat_var_ins):
    """Oversample ``X`` and ``y`` with SMOTENC.

    Args:
        X: Feature matrix handed to the sampler.
        y: Target values handed to the sampler.
        cat_var_ins: Positional indices of the categorical columns of ``X``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by the sampler.
    """
    # A fixed seed keeps the generated samples reproducible between runs.
    sm = SMOTENC(random_state=42, categorical_features=cat_var_ins)
    # ``fit_sample`` is the legacy spelling and is gone from current
    # imbalanced-learn releases; ``fit_resample`` is the supported method.
    return sm.fit_resample(X, y)
def df_smotenc(df, dep_var, cat_var):
    """Oversample a DataFrame with SMOTENC, keeping column names and dtypes.

    Args:
        df: Input DataFrame holding both the features and the target column.
        dep_var: Name of the target (dependent variable) column.
        cat_var: Names of the categorical feature columns.

    Returns:
        A new DataFrame with ``dep_var`` as the first column followed by the
        feature columns, cast back to the dtypes of ``df``.
    """
    y = df[dep_var]
    X = df.drop(dep_var, axis=1)
    cat_var_ins = col_ins(X, cat_var)

    X_res, y_res = smotenc(X, y, cat_var_ins)

    # Rebuild labelled pandas objects in case the sampler returned bare arrays.
    X_res = pd.DataFrame(X_res, columns=X.columns)
    y_res = pd.DataFrame(y_res, columns=[dep_var])
    df_res = y_res.merge(X_res, left_index=True, right_index=True)

    # Restore the original dtypes from the input frame. The original code
    # read them from an undefined name ``ds``, which raised NameError.
    return df_res.astype(df.dtypes.to_dict())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment