Skip to content

Instantly share code, notes, and snippets.

@veqtor
Created August 26, 2020 10:38
Show Gist options
  • Save veqtor/e5258c36259515bbb9841e9fe965fb13 to your computer and use it in GitHub Desktop.
Save veqtor/e5258c36259515bbb9841e9fe965fb13 to your computer and use it in GitHub Desktop.
Impute anything (pandas, scikit-learn)
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
def impute_missing(df, inplace=False):
    """Fill missing values in every column of *df* using KNN imputation.

    Float columns are imputed directly. Every other column is treated as
    categorical: it is one-hot encoded (a row whose category is missing is
    NaN across all of that column's indicators), imputed together with the
    float columns, and decoded back to the category whose indicator received
    the highest imputed score. ``inf`` and ``-inf`` count as missing.

    Columns that contain no observed value at all are left untouched, since
    there is nothing to learn their values from.

    Args:
        df: The pandas DataFrame to impute.
        inplace: When true, modify *df* itself; otherwise work on a copy.

    Returns:
        The imputed DataFrame (*df* itself when ``inplace`` is true).
    """
    if not inplace:
        df = df.copy()
    # In-place on purpose: rebinding ``df`` to a new frame here would make
    # ``inplace=True`` silently stop affecting the caller's object.
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    if not df.isna().to_numpy().any():
        # Nothing to impute; skip fitting and keep every dtype as it is.
        return df

    feature_blocks = []   # 2-D float arrays, stacked column-wise below
    numeric_layout = []   # (column, feature position)
    category_layout = []  # (column, observed mask, categories, start, stop)
    width = 0
    for col in df.columns:
        if pd.api.types.is_float_dtype(df[col].dtype):
            values = df[col].to_numpy(dtype=float, na_value=np.nan)
            if np.isnan(values).all():
                # An all-missing feature carries no information, and the
                # imputer discards such columns by default, which would
                # shift every later column position.
                continue
            feature_blocks.append(values.reshape(-1, 1))
            numeric_layout.append((col, width))
            width += 1
            continue

        # Missing entries get code -1; ``categories`` keeps original values
        # (and dtype), so decoding does not depend on parsing column names.
        codes, categories = pd.factorize(df[col])
        if len(categories) == 0:
            continue
        observed = codes >= 0
        indicators = np.zeros((len(df), len(categories)))
        indicators[observed, codes[observed]] = 1.0
        indicators[~observed] = np.nan
        feature_blocks.append(indicators)
        category_layout.append(
            (col, observed, categories, width, width + len(categories))
        )
        width += len(categories)

    if not feature_blocks:
        return df

    imputer = KNNImputer(n_neighbors=5, weights='distance')
    # fit_transform returns a plain ndarray, so columns are addressed by the
    # positions recorded above rather than by label.
    imputed = imputer.fit_transform(np.hstack(feature_blocks))

    for col, position in numeric_layout:
        filled = pd.Series(imputed[:, position], index=df.index)
        df[col] = filled.astype(df[col].dtype)

    for col, observed, categories, start, stop in category_layout:
        if observed.all():
            continue  # no gaps in this column; leave it exactly as it was
        # Observed rows are still exact one-hot vectors, so argmax returns
        # their own category; imputed rows get the best-supported category.
        winners = imputed[:, start:stop].argmax(axis=1)
        df[col] = pd.Series(categories.take(winners), index=df.index)

    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment