Skip to content

Instantly share code, notes, and snippets.

@davipatti
Created September 1, 2021 14:54
Show Gist options
  • Save davipatti/e2f2da8bdb24225f75cff534ce9c18e7 to your computer and use it in GitHub Desktop.
Save davipatti/e2f2da8bdb24225f75cff534ce9c18e7 to your computer and use it in GitHub Desktop.
[Remove missing data from pandas DataFrame] #pandas #python
import pandas as pd
def remove_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove missing data from a DataFrame.

    Repeatedly drop whichever rows or columns contain the highest proportion
    of missing data, until no missing data is left. At each step the highest
    row proportion is compared with the highest column proportion; rows are
    dropped when theirs is greater or equal, columns otherwise. Every row
    (or column) that ties for the highest proportion is dropped in the same
    step.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A DataFrame without missing values. If `df` has no missing values (or
        is already empty) it is returned as-is. The result may have zero rows
        or zero columns if that is what it takes to remove all missing data.
    """
    # Iterate rather than recurse so that frames needing many removal steps
    # cannot exhaust the interpreter's recursion limit.
    while True:
        n_rows, n_cols = df.shape

        # An empty frame has no missing data. Without this guard the
        # proportions below become NaN (0 / 0 or max of nothing), every
        # comparison against them is False, and the loop would never end.
        if n_rows == 0 or n_cols == 0:
            return df

        na_mask = df.isna()
        row_na_frac = na_mask.sum(axis=1) / n_cols
        col_na_frac = na_mask.sum(axis=0) / n_rows
        row_na_max = row_na_frac.max()
        col_na_max = col_na_frac.max()

        if row_na_max == 0 and col_na_max == 0:
            return df

        if row_na_max >= col_na_max:
            # Keep only the rows that are not the worst offenders.
            df = df.iloc[row_na_frac.values != row_na_max]
        else:
            # Keep only the columns that are not the worst offenders.
            df = df.iloc[:, col_na_frac.values != col_na_max]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment