Created
September 1, 2021 14:54
-
-
Save davipatti/e2f2da8bdb24225f75cff534ce9c18e7 to your computer and use it in GitHub Desktop.
[Remove missing data from pandas DataFrame] #pandas #python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def remove_missing(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove missing data from a DataFrame.

    Repeatedly drop whichever rows or columns hold the highest proportion of
    missing values until no missing values remain.

    On each pass the fraction of missing entries is computed for every row
    and every column. If the worst row fraction is greater than or equal to
    the worst column fraction, every row tied at that worst fraction is
    dropped; otherwise every column tied at the worst column fraction is
    dropped.

    Args:
        df: The DataFrame to clean. It is not modified.

    Returns:
        A DataFrame with no missing values. If ``df`` already has no missing
        values (or has zero rows or zero columns) it is returned as-is.
        The result may have zero rows, e.g. when every value is missing.
    """
    # Iterate instead of recursing so that inputs needing many removal
    # steps cannot exhaust the interpreter's recursion limit.
    while True:
        n_rows, n_cols = df.shape

        # With zero rows or zero columns the fractions below would be 0/0
        # (NaN), NaN never compares equal to anything, and nothing would
        # ever be removed. An empty frame has no missing data, so stop here.
        if n_rows == 0 or n_cols == 0:
            return df

        na_mask = df.isna()
        row_na_frac = na_mask.sum(axis=1) / n_cols
        col_na_frac = na_mask.sum(axis=0) / n_rows
        row_na_max = row_na_frac.max()
        col_na_max = col_na_frac.max()

        if row_na_max == 0 and col_na_max == 0:
            return df

        if row_na_max >= col_na_max:
            # Keep every row that is not tied for the worst row fraction.
            df = df.iloc[row_na_frac.values != row_na_max]
        else:
            # Keep every column that is not tied for the worst column fraction.
            df = df.iloc[:, col_na_frac.values != col_na_max]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment