Created
October 11, 2017 13:47
-
-
Save sminez/8270a87b0532ecbab6a73faf274bc73b to your computer and use it in GitHub Desktop.
Data manipulation tools
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Name of the matplotlib colormap used by default when drawing the
# missing-data heatmap ('RdYlBu_r' is the reversed Red-Yellow-Blue map).
DEFAULT_CMAP = 'RdYlBu_r'
def show_missing_data(df, cmap=DEFAULT_CMAP):
    '''
    Draw a heatmap showing which cells of a dataframe are null.

    A new 16x8 matplotlib figure is created and the boolean mask
    `df.isnull()` is rendered onto it with seaborn, using `cmap` as the
    colormap and with the row tick labels switched off. Nothing is returned.

    Note that this function does not call `plt.show()` itself: in a notebook
    the figure is displayed inline, elsewhere the caller's matplotlib
    backend decides how (and when) the plot window appears.

    Raises ValueError if `df` is not a pandas DataFrame.
    '''
    is_frame = isinstance(df, pd.DataFrame)
    if not is_frame:
        raise ValueError('df must be a pandas DataFrame')

    figure_size = (16, 8)
    plt.figure(figsize=figure_size)

    null_mask = df.isnull()
    sns.heatmap(null_mask, yticklabels=False, cmap=cmap)
def show_percentage_null(df):
    '''
    Print the percentage of null values for every column that has any.

    One line of the form "<column>: <percentage>% NULL" is printed per
    affected column, in column order, with the percentage rounded to two
    decimal places. Columns that contain no nulls are skipped, so an empty
    dataframe prints nothing. Returns None.

    Raises ValueError if `df` is not a pandas DataFrame.
    '''
    if not isinstance(df, pd.DataFrame):
        raise ValueError('df must be a pandas DataFrame')

    total_rows = len(df)

    # Count the nulls of every column in a single pass instead of building a
    # filtered copy of the whole dataframe once per column.
    null_counts = df.isnull().sum()

    for col, count in null_counts.items():
        # Convert to a plain int so the percentage is an ordinary Python
        # float and prints the same way regardless of the numpy version.
        count = int(count)
        if count > 0:
            perc = round(count / total_rows * 100, 2)
            print("{col}: {perc}% NULL".format(col=col, perc=perc))
def fill_na_with_grouped_means(df, fill_column, grouping_column, dp=2):
    '''
    Fill the missing values of `fill_column` with per-group means.

    The rows of `df` are partitioned by `grouping_column`, the mean of
    `fill_column` is computed within each partition (ignoring missing
    values) and rounded to `dp` decimal places, and every missing entry of
    `fill_column` is replaced by the rounded mean of its own group.
    Values that are already present are left exactly as they are.

    The dataframe is modified in place and nothing is returned. Entries
    whose group has no observed value at all (so no mean exists) remain
    missing.

    Raises ValueError if `df` is not a pandas DataFrame.
    '''
    if not isinstance(df, pd.DataFrame):
        raise ValueError('df must be a pandas DataFrame')

    # Select the target column *before* aggregating so that unrelated,
    # possibly non-numeric, columns are never averaged. `transform` returns
    # one mean per row, aligned with df's index, ready to be used as the
    # fill values.
    group_means = (
        df.groupby(grouping_column)[fill_column]
        .transform('mean')
        .round(dp)
    )

    df[fill_column] = df[fill_column].fillna(group_means)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment