Last active
February 3, 2022 20:23
-
-
Save korkridake/01aee8dbe49af2766a40b185e4124dd0 to your computer and use it in GitHub Desktop.
Data Quality Assessment Script Using Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#######################################################################
# This is Kyle's Python script for Data Quality Assessment
# Assumes df is your pandas DataFrame and NumPy is imported as np
# Replace "df" with the name of your own DataFrame
#######################################################################
#######################################################################
# Check summary statistics
#######################################################################
# describe() summarises the numeric columns by default (count, mean, std,
# min, quartiles, max); pass include='all' to cover every column.
# As a bare expression the result is only displayed in a notebook/REPL;
# wrap it in print() when running this file as a script.
df.describe()
#######################################################################
# Check missing values for each row and each column
#######################################################################
# isnull() yields a boolean frame; summing it counts the True (missing) cells.
df.isnull().sum(axis=1)  # NULL count per row (sums across the columns)
df.isnull().sum(axis=0)  # NULL count per column (sums down the rows)
#######################################################################
# Replace empty value as NULL
#######################################################################
# replace() returns a new DataFrame rather than modifying df, so the result
# is assigned back. The regex matches cells that are empty or contain only
# whitespace.
df = df.replace(r'^\s*$', np.nan, regex=True)
#######################################################################
# Missing Data Treatment
#######################################################################
# dropna() and fillna() return new DataFrames and leave df untouched, so each
# alternative treatment below is bound to its own name.
df_no_missing = df.dropna()  # Drop rows that contain any missing value
df_cleaned = df.dropna(how='all')  # Drop rows where every cell in that row is NA
df['location'] = np.nan  # Create a new column full of missing values
df_no_empty_columns = df.dropna(axis=1, how='all')  # Drop columns that contain only missing values
df_min_five_values = df.dropna(thresh=5)  # Keep only rows with at least five non-missing values
df_zero_filled = df.fillna(0)  # Fill in missing data with zeros

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[...]: the chained in-place form operates on an intermediate object and
# does not update df when pandas Copy-on-Write is active.
# Fill in missing preTestScore values with the mean value of preTestScore
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
# Fill in missing postTestScore values with each sex's mean value of postTestScore
df["postTestScore"] = df["postTestScore"].fillna(
    df.groupby("sex")["postTestScore"].transform("mean")
)

# Select the rows of df where age is not NaN and sex is not NaN
df_age_and_sex_known = df[df['age'].notnull() & df['sex'].notnull()]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment