korkridake · February 3, 2022 20:23
diff --git a/DQA-Python-Script.py b/DQA-Python-Script.py
 #######################################################################
 # This is Kyle's Python script for Data Quality Assessment
 # Assume df = your dataframe
 # Replace "df" with "[Your Dataframe]"
 #######################################################################

 #######################################################################
 # Dependencies
 #######################################################################
 import numpy as np # np.nan is the missing-value marker used below

 #######################################################################
 # Check summary statistics
 #######################################################################
 df.describe()

 #######################################################################
 # Check missing values per row and per column
 #######################################################################
 df.isnull().sum(axis = 1) # NULL count per row (sums across the columns)
 df.isnull().sum(axis = 0) # NULL count per column (sums down the rows)

 #######################################################################
 # Replace empty value as NULL
 #######################################################################
 # replace() returns a new DataFrame, so the result is assigned back;
 # otherwise blank / whitespace-only cells would never become NaN.
 df = df.replace(r'^\s*$', np.nan, regex=True)

 #######################################################################
 # Missing Data Treatment
 #######################################################################
 # NOTE: the un-assigned calls below return new DataFrames and leave df
 # untouched; assign the result to a name to keep it.
 df_no_missing = df.dropna() # Drop every row that has at least one missing value
 df_cleaned = df.dropna(how='all') # Drop rows where all cells in that row are NA
 df['location'] = np.nan # Create a new column full of missing values
 df.dropna(axis=1, how='all') # Drop columns that only contain missing values
 df.dropna(thresh=5) # Keep only rows with at least five non-missing values
 df.fillna(0) # Fill in missing data with zeros

 # Assign the filled column back instead of calling fillna(inplace=True) on
 # df["col"]: the in-place form works on an intermediate Series and does not
 # update df when pandas Copy-on-Write is active.
 df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean()) # Fill missing preTestScore with the column mean
 df["postTestScore"] = df["postTestScore"].fillna(df.groupby("sex")["postTestScore"].transform("mean")) # Fill missing postTestScore with the mean postTestScore of the row's sex group
 df[df['age'].notnull() & df['sex'].notnull()] # Select the rows of df where age is not NaN and sex is not NaN
	#######################################################################
	# This is Kyle's Python script for Data Quality Assessment
	# Assume df = your dataframe
	# Replace "df" with "[Your Dataframe]"
	#######################################################################

	#######################################################################
	# Dependencies
	#######################################################################
	import numpy as np # np.nan is the missing-value marker used below

	#######################################################################
	# Check summary statistics
	#######################################################################
	df.describe()

	#######################################################################
	# Check missing values per row and per column
	#######################################################################
	df.isnull().sum(axis = 1) # NULL count per row (sums across the columns)
	df.isnull().sum(axis = 0) # NULL count per column (sums down the rows)

	#######################################################################
	# Replace empty value as NULL
	#######################################################################
	# replace() returns a new DataFrame, so the result is assigned back;
	# otherwise blank / whitespace-only cells would never become NaN.
	df = df.replace(r'^\s*$', np.nan, regex=True)

	#######################################################################
	# Missing Data Treatment
	#######################################################################
	# NOTE: the un-assigned calls below return new DataFrames and leave df
	# untouched; assign the result to a name to keep it.
	df_no_missing = df.dropna() # Drop every row that has at least one missing value
	df_cleaned = df.dropna(how='all') # Drop rows where all cells in that row are NA
	df['location'] = np.nan # Create a new column full of missing values
	df.dropna(axis=1, how='all') # Drop columns that only contain missing values
	df.dropna(thresh=5) # Keep only rows with at least five non-missing values
	df.fillna(0) # Fill in missing data with zeros

	# Assign the filled column back instead of calling fillna(inplace=True) on
	# df["col"]: the in-place form works on an intermediate Series and does not
	# update df when pandas Copy-on-Write is active.
	df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean()) # Fill missing preTestScore with the column mean
	df["postTestScore"] = df["postTestScore"].fillna(df.groupby("sex")["postTestScore"].transform("mean")) # Fill missing postTestScore with the mean postTestScore of the row's sex group
	df[df['age'].notnull() & df['sex'].notnull()] # Select the rows of df where age is not NaN and sex is not NaN