decorouz · August 5, 2024 13:46
diff --git a/count_special_values.py b/count_special_values.py
 # When analyzing survey data, it is common for missing values to be represented by various placeholders 
 # such as "DK", "NO RESPONSE", "Missing/DK", and "REFUSED". During exploratory data analysis (EDA), 
 # it is standard practice to examine instances of missing values using the command df.isna().sum(). 
 # However, this command does not account for the aforementioned placeholders. 
 # Consequently, it is beneficial to employ a utility function that computes the count of these specific 
 # representations of missing values.

 def count_special_values(df: pd.DataFrame) -> pd.DataFrame:
     """
     Count placeholder "missing" answers and real NaN values in each column.

     The placeholders counted are the exact strings "DK", "NO RESPONSE",
     "Missing/DK" and "REFUSED" (case-sensitive, whole-cell match).

     Parameters:
     df (pandas DataFrame): The input DataFrame

     Returns:
     pandas DataFrame: One row per input column (the input column name is
     the row label) and one result column per placeholder plus a "MISSING"
     column holding the NaN count. Input columns with no placeholder and no
     NaN are left out of the result.
     """
     # "REFUSED" was promised by the documentation but missing from this
     # list, so refused answers were silently ignored.
     special_values = ["DK", "NO RESPONSE", "Missing/DK", "REFUSED"]

     # Outer keys become result columns, inner keys become result rows.
     counts = {val: {} for val in special_values}
     counts["MISSING"] = {}

     for column in df.columns:
         series = df[column]
         for val in special_values:
             counts[val][column] = (series == val).sum()
         # Count missing values (NaN) in the current column
         counts["MISSING"][column] = series.isna().sum()

     result_df = pd.DataFrame(counts)

     # Keep only the rows (i.e. input columns) with at least one non-zero count.
     return result_df.loc[(result_df > 0).any(axis=1)]
	# When analyzing survey data, it is common for missing values to be represented by various placeholders
	# such as "DK", "NO RESPONSE", "Missing/DK", and "REFUSED". During exploratory data analysis (EDA),
	# it is standard practice to examine instances of missing values using the command df.isna().sum().
	# However, this command does not account for the aforementioned placeholders.
	# Consequently, it is beneficial to employ a utility function that computes the count of these specific
	# representations of missing values.

	def count_special_values(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Tally survey placeholder values and NaN values for every column.

	    A cell counts as a placeholder when it equals one of the exact
	    strings "DK", "NO RESPONSE", "Missing/DK" or "REFUSED".

	    Parameters:
	    df (pandas DataFrame): The input DataFrame

	    Returns:
	    pandas DataFrame: Rows are labelled by input column name; there is
	    one result column per placeholder and a final "MISSING" column with
	    the NaN count. Input columns whose counts are all zero are omitted.
	    """
	    # The documentation lists "REFUSED" as a counted placeholder, so it
	    # has to be part of this list as well.
	    special_values = ["DK", "NO RESPONSE", "Missing/DK", "REFUSED"]

	    # Create a dictionary to hold the counts: {result column: {input column: count}}
	    counts = {val: {} for val in special_values}
	    counts["MISSING"] = {}

	    for column in df.columns:
	        values = df[column]
	        for val in special_values:
	            counts[val][column] = (values == val).sum()
	        # Count missing values (NaN) in the current column
	        counts["MISSING"][column] = values.isna().sum()

	    result_df = pd.DataFrame(counts)

	    # Drop the rows (input columns) where every count is 0
	    result_df = result_df.loc[(result_df > 0).any(axis=1)]

	    return result_df