Last active
December 15, 2020 19:30
-
-
Save MokeEire/cb5b8768e544d8f8b22718c245eb75d2 to your computer and use it in GitHub Desktop.
Build a codebook from a dataframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(lubridate) | |
# The function below was written primarily with the aim of producing a codebook which can be exported to Excel. | |
# If you are trying to get a quick understanding of your data, str(), glimpse(), or summary() will all likely better serve you | |
# If you use this and make any improvements, please let me know. | |
# There's lots of room for this to be improved. | |
# - Check for presence of each data type, and perform processing accordingly | |
# - Check if the remaining df is empty after summarising each data type | |
# I wrote it without any safeguards because I know what I use it for but it could definitely use some more careful argument checking | |
build_codebook = function(df, factor_levels, description_df){ | |
# This function takes a dataframe as an input, | |
# partitions the data into factors, numerics, characters, and dates, | |
# and summarises the data for each group. | |
# | |
# Arguments | |
# | |
# @df: the data frame to summarise | |
# @factor_levels: the max number of levels that can be in a factor | |
# @description_df: a two column dataframe containing the column names in df and a corresponding description of the column name | |
# | |
# This summary differs for each group: | |
# Factors | |
# - A count of distinct values for each column | |
# - List distinct values, and their frequency for each column | |
# - Missingness | |
# Numerics | |
# - Summary stats: min, mean, median, max | |
# - Distinct values | |
# - Missingness | |
# Characters | |
# - A count of distinct values | |
# - Missingness | |
# Dates | |
# - Summary stats: min, mean, median, max | |
# - Distinct values | |
# - Missingness | |
# A dataframe to track remaining columns | |
# After each variable type, we will remove the variables from this dataframe | |
# This means that we don't double count character variables as both character and factor | |
remaining = df | |
################################ | |
## Factor vars | |
################################ | |
# Select variables which have at most the number given by factor_levels | |
factor_vars = df %>% | |
select(where(~n_distinct(., na.rm=T) < factor_levels)) | |
# Summarise factor variables | |
factor_summary = factor_vars %>% | |
# Convert all variables to character in order to pivot their values into the same column | |
mutate(across(.fns = as.character)) %>% | |
# Pivot data, one row is a column-value pair | |
pivot_longer(cols = everything(), names_to = "column", values_to = "value") %>% | |
# 1. Count obs, missingness for each column | |
group_by(column) %>% | |
mutate(n = n(), | |
missing = sum(is.na(value))) %>% | |
ungroup() %>% | |
# 2. Get frequency of each distinct value | |
count(column, value, n, missing, name = "value_count") %>% | |
group_by(column) %>% | |
# Ensure count is a dbl, calculate missingness % and number of distinct values | |
mutate(min = NA_character_, | |
mean = NA_character_, | |
median = NA_character_, | |
max = NA_character_, | |
value_count = as.double(value_count), | |
missing_pct = scales::percent(missing/n), | |
distinct_values = n_distinct(value, na.rm=T), | |
type = "factor") %>% # Assign variable type | |
ungroup() %>% | |
# Now remove missing values (we have already counted them) | |
filter(!is.na(value) | distinct_values == 0) | |
# Remove factor vars from the remaining df | |
remaining = remaining %>% | |
select(-colnames(factor_vars)) | |
################################ | |
## Numeric vars | |
################################ | |
if( | |
map(remaining, class) %>% | |
some(~(.=="numeric")) | |
){ | |
# Select numeric variables | |
numeric_vars = remaining %>% | |
select(where(is.numeric)) | |
numeric_summary = numeric_vars %>% | |
# Summarise values: min, mean, median, max, missingness, and n distinct | |
summarise( | |
n = n(), | |
across(.fns = list( | |
min = ~round(min(., na.rm=T), 3), | |
mean = ~round(mean(., na.rm=T), 3), | |
median = ~round(median(., na.rm=T), 3), | |
max = ~round(max(., na.rm=T), 3), | |
missing = ~sum(is.na(.)), | |
distinct_values = ~n_distinct(., na.rm = T) | |
) | |
) | |
) %>% | |
# Pivot to have a row per column-measure | |
pivot_longer(-n, names_to = "col", values_to = "val") %>% | |
# Separate column-measure concatenation | |
separate(col = col, into = c("column", "measure"), sep = "_(?=(min|mean|median|max|missing|missing_pct|distinct_values))") %>% | |
# Pivot measures to their own columns | |
pivot_wider(id_cols = c(column, n), names_from = measure, values_from = val) %>% | |
mutate(type = "numeric", | |
# Calculate missingness percent | |
missing_pct = scales::percent(missing/n, accuracy = .01)) %>% | |
# Convert numeric measures to character to allow min/max of dates | |
mutate(across(.cols = matches("^(min|mean|median|max)$"), .fns = as.character)) | |
# Remove numerics from remaining df | |
remaining = remaining %>% | |
select(-colnames(numeric_vars)) | |
} | |
################################ | |
## Character vars | |
################################ | |
if( | |
map(remaining, class) %>% | |
some(~(.=="character")) | |
){ | |
# Select character vars | |
character_vars = remaining %>% | |
select(where(is.character)) | |
character_summary = character_vars %>% | |
summarise( | |
n = n(), | |
across(.cols = -n, | |
.fns = list( | |
missing = ~sum(is.na(.)), | |
distinct_values = ~n_distinct(., na.rm = T) | |
) | |
) | |
) %>% | |
# Pivot to have a row per column-measure | |
pivot_longer(-n, names_to = "col", values_to = "val") %>% | |
# Separate column-measure concatenation | |
separate(col = col, into = c("column", "measure"), sep = "_(?=(missing|missing_pct|distinct_values))") %>% | |
# Pivot measures to their own columns | |
pivot_wider(id_cols = c(column, n), names_from = measure, values_from = val) %>% | |
mutate(min = NA_character_, | |
mean = NA_character_, | |
median = NA_character_, | |
max = NA_character_, | |
type = "character", | |
# Calculate missingness percent | |
missing_pct = scales::percent(missing/n, accuracy = .01)) | |
# Remove characters from remaining df | |
remaining = remaining %>% | |
select(-colnames(character_vars)) | |
} | |
################################ | |
## Date vars | |
################################ | |
if( | |
map(remaining, class) %>% | |
some(~(.=="date")) | |
){ | |
# Select date vars | |
date_vars = remaining %>% | |
select(where(is.POSIXct)) %>% | |
mutate(across( | |
.fns = ~date(.) | |
)) | |
date_summary = date_vars %>% | |
# Summarise values: min, mean, median, max, missingness, and n distinct | |
summarise( | |
n = n(), | |
across(.cols = -n, | |
.fns = list( | |
min = ~min(., na.rm=T), | |
mean = ~mean(., na.rm=T), | |
median = ~median(., na.rm=T), | |
max = ~max(., na.rm=T), | |
missing = ~sum(is.na(.)), | |
distinct_values = ~n_distinct(., na.rm = T) | |
) | |
) | |
) %>% | |
# Convert all variables to character in order to pivot their values into the same column | |
mutate(across(.cols = -n, .fns = as.character)) %>% | |
# Pivot to have a row per column-measure | |
pivot_longer(-n, names_to = "col", values_to = "val") %>% | |
# Separate column-measure concatenation | |
separate(col = col, into = c("column", "measure"), sep = "_(?=(min|mean|median|max|missing|missing_pct|distinct_values))") %>% | |
# Pivot to have a row per column | |
pivot_wider(id_cols = c(column, n), names_from = measure, values_from = val) %>% | |
mutate(type = "date", | |
across(.cols = c(missing, distinct_values), .fns = as.numeric), | |
# Calculate missingness percent | |
missing_pct = scales::percent(missing/n, accuracy = .01)) | |
# Remove dates from remaining df | |
remaining = remaining %>% | |
select(-colnames(date_vars)) | |
} | |
######################################## | |
## Join each summary together | |
######################################## | |
codebook = list("factor_summary", | |
"numeric_summary", | |
"character_summary", | |
"date_summary") %>% | |
# Keep the dataframe objects which exist | |
keep(exists, envir = rlang::env_parent()) %>% | |
# Evaluate them by their name and bind them into a dataframe | |
map_dfr(~eval(sym(.))) %>% | |
# Remove n | |
filter(column != "n") %>% | |
select(column, type, value, value_count, distinct_values, missing, missing_pct, min, mean, median, max) %>% | |
arrange(column, value) | |
# Include a description | |
if(!missing(description_df)){ | |
codebook = left_join(codebook, description_df, by = "column") %>% | |
select(column, type, description, missing, missing_pct, | |
distinct_values, value, value_count, | |
min, mean, median, max) %>% | |
# Make sure descriptive stats are not characters | |
mutate(across(.cols = c(min, mean, median, max), .fns = ~suppressWarnings(as.numeric(.)))) %>% | |
arrange(column, value) | |
} | |
return(codebook) | |
} | |
# > build_codebook(mtcars, factor_levels = 4) | |
# A tibble: 17 x 11 | |
# column type value value_count distinct_values missing missing_pct min mean median max | |
# <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> | |
# 1 am factor 0 19 2 0 0.00% NA NA NA NA | |
# 2 am factor 1 13 2 0 0.00% NA NA NA NA | |
# 3 carb numeric NA NA 6 0 0.00% 1 2.812 2 8 | |
# 4 cyl factor 4 11 3 0 0.00% NA NA NA NA | |
# 5 cyl factor 6 7 3 0 0.00% NA NA NA NA | |
# 6 cyl factor 8 14 3 0 0.00% NA NA NA NA | |
# 7 disp numeric NA NA 27 0 0.00% 71.1 230.722 196.3 472 | |
# 8 drat numeric NA NA 22 0 0.00% 2.76 3.597 3.695 4.93 | |
# 9 gear factor 3 15 3 0 0.00% NA NA NA NA | |
# 10 gear factor 4 12 3 0 0.00% NA NA NA NA | |
# 11 gear factor 5 5 3 0 0.00% NA NA NA NA | |
# 12 hp numeric NA NA 22 0 0.00% 52 146.688 123 335 | |
# 13 mpg numeric NA NA 25 0 0.00% 10.4 20.091 19.2 33.9 | |
# 14 qsec numeric NA NA 30 0 0.00% 14.5 17.849 17.71 22.9 | |
# 15 vs factor 0 18 2 0 0.00% NA NA NA NA | |
# 16 vs factor 1 14 2 0 0.00% NA NA NA NA | |
# 17 wt numeric NA NA 29 0 0.00% 1.513 3.217 3.325 5.424 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment