Skip to content

Instantly share code, notes, and snippets.

@danielecook
Last active April 19, 2020 15:32
Show Gist options
  • Save danielecook/2f4fb084342a872d90b4d96fe20f049c to your computer and use it in GitHub Desktop.
Save danielecook/2f4fb084342a872d90b4d96fe20f049c to your computer and use it in GitHub Desktop.
Data Dictionary
# Generate data dictionary
# Human-readable descriptions, keyed by the column name they describe.
DESCRIPTIONS <- list(
  mpg = "Miles Per Gallon"
)
# data dict function
#' Build a data dictionary for a data frame
#'
#' Produces one row per column of `data` holding the column's class, the
#' number of distinct non-missing values, the number of missing values, an
#' example value, and the descriptive statistics from `psych::describe()`.
#'
#' @param data A data frame (data.table and tibble inputs are data frames too).
#' @param descriptions Optional named list or named character vector mapping
#'   column names to descriptions. `NA` (the default), `NULL`, or an empty
#'   object means "no descriptions".
#' @param print_table Currently has no effect; kept so existing calls that
#'   pass it keep working.
#' @return A data.table with columns `variable`, `variable_type`,
#'   `cardinality`, `n_missing`, `example`, `n`, `mean`, `sd`, `median`, `se`,
#'   `min`, `max`, `range` and, when descriptions are supplied, a character
#'   column `description`. Rows follow the column order of `data`. The summary
#'   statistics are rounded to two decimals and replaced by `""` for columns
#'   whose class is not numeric or integer.
data_dict <- function(data, descriptions = NA, print_table = FALSE) {
  # Adapted from https://github.com/pw2/Data-Dictionary-Function/blob/master/Data%20Dictionary%20Function.R
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame.", call. = FALSE)
  }

  # requireNamespace() only probes for pacman; require() would also attach it
  # and silently return FALSE on failure. The package list is unchanged so
  # scripts relying on these packages being attached afterwards still work.
  if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
  pacman::p_load(
    "tidyverse", "psych", "ggpubr", "gridExtra", "knitr", "data.table"
  )

  # scipen is a session-wide option: apply it for this call only.
  old_opts <- options(scipen = 999)
  on.exit(options(old_opts), add = TRUE)

  stat_cols <- c("mean", "sd", "median", "se", "min", "max", "range")

  # First non-missing value as a string of at most 20 characters;
  # NA when the whole column is missing.
  example_value <- function(x) {
    present <- x[!is.na(x)]
    if (length(present) == 0L) {
      return(NA_character_)
    }
    stringr::str_trunc(as.character(present[[1L]])[1L], width = 20)
  }

  # Get variable info and NAs. vapply() pins the result type; class(x)[1L]
  # keeps multi-class columns (e.g. POSIXct) down to one label per column.
  var_info <- data.table::data.table(
    variable = names(data),
    variable_type = vapply(data, function(x) class(x)[1L], character(1)),
    cardinality = vapply(
      data,
      function(x) length(unique(x[!is.na(x)])),
      integer(1)
    ),
    n_missing = vapply(data, function(x) sum(is.na(x)), integer(1)),
    example = vapply(data, example_value, character(1))
  )

  # Get descriptive stats. fast = FALSE forces the full set of statistics
  # (including median) regardless of data size; columns are picked by name
  # instead of by position.
  stats <- as.data.frame(psych::describe(data, fast = FALSE))
  desc_stats <- data.frame(
    variable = names(data),
    stats[c("n", stat_cols)],
    row.names = NULL
  )

  # Create the data dictionary, keeping the column order of `data`.
  d_dict <- merge(var_info, desc_stats, by = "variable", sort = FALSE)

  # Round the summary stats, and blank them for variables that are not of
  # class numeric or integer.
  is_num <- d_dict$variable_type %in% c("numeric", "integer")
  for (col in stat_cols) {
    data.table::set(
      d_dict,
      j = col,
      value = ifelse(is_num, round(d_dict[[col]], 2), "")
    )
  }

  # Merge in descriptions. The check must be scalar: is.na() on a list or
  # vector with several entries yields a length > 1 condition.
  has_descriptions <- !is.null(descriptions) &&
    length(descriptions) > 0L &&
    !(length(descriptions) == 1L && isTRUE(is.na(descriptions[[1L]])))

  if (has_descriptions) {
    if (is.null(names(descriptions))) {
      stop("`descriptions` must be named by variable.", call. = FALSE)
    }

    # Flatten to one string per entry so `description` is a plain character
    # column rather than a list column.
    desc_text <- vapply(
      descriptions,
      function(d) {
        if (length(d) == 0L || all(is.na(d))) {
          return("")
        }
        paste(as.character(d), collapse = " ")
      },
      character(1)
    )

    # Look up by name instead of merging so the row order is left untouched.
    matched <- unname(desc_text[match(d_dict$variable, names(descriptions))])
    matched[is.na(matched)] <- ""
    d_dict[, description := matched]

    missing_vars <- d_dict$variable[!d_dict$variable %in% names(descriptions)]
    if (length(missing_vars) > 0) {
      warning(
        paste0(
          "\nMissing variable descriptions for: \n\n",
          paste(missing_vars, collapse = "\n")
        ),
        call. = FALSE
      )
    }
  }

  # Trailing [] makes the result print at the console after `:=` was used.
  d_dict[]
}
# Example: data dictionary for the built-in mtcars data set.
data_dict(data = mtcars, descriptions = DESCRIPTIONS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment