Last active
April 19, 2020 15:32
-
-
Save danielecook/2f4fb084342a872d90b4d96fe20f049c to your computer and use it in GitHub Desktop.
Data Dictionary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Generate data dictionary
# Human-readable descriptions, keyed by column name, for the example call to
# data_dict() at the end of this script.
DESCRIPTIONS <- list(
  mpg = "Miles Per Gallon"
)
#' Build a data dictionary for a data frame
#'
#' Produces one row per column of `data` holding the column's class, the number
#' of distinct non-missing values, the number of missing values, a truncated
#' example value and the descriptive statistics computed by
#' `psych::describe()`. Optionally attaches a free-text description per column.
#'
#' Adapted from
#' https://github.com/pw2/Data-Dictionary-Function/blob/master/Data%20Dictionary%20Function.R
#'
#' Side effects: installs `pacman` when it is not available and attaches the
#' packages listed in the `pacman::p_load()` call below.
#'
#' @param data A data frame (or data.table / tibble).
#' @param descriptions Optional named list or named character vector mapping
#'   column names to descriptions. `NA` (the default), `NULL` and empty input
#'   all mean "no descriptions".
#' @param print_table Currently unused; kept so existing calls keep working.
#' @return A data.table with the columns `variable`, `variable_type`,
#'   `cardinality`, `n_missing`, `example`, `n`, `mean`, `sd`, `median`, `se`,
#'   `min`, `max`, `range` and, when descriptions are supplied, `description`.
#'   Rows follow the column order of `data`. Statistics of columns that are
#'   not of class numeric or integer are blanked with `""`, which turns those
#'   statistic columns into character whenever such a column is present.
data_dict <- function(data, descriptions = NA, print_table = FALSE) {
  if (!is.data.frame(data)) {
    stop("`data` must be a data frame", call. = FALSE)
  }

  # requireNamespace() instead of require(): we only need to know whether the
  # package can be loaded, not to attach it.
  if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
  pacman::p_load("tidyverse", "psych", "ggpubr", "gridExtra", "knitr", "data.table")

  # Avoid scientific notation while the statistics are converted to text, but
  # give the caller their own setting back afterwards.
  old_opts <- options(scipen = 999)
  on.exit(options(old_opts), add = TRUE)

  # First non-missing value as text; NA when the column is entirely missing
  # (keeps `example` an ordinary character column).
  first_non_na <- function(x) {
    present <- x[!is.na(x)]
    if (length(present) == 0L) {
      return(NA_character_)
    }
    as.character(present[[1L]])[1L]
  }

  # Get variable info and NAs
  var_info <- data.table::data.table(
    variable = names(data),
    # Only the first class, so multi-class columns (e.g. POSIXct, ordered
    # factors) still give exactly one string per variable.
    variable_type = vapply(data, function(x) class(x)[1L], character(1),
                           USE.NAMES = FALSE),
    cardinality = vapply(data, function(x) length(unique(x[!is.na(x)])),
                         integer(1), USE.NAMES = FALSE),
    n_missing = vapply(data, function(x) sum(is.na(x)), integer(1),
                       USE.NAMES = FALSE),
    example = stringr::str_trunc(
      vapply(data, first_non_na, character(1), USE.NAMES = FALSE),
      width = 20
    )
  )

  # get descriptive stats (selected by name rather than by position)
  stat_cols <- c("mean", "sd", "median", "se", "min", "max", "range")
  desc_stats <- data.frame(variable = names(data),
                           psych::describe(data)[c("n", stat_cols)],
                           row.names = NULL)

  # Create the data dictionary; sort = FALSE keeps the column order of `data`.
  d_dict <- merge(var_info, data.table::as.data.table(desc_stats),
                  by = "variable", sort = FALSE)

  # Round the statistics and blank them for variables that are not of class
  # numeric or integer. set() replaces each column by reference without
  # suppressing the next print of the table.
  is_num <- d_dict$variable_type %in% c("numeric", "integer")
  for (col in stat_cols) {
    rounded <- round(d_dict[[col]], 2)
    data.table::set(d_dict, j = col, value = ifelse(is_num, rounded, ""))
  }

  # Merge in descriptions
  has_descriptions <- length(descriptions) > 0L &&
    !(length(descriptions) == 1L && isTRUE(is.na(descriptions[[1L]])))
  if (has_descriptions) {
    if (is.null(names(descriptions))) {
      stop("`descriptions` must be a named list or named character vector",
           call. = FALSE)
    }
    # Flatten to plain text so `description` is a character column whether a
    # list or a character vector was supplied.
    desc_text <- vapply(descriptions, function(d) as.character(d)[1L],
                        character(1), USE.NAMES = FALSE)
    matched <- desc_text[match(d_dict$variable, names(descriptions))]
    matched[is.na(matched)] <- ""
    data.table::set(d_dict, j = "description", value = matched)

    missing_vars <- d_dict$variable[!d_dict$variable %in% names(descriptions)]
    if (length(missing_vars) > 0L) {
      warning("\nMissing variable descriptions for: \n\n",
              paste(missing_vars, collapse = "\n"), call. = FALSE)
    }
  }

  d_dict
}
# Example: data dictionary for the built-in mtcars data set.
data_dict(data = mtcars, descriptions = DESCRIPTIONS)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment