danielecook · April 19, 2020 15:32
diff --git a/data_dictionary.R b/data_dictionary.R
 # Generate data dictionary

 # Column descriptions, one named entry per variable to be described
 DESCRIPTIONS <- list(
   mpg = "Miles Per Gallon"
 )

 # data dict function
 #' Build a data dictionary for a data frame
 #'
 #' Produces one row per column of `data` with its class, number of distinct
 #' non-missing values, number of missing values, an example value, and the
 #' summary statistics reported by `psych::describe()`.
 #'
 #' Side effects: installs `pacman` if it is not available, loads the packages
 #' listed in the `p_load()` call, and sets `options(scipen = 999)`. The option
 #' is NOT restored on exit (kept as in the original so later printing is
 #' unchanged).
 #'
 #' @param data A data frame.
 #' @param descriptions Named list (or named character vector) mapping column
 #'   names to a description. The default `NA` means "no descriptions"; in that
 #'   case no `description` column is added.
 #' @param print_table Currently unused; kept for backward compatibility.
 #' @return A data.table with columns `variable`, `variable_type`,
 #'   `cardinality`, `n_missing`, `example`, `n`, `mean`, `sd`, `median`, `se`,
 #'   `min`, `max`, `range`, plus `description` when `descriptions` is given.
 #'   Rows follow the column order of `data`. A warning lists variables that
 #'   have no description.
 data_dict <- function(data, descriptions = NA, print_table = FALSE) {
   # Adapted from https://github.com/pw2/Data-Dictionary-Function/blob/master/Data%20Dictionary%20Function.R
   stopifnot(is.data.frame(data))

   if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
   pacman::p_load('tidyverse', 'psych', 'ggpubr', 'gridExtra', 'knitr', 'data.table')

   # NOTE: global option, intentionally left set after the function returns.
   options(scipen = 999)

   # First non-missing value as a (truncated) string. Indexing with [1] yields
   # NA for an all-missing column, so every column contributes exactly one value.
   first_example <- function(x) {
     stringr::str_trunc(as.character(x[!is.na(x)][1]), width = 20)
   }

   # Get variable info and NAs. vapply() pins the result type so the columns
   # never silently become list columns.
   var_info <- data.table::data.table(
     variable = names(data),
     # Only the first class is used so multi-class columns (e.g. ordered
     # factors) still give a single string.
     variable_type = vapply(data, function(x) class(x)[1], character(1),
                            USE.NAMES = FALSE),
     cardinality = vapply(data, function(x) length(unique(x[!is.na(x)])),
                          integer(1), USE.NAMES = FALSE),
     n_missing = vapply(data, function(x) sum(is.na(x)), integer(1),
                        USE.NAMES = FALSE),
     example = vapply(data, first_example, character(1), USE.NAMES = FALSE)
   )

   # Get descriptive stats; columns are selected by name rather than position.
   stat_cols <- c("mean", "sd", "median", "se", "min", "max", "range")
   desc_stats <- data.frame(
     variable = names(data),
     psych::describe(data)[c("n", stat_cols)],
     row.names = NULL
   )

   # Create the data dictionary, keeping the column order of `data`.
   d_dict <- merge(var_info, desc_stats, by = "variable", sort = FALSE)
   data.table::setDT(d_dict)

   # Round the statistics and blank them out for variables that are not of
   # class numeric or integer. ifelse() keeps a column numeric when every
   # variable is numeric and turns it into character otherwise.
   is_num <- d_dict$variable_type %in% c("numeric", "integer")
   for (col in stat_cols) {
     rounded <- round(d_dict[[col]], 2)
     data.table::set(d_dict, j = col, value = ifelse(is_num, rounded, ""))
   }

   # Merge in descriptions. The default NA (or NULL / an all-NA value) means
   # that none were supplied; a scalar-safe test avoids a length > 1 condition.
   has_descriptions <- length(descriptions) > 0 && !all(is.na(descriptions))
   if (has_descriptions) {
     if (is.null(names(descriptions))) {
       stop("`descriptions` must be named by variable", call. = FALSE)
     }
     desc <- data.table::data.table(
       variable = names(descriptions),
       # Flatten to a plain character column so missing entries are real NAs.
       description = vapply(descriptions, function(d) as.character(d)[1],
                            character(1), USE.NAMES = FALSE)
     )
     d_dict <- merge(d_dict, desc, by = "variable", all.x = TRUE, sort = FALSE)
     data.table::set(d_dict, i = which(is.na(d_dict$description)),
                     j = "description", value = "")

     missing_vars <- d_dict$variable[!d_dict$variable %in% names(descriptions)]
     if (length(missing_vars) > 0) {
       warning("\nMissing variable descriptions for: \n\n",
               paste(missing_vars, collapse = "\n"))
     }
   }
   d_dict
 }

 # Example run on mtcars using the descriptions defined above in this script
 data_dict(data = mtcars, descriptions = DESCRIPTIONS)
	# Generate data dictionary

	# Column descriptions, one named entry per variable to be described
	DESCRIPTIONS <- list(
	  mpg = "Miles Per Gallon"
	)

	# data dict function
	#' Build a data dictionary for a data frame
	#'
	#' Produces one row per column of `data` with its class, number of distinct
	#' non-missing values, number of missing values, an example value, and the
	#' summary statistics reported by `psych::describe()`.
	#'
	#' Side effects: installs `pacman` if it is not available, loads the packages
	#' listed in the `p_load()` call, and sets `options(scipen = 999)`. The option
	#' is NOT restored on exit (kept as in the original so later printing is
	#' unchanged).
	#'
	#' @param data A data frame.
	#' @param descriptions Named list (or named character vector) mapping column
	#'   names to a description. The default `NA` means "no descriptions"; in that
	#'   case no `description` column is added.
	#' @param print_table Currently unused; kept for backward compatibility.
	#' @return A data.table with columns `variable`, `variable_type`,
	#'   `cardinality`, `n_missing`, `example`, `n`, `mean`, `sd`, `median`, `se`,
	#'   `min`, `max`, `range`, plus `description` when `descriptions` is given.
	#'   Rows follow the column order of `data`. A warning lists variables that
	#'   have no description.
	data_dict <- function(data, descriptions = NA, print_table = FALSE) {
	  # Adapted from https://github.com/pw2/Data-Dictionary-Function/blob/master/Data%20Dictionary%20Function.R
	  stopifnot(is.data.frame(data))

	  if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
	  pacman::p_load('tidyverse', 'psych', 'ggpubr', 'gridExtra', 'knitr', 'data.table')

	  # NOTE: global option, intentionally left set after the function returns.
	  options(scipen = 999)

	  # First non-missing value as a (truncated) string. Indexing with [1] yields
	  # NA for an all-missing column, so every column contributes exactly one value.
	  first_example <- function(x) {
	    stringr::str_trunc(as.character(x[!is.na(x)][1]), width = 20)
	  }

	  # Get variable info and NAs. vapply() pins the result type so the columns
	  # never silently become list columns.
	  var_info <- data.table::data.table(
	    variable = names(data),
	    # Only the first class is used so multi-class columns (e.g. ordered
	    # factors) still give a single string.
	    variable_type = vapply(data, function(x) class(x)[1], character(1),
	                           USE.NAMES = FALSE),
	    cardinality = vapply(data, function(x) length(unique(x[!is.na(x)])),
	                         integer(1), USE.NAMES = FALSE),
	    n_missing = vapply(data, function(x) sum(is.na(x)), integer(1),
	                       USE.NAMES = FALSE),
	    example = vapply(data, first_example, character(1), USE.NAMES = FALSE)
	  )

	  # Get descriptive stats; columns are selected by name rather than position.
	  stat_cols <- c("mean", "sd", "median", "se", "min", "max", "range")
	  desc_stats <- data.frame(
	    variable = names(data),
	    psych::describe(data)[c("n", stat_cols)],
	    row.names = NULL
	  )

	  # Create the data dictionary, keeping the column order of `data`.
	  d_dict <- merge(var_info, desc_stats, by = "variable", sort = FALSE)
	  data.table::setDT(d_dict)

	  # Round the statistics and blank them out for variables that are not of
	  # class numeric or integer. ifelse() keeps a column numeric when every
	  # variable is numeric and turns it into character otherwise.
	  is_num <- d_dict$variable_type %in% c("numeric", "integer")
	  for (col in stat_cols) {
	    rounded <- round(d_dict[[col]], 2)
	    data.table::set(d_dict, j = col, value = ifelse(is_num, rounded, ""))
	  }

	  # Merge in descriptions. The default NA (or NULL / an all-NA value) means
	  # that none were supplied; a scalar-safe test avoids a length > 1 condition.
	  has_descriptions <- length(descriptions) > 0 && !all(is.na(descriptions))
	  if (has_descriptions) {
	    if (is.null(names(descriptions))) {
	      stop("`descriptions` must be named by variable", call. = FALSE)
	    }
	    desc <- data.table::data.table(
	      variable = names(descriptions),
	      # Flatten to a plain character column so missing entries are real NAs.
	      description = vapply(descriptions, function(d) as.character(d)[1],
	                           character(1), USE.NAMES = FALSE)
	    )
	    d_dict <- merge(d_dict, desc, by = "variable", all.x = TRUE, sort = FALSE)
	    data.table::set(d_dict, i = which(is.na(d_dict$description)),
	                    j = "description", value = "")

	    missing_vars <- d_dict$variable[!d_dict$variable %in% names(descriptions)]
	    if (length(missing_vars) > 0) {
	      warning("\nMissing variable descriptions for: \n\n",
	              paste(missing_vars, collapse = "\n"))
	    }
	  }
	  d_dict
	}

	# Example run on mtcars using the descriptions defined above in this script
	data_dict(data = mtcars, descriptions = DESCRIPTIONS)