Last active
May 3, 2017 15:53
-
-
Save tjvananne/ea37f1597452c9fc75d80133bf1d7456 to your computer and use it in GitHub Desktop.
Generate Generic Data Dictionary in R. It will count the number of blanks, the number of NAs, tell you the number of unique values per column, calculate the percentages of the previously mentioned column aggregations, and report out the top n (5 is default) number of unique values per row.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# generic data dictionary creation using base-R | |
#' a couple notes: this could of course be done much faster using | |
#' third party packages, but I like to provide base-R solutions before | |
#' branching out into packages just in case they aren't available | |
#' | |
#' Also, this could be done in a much less verbose and modular way, | |
#' but I did want to also demonstrate the "Functional Programming" | |
#' abilities within R (although no functions were passed to functions) | |
#' and wanted this to be a helpful reference gist | |
# function definitions --------------------------------------------------------------------- | |
# these are to be used on a single data frame column, preferrably in a "sapply()" call | |
func_unq <- function(df_col, n_returned=5) { | |
return(unique(df_col)[1:n_returned]) | |
} | |
func_unq_ln <- function(df_col) { | |
return(length(unique(df_col))) | |
} | |
func_NAs <- function(df_col) { | |
return(sum(is.na(df_col))) | |
} | |
func_blanks <- function(df_col) { | |
return(sum( trimws(df_col) == '', na.rm=T)) | |
} | |
# this is the wrapper function around the lower-level ones above | |
func_build_data_dict <- function(df, n_returned=5) { | |
t_start <- Sys.time() | |
# # for testing: | |
# df <- dataset | |
# n_returned <- 5 | |
# initialize some space for collection | |
dict_all <- list() | |
# clear current environment from all of the "this_" | |
# if this is in a function, will it affect things outside the function? #YOLO | |
rm(list=ls()[grepl("^this_", ls())]) | |
# execute all lower level funcs | |
print(paste0("Gathering first ", n_returned, " unique values per column...")) | |
this_unq <- data.frame(t(sapply(df, func_unq, n_returned=n_returned)), stringsAsFactors = F) | |
names(this_unq) <- paste0('unique_val_', 1:ncol(this_unq)) | |
print("Calculating how many unique values per column...") | |
this_unq_ln <- data.frame(sapply(df, func_unq_ln), stringsAsFactors = F) | |
names(this_unq_ln) <- 'count_of_unique_values' | |
print("Counting the 'NA' values per column...") | |
this_NAs <- data.frame(sapply(df, func_NAs), stringsAsFactors = F) | |
names(this_NAs) <- 'count_of_NA_values' | |
print("Counting the blank values per column...") | |
this_blanks <- data.frame(sapply(df, func_blanks), stringsAsFactors = F) | |
names(this_blanks) <- 'count_of_blanks' | |
# isolate the "this_" objects in environment, loop through and add to collection | |
print("Combinding results into data dictionary...") | |
all_of_this <- ls()[grepl("^this_", ls())] | |
for(i in seq_along(all_of_this)) { | |
dict_all[[i]] <- eval(parse(text=all_of_this[[i]])) | |
} | |
# compile into final data.frame and make sure column names are a column as well | |
final <- do.call(cbind, dict_all) | |
final <- cbind(column_name=row.names(final), final) | |
final$percent_blank <- round(final$count_of_blanks / nrow(df) * 100, digits = 2) | |
final$percent_NAs <- round(final$count_of_NA_values / nrow(df) * 100, digits = 2) | |
final$percent_unique <- round(final$count_of_unique_values / nrow(df) * 100, digits = 2) | |
print(paste0("Finished after: ", round(as.numeric(Sys.time() - t_start, units="mins"), digits = 2), " minutes")) | |
return(final) | |
} | |
# using the functions -------------------------------------------------------------------------------- | |
# built-in dataset, no need for packages | |
dataset <- airquality | |
for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""} # add some blanks | |
# now lets test it out | |
my_data_dict <- func_build_data_dict(dataset) | |
# testing on a larger dataset -------------------------------------------------------------------------- | |
# set this to TRUE and run this code chunk if you want to test on a larger dataset (1 Million rows) | |
TEST_LARGE_DATASET <- FALSE | |
if(TEST_LARGE_DATASET) { | |
dataset <- airquality | |
for(i in 1:50) {dataset[sample(1:nrow(dataset), 1), sample(1:ncol(dataset), 1)] <- ""} # add some blanks | |
# build a larger dataset | |
list_datasets <- list() | |
for(i in 1:10000) {list_datasets[[i]] <- dataset} | |
dataset_1M_rows <- do.call(rbind, list_datasets) # this takes a while to build | |
# alright, let's build the data dictionary for a much larger dataset | |
big_data_dictionary <- func_build_data_dict(dataset_1M_rows) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment