Skip to content

Instantly share code, notes, and snippets.

@alekrutkowski
Created January 10, 2022 08:34
Show Gist options
  • Save alekrutkowski/4af221d08e580ad08fbdd3bae283df5c to your computer and use it in GitHub Desktop.
Save alekrutkowski/4af221d08e580ad08fbdd3bae283df5c to your computer and use it in GitHub Desktop.
R data.table: Summarize each column with each function and name the new columns automatically
library(magrittr)
library(data.table)
#' Summarize each column with each function, naming the results automatically
#'
#' Builds and evaluates the data.table query
#' `dt[, .(<fun><sep><col> = <fun>(<col>), ..., <additional_code>), by = c(<by>)]`
#' for every combination of `fun_names` and `col_names`. Columns vary fastest,
#' so the result is ordered fun1-col1, fun1-col2, ..., fun2-col1, ...
#'
#' The query is assembled as a call object rather than as pasted text, so
#' column and grouping names containing spaces, quotes or backslashes need no
#' manual escaping.
#'
#' @param dt A data.table.
#' @param col_names Character vector of columns to summarize. An element that
#'   is not the name of a column of `dt` is parsed as an R expression
#'   (e.g. `"x + y"`).
#' @param fun_names Character vector of names of summary functions. They are
#'   looked up from the caller's environment, so helpers defined locally in
#'   the calling function are found.
#' @param additional_code Optional character vector of extra `name = expr`
#'   entries appended to the summary list, e.g. `"Count = length(x)"`.
#'   NOTE: this is parsed and evaluated as R code; never pass untrusted input.
#' @param by Optional character vector of grouping columns. `NULL` or a
#'   zero-length vector produces an ungrouped summary.
#' @param sep String placed between the function name and the column name in
#'   each generated result column name.
#' @return Whatever the generated `dt[...]` query returns: one row per group,
#'   with one column per function/column pair plus any `additional_code`
#'   columns.
summaries <- function(dt, col_names, fun_names, additional_code = NULL,
                      by = NULL, sep = "__") {
  # Capture the caller's frame first, before any nested call is made.
  caller_env <- parent.frame()

  col_names <- as.character(col_names)
  fun_names <- as.character(fun_names)
  by <- as.character(by)
  additional_code <- as.character(additional_code)
  # Blank entries would otherwise yield empty arguments in the generated call.
  additional_code <- additional_code[nzchar(trimws(additional_code))]

  grid <- expand.grid(
    var = col_names, fun = fun_names,
    KEEP.OUT.ATTRS = FALSE, stringsAsFactors = FALSE
  )

  # Existing columns become symbols (safe for non-syntactic names); anything
  # else keeps the historical behaviour of being parsed as an expression.
  dt_cols <- names(dt)
  as_column_expr <- function(col) {
    if (col %in% dt_cols) {
      as.name(col)
    } else {
      parse(text = col, keep.source = FALSE)[[1L]]
    }
  }

  summary_exprs <- lapply(seq_len(nrow(grid)), function(k) {
    as.call(list(as.name(grid$fun[[k]]), as_column_expr(grid$var[[k]])))
  })
  if (length(summary_exprs) > 0L) {
    names(summary_exprs) <- paste0(grid$fun, sep, grid$var)
  }

  # Parse the extra entries as the arguments of a dummy `list(...)` call so
  # that their `name = expr` tags are preserved; drop the leading `list`.
  # The newline before ")" protects against a trailing `#` comment.
  extra_exprs <- as.list(
    parse(
      text = paste0("list(", paste(additional_code, collapse = ","), "\n)"),
      keep.source = FALSE
    )[[1L]]
  )[-1L]

  j_args <- c(summary_exprs, extra_exprs)
  if (length(j_args) == 0L) {
    stop(
      "nothing to summarize: supply `col_names` and `fun_names`, ",
      "or `additional_code`",
      call. = FALSE
    )
  }
  j_call <- as.call(c(list(as.name(".")), j_args))

  query <- if (length(by) > 0L) {
    by_call <- as.call(c(list(as.name("c")), as.list(by)))
    substitute(dt[, J, by = B], list(J = j_call, B = by_call))
  } else {
    substitute(dt[, J], list(J = j_call))
  }

  # Evaluate in a child of the caller's environment: `dt` is bound here, and
  # every other symbol (the summary functions) resolves as it would for the
  # caller.
  eval_env <- new.env(parent = caller_env)
  assign("dt", dt, envir = eval_env)
  eval(query, eval_env)
}
# # Examples ---------------------------------------------------------
#
# ## Helpers
#
# Sum <- function(x)
# if (all(is.na(x))) # if all xs are NA then sum=NA instead of sum=0
# NA_real_ else sum(x,na.rm=TRUE)
#
# Mean <- function(x)
# mean(x, na.rm=TRUE)
#
# Median <- function(x)
# median(x, na.rm=TRUE)
#
# ## Fake data
# my_dt <-
# data.table(A=c('a','a','a', 'b','b','b'),
# B=c(1,2,3, 1,2,3),
# x=1:6,
# y=101:106)
# my_dt
# # A B x y
# # 1: a 1 1 101
# # 2: a 2 2 102
# # 3: a 3 3 103
# # 4: b 1 4 104
# # 5: b 2 5 105
# # 6: b 3 6 106
#
# ## Usage example:
# summaries(dt=my_dt,
# col_names=c('x','y'),
# fun_names=c('Sum','Mean','Median'),
# additional_code='Count=length(x)',
# by=c('A','B'))
# ## Generated code:
# # dt[,.(`Sum__x`=`Sum`(x),
# # `Sum__y`=`Sum`(y),
# # `Mean__x`=`Mean`(x),
# # `Mean__y`=`Mean`(y),
# # `Median__x`=`Median`(x),
# # `Median__y`=`Median`(y),
# # Count=length(x))
# # ,by=c("A","B")]
# ## Result:
# # A B Sum__x Sum__y Mean__x Mean__y Median__x Median__y Count
# # 1: a 1 1 101 1 101 1 101 1
# # 2: a 2 2 102 2 102 2 102 1
# # 3: a 3 3 103 3 103 3 103 1
# # 4: b 1 4 104 4 104 4 104 1
# # 5: b 2 5 105 5 105 5 105 1
# # 6: b 3 6 106 6 106 6 106 1
#
# ## Another usage example:
# summaries(dt=my_dt,
# col_names=c('x','y'),
# fun_names=c('Sum','Mean','Median'),
# additional_code='Count=length(x)',
# by=c('A'))
# ## Result:
# # A Sum__x Sum__y Mean__x Mean__y Median__x Median__y Count
# # 1: a 6 306 2 102 2 102 3
# # 2: b 15 315 5 105 5 105 3
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment