Skip to content

Instantly share code, notes, and snippets.

@infotroph
Created September 4, 2017 03:09
Show Gist options
  • Select an option

  • Save infotroph/fd76379d8a0e2aa3d7a8ed4ac76dbd2c to your computer and use it in GitHub Desktop.

Select an option

Save infotroph/fd76379d8a0e2aa3d7a8ed4ac76dbd2c to your computer and use it in GitHub Desktop.
# Overthinking a speed comparison. The task at hand is:
# "if this column contains values greater than 1, assume they're percentages and divide them by 100"
library(microbenchmark)
library(data.table)
library(dplyr)
library(ggplot2)
# We'll generate 20 columns for realistic size, but only column 10 used in this test
newdata <- function(nrow, max_1 = TRUE){
x <- replicate(
n = 20,
expr = {
row <- rnorm(nrow)
if(max_1){ row <- row/max(row) }
row
})
as.data.frame(cbind(x))
}
base <- function(dat){
if(max(dat$V10) > 1){ dat$V10 = dat$V10 / 100 }
}
dt <- function(dat){
if(dat[, max(V10) > 1]){ dat[, `:=`(V10 = V10 / 100)] }
}
dp <- function(dat){
if((dat %>% select(V10) %>% max) > 1){ dat = dat %>% mutate(V10 = V10 / 100) }
}
base_dt <- function(dat){
if(max(dat$V10) > 1){ dat[, `:=`(V10 = V10 / 100)] }
}
base_dp <- function(dat){
if(max(dat$V10) > 1){ dat = dat %>% mutate(V10 = V10 / 100) }
}
dt_base <- function(dat){
if(dat[, max(V10) > 1]){ dat$V10 = dat$V10 / 100 }
}
dp_base <- function(dat){
if((dat %>% select(V10) %>% max) > 1){ dat$V10 = dat$V10 / 100 }
}
compare_fns <- function(nrow, nrep){
replicate(
n = nrep,
expr = {
max_1 <- sample(c(TRUE, FALSE), size = 1)
x <- newdata(nrow, max_1);
# Pre-convert so each function gets the format it's expecting
xt <- as.data.table(x);
xd <- as_tibble(x);
mb <- microbenchmark(
base = base(x),
dt = dt(xt),
dp = dp(xd),
base_dt = base_dt(xt),
base_dp = base_dp(xd),
dt_base = dt_base(xt),
dp_base = dp_base(xd),
times = 100);
mb$nrow <- nrow;
mb$max_1 <- max_1;
mb$id <- digest::sha1(mb);
as.data.frame(mb)
},
simplify=FALSE)
}
res_1k_df <- do.call("rbind", compare_fns(nrow = 1e3, nrep = 20))
res_10k_df <- do.call("rbind", compare_fns(nrow = 1e4, nrep = 20))
res_100k_df <- do.call("rbind", compare_fns(nrow = 1e5, nrep = 20))
# drop very slowest outliers from each group before plotting --
# otherwise tails are absurdly long
# I think these may be a microbenchmark artifact
res <- (rbind(res_1k_df, res_10k_df, res_100k_df)
%>% group_by(expr, id, nrow)
%>% filter(time < quantile(time, 0.95))
%>% mutate(time = time/1e6) # ns to ms
)
all <- (ggplot(res,
aes(x = expr, y = time, group = paste(expr, id), color=max_1))
+ geom_violin()
+ facet_wrap(~nrow)
+ ylab("time (ms)")
+ theme_bw()
+ theme(legend.position = c(0.05, 0.8), aspect.ratio = 0.6667))
ggsave(filename = "dt_vs_base_all.pdf", plot = all, width = 300, height = 150, units = "mm")
# ==> The functions with base calls as the conditional test seem to be consistently fastest.
# Let's zoom in and see if there's any difference between reassignment methods.
justbase <- (ggplot(res %>% filter(grepl("^base", expr)),
aes(x = expr, y = time, group = paste(expr, id), color=max_1))
+ geom_violin()
+ facet_wrap(~nrow)
+ ylab("time (ms)")
+ theme_bw()
+ theme(legend.position = c(0.05, 0.8), aspect.ratio = 0.667))
ggsave(filename = "dt_vs_base_base.pdf", plot = justbase, width = 300, height = 150, units = "mm")
#... Answer: No, not reliably.
@infotroph
Copy link
Author

dt_vs_base_all

@infotroph
Copy link
Author

dt_vs_base_base

@infotroph
Copy link
Author

Overall conclusion:

  • For the if test, use the base R construction if(max(dat$V10) > 1). It's the fastest AND most obvious.
  • For sufficiently large datasets, the data.table approach dat[, :=(V10 = V10 / 100)] is faster than the base R dat$V10 = dat$V10 / 100, but the difference is small.
  • The dplyr call shown here is far slower than the others. Maybe it could be optimized -- x %>% mutate(...) to mutate(x, ...)? -- but I haven't tried.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment