Skip to content

Instantly share code, notes, and snippets.

@nanxstats
Last active November 16, 2023 06:19
Show Gist options
  • Save nanxstats/cf71094cc730147a0fc230b86f642db3 to your computer and use it in GitHub Desktop.
Save nanxstats/cf71094cc730147a0fc230b86f642db3 to your computer and use it in GitHub Desktop.
simtrial backend benchmark sketch
n 1 2 4 8 16
dplyr 5093.77 2671.44 1447.21 810.42 446.06
data.table 1336.79 677.94 364.5 217.75 143.95
n 1 2 4 8 16 32
dplyr 487.89 264.16 145.39 86.51 62.51 88.83
data.table 131.3 67.36 38.89 28.19 31 65.65
# Install the simtrial build pinned to commit 341f77f and define the inputs
# shared by all benchmark runs below.
#
# remove.packages() looks in the first library path when `lib` is not given
# and stops with an error if the package is not there, so the removal is only
# attempted when an installed copy is actually found in that location.
simtrial_installed <- length(
  find.package("simtrial", lib.loc = .libPaths()[1L], quiet = TRUE)
) > 0
if (simtrial_installed) {
  remove.packages("simtrial")
}
remotes::install_github("Merck/simtrial@341f77f", force = TRUE, update = FALSE)
library("simtrial")
library("future")
library("doFuture")
library("tictoc")
# Passed to sim_fix(), which forwards it to sim_fixed_n() as `n_sim`.
k <- 10000
# Read by sim_fix() and forwarded to sim_fixed_n() as `enroll_rate`.
enroll_rate <- data.frame(rate = c(5, 20, 10), duration = c(100, 150, 150))
# Call simtrial::sim_fixed_n() with the benchmark's fixed settings.
#
# k: value forwarded as `n_sim`.
#
# `enroll_rate` is not a parameter; it is looked up in the enclosing
# environment at call time. All other arguments are constants.
# Returns the value of simtrial::sim_fixed_n() unchanged.
sim_fix <- function(k) {
  result <- simtrial::sim_fixed_n(
    enroll_rate = enroll_rate,
    sample_size = 3000,
    target_event = 700,
    timing_type = 2,
    n_sim = k
  )
  result
}
# Time 10 back-to-back calls of sim_fix(k) under each future plan:
# `sequential` for a single worker, `multisession` for 2, 4, 8 and 16 workers.
# toc() prints one elapsed-time line per configuration.
worker_counts <- c(1L, 2L, 4L, 8L, 16L)
for (workers in worker_counts) {
  # Reset the seed so every configuration starts from the same RNG state.
  set.seed(42)
  # Switch the plan before tic() so that call stays outside the timed region.
  if (workers == 1L) {
    plan(sequential)
  } else {
    plan(multisession, workers = workers)
  }
  # Each configuration keeps its own result variable:
  # dplyr_01, dplyr_02, dplyr_04, dplyr_08, dplyr_16.
  result_name <- sprintf("dplyr_%02d", workers)
  # Label the timer so each line printed by toc() names its configuration.
  tic(result_name)
  for (i in seq_len(10)) {
    assign(result_name, sim_fix(k))
  }
  toc()
}
# Do not leave the 16-worker multisession plan active once the runs are done;
# switching back to sequential is how the future docs say to release workers.
plan(sequential)
# Start from a fresh R session, install the simtrial build pinned to commit
# 206ca44 and define the inputs shared by all benchmark runs below.
# NOTE(review): restartSession() needs RStudio, and whether the following
# statements still run after the restart depends on how this script is
# executed -- confirm the intended workflow (likely run interactively).
rstudioapi::restartSession()
# remove.packages() looks in the first library path when `lib` is not given
# and stops with an error if the package is not there, so the removal is only
# attempted when an installed copy is actually found in that location.
simtrial_installed <- length(
  find.package("simtrial", lib.loc = .libPaths()[1L], quiet = TRUE)
) > 0
if (simtrial_installed) {
  remove.packages("simtrial")
}
remotes::install_github("Merck/simtrial@206ca44", force = TRUE, update = FALSE)
library("simtrial")
library("future")
library("doFuture")
library("tictoc")
# Passed to sim_fix(), which forwards it to sim_fixed_n() as `n_sim`.
k <- 10000
# Read by sim_fix() and forwarded to sim_fixed_n() as `enroll_rate`.
enroll_rate <- data.frame(rate = c(5, 20, 10), duration = c(100, 150, 150))
# Call simtrial::sim_fixed_n() with the benchmark's fixed settings.
#
# k: value forwarded as `n_sim`.
#
# `enroll_rate` is not a parameter; it is looked up in the enclosing
# environment at call time. All other arguments are constants.
# Returns the value of simtrial::sim_fixed_n() unchanged.
sim_fix <- function(k) {
  result <- simtrial::sim_fixed_n(
    enroll_rate = enroll_rate,
    sample_size = 3000,
    target_event = 700,
    timing_type = 2,
    n_sim = k
  )
  result
}
# Time 10 back-to-back calls of sim_fix(k) under each future plan:
# `sequential` for a single worker, `multisession` for 2, 4, 8 and 16 workers.
# toc() prints one elapsed-time line per configuration.
worker_counts <- c(1L, 2L, 4L, 8L, 16L)
for (workers in worker_counts) {
  # Reset the seed so every configuration starts from the same RNG state.
  set.seed(42)
  # Switch the plan before tic() so that call stays outside the timed region.
  if (workers == 1L) {
    plan(sequential)
  } else {
    plan(multisession, workers = workers)
  }
  # Each configuration keeps its own result variable:
  # dt_01, dt_02, dt_04, dt_08, dt_16.
  result_name <- sprintf("dt_%02d", workers)
  # Label the timer so each line printed by toc() names its configuration.
  tic(result_name)
  for (i in seq_len(10)) {
    assign(result_name, sim_fix(k))
  }
  toc()
}
# Do not leave the 16-worker multisession plan active once the runs are done;
# switching back to sequential is how the future docs say to release workers.
plan(sequential)
library("ggplot2")
library("cowplot")
library("ggsci")
# Load the timing table and turn it into one row per core count.
# The file is read without a header; the first column is dropped and the rest
# is transposed, so the file's three rows become the columns named below.
# NOTE(review): presumably the rows are core counts, dplyr timings and
# data.table timings, in that order -- confirm against the TSV file.
x <- read.table("simtrial-10k.tsv")
x <- x[, seq(2, ncol(x))]
x <- as.data.frame(t(x))
names(x) <- c("Cores", "dplyr", "data.table")
rownames(x) <- NULL
# Time
# Reshape to long format: the `dplyr` and `data.table` columns are stacked
# into a single `Time` column, with `Backend` recording the source column.
df_time <- reshape(
  data = x,
  varying = list(c("dplyr", "data.table")),
  v.names = "Time",
  timevar = "Backend",
  times = c("dplyr", "data.table"),
  direction = "long"
)
# Build the plot as an object and print() it explicitly: a bare ggplot
# expression at top level is not drawn when the script is run via source().
p_time <- ggplot(df_time, aes(x = Cores, y = Time, color = Backend)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Performance comparison",
    x = "Number of cores",
    y = "Time (s)",
    color = "Backend"
  ) +
  # Take the tick positions from the data so every measured core count is
  # labelled, whatever set of core counts the input file contains.
  scale_x_continuous(breaks = unique(df_time$Cores)) +
  theme_cowplot() +
  background_grid() +
  scale_color_d3()
print(p_time)
# Speedup
# Speedup of each backend relative to its own single-core time.
baseline_dplyr <- x$dplyr[x$Cores == 1]
baseline_data_table <- x$data.table[x$Cores == 1]
# Exactly one single-core row is required; otherwise the divisions below
# would recycle or fail with a less helpful message.
stopifnot(
  length(baseline_dplyr) == 1,
  length(baseline_data_table) == 1
)
x$speedup_dplyr <- baseline_dplyr / x$dplyr
x$speedup_data_table <- baseline_data_table / x$data.table
# Reshape to long format: the two speedup columns are stacked into a single
# `Speedup` column, with `Backend` recording which backend each row is for.
df_speedup <- reshape(
  data = x,
  varying = list(c("speedup_dplyr", "speedup_data_table")),
  v.names = "Speedup",
  timevar = "Backend",
  times = c("dplyr", "data.table"),
  direction = "long"
)
# Build the plot as an object and print() it explicitly: a bare ggplot
# expression at top level is not drawn when the script is run via source().
p_speedup <- ggplot(
  df_speedup,
  aes(x = Cores, y = Speedup, color = Backend)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Speedup vs. number of cores",
    x = "Number of cores",
    y = "Speedup",
    color = "Backend"
  ) +
  # Take the tick positions from the data so every measured core count is
  # labelled, whatever set of core counts the input file contains.
  scale_x_continuous(breaks = unique(df_speedup$Cores)) +
  scale_y_continuous(breaks = c(2, 4, 6, 8, 10)) +
  theme_cowplot() +
  background_grid() +
  scale_color_d3()
print(p_speedup)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment