Skip to content

Instantly share code, notes, and snippets.

@RCura
Last active February 3, 2018 06:24
Show Gist options
  • Save RCura/2d32ac3d253ea368bffadca3c9ffde0b to your computer and use it in GitHub Desktop.
Save RCura/2d32ac3d253ea368bffadca3c9ffde0b to your computer and use it in GitHub Desktop.
Comparing the performances of various DBMS using dplyr for large-ish data
library(tidyverse)
library(dbplyr)
library(DBI)
# 1 - Creating the big df example
bigSample <- tibble(uid = 1:50E6,
category1 = sample(x = LETTERS, size = 50E6, replace = TRUE),
category2 = sample(x = sample(x = colors(), size = 10, replace = FALSE), size = 50E6, replace = TRUE),
integer1 = round(runif(n = 50E6, min = 1, max = 1000))
)
# dplyr -------------------------------------------------------------------
testDF <- bigSample
# group by on 2 variables
benchmarks <- tibble(db = character(0),
test = character(0),
perftype = character(0),
perf = double(0))
bench <- system.time({
testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1)) %>%
collect()
})
thisBench <- tibble(db = "dplyr",
test = "group_by",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
# left_join
bench <- system.time({
refTable <- testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1))
testDF %>%
left_join(y = refTable, by = c("category1", "category2")) %>%
mutate(ratio = integer1 / groupmean) %>%
collect()
})
thisBench <- tibble(db = "dplyr",
test = "left_join",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
## SQLite - Memory DB ----------------------------------------------------------------------
library(RSQLite)
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:")
bench <- system.time({
copy_to(con, bigSample, "testsample",
temporary = FALSE)
})
thisBench <- tibble(db = "sqlite_memory",
test = "copy_to",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
testDF <- tbl(con, "testsample")
# group by on 2 variables
bench <- system.time({
testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1)) %>%
collect()
})
thisBench <- tibble(db = "sqlite_memory",
test = "group_by",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
# left_join
bench <- system.time({
refTable <- testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1))
testDF %>%
left_join(y = refTable, by = c("category1", "category2")) %>%
mutate(ratio = integer1 / groupmean) %>%
collect()
})
thisBench <- tibble(db = "sqlite_memory",
test = "left_join",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
dbDisconnect(con)
### SQLite - File DB ----------------------------------------------------------------------
con <- DBI::dbConnect(RSQLite::SQLite(), path = "testSQLite.db")
bench <- system.time({
copy_to(con, bigSample, "testsample",
temporary = FALSE)
})
thisBench <- tibble(db = "sqlite_file",
test = "copy_to",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
testDF <- tbl(con, "testsample")
# group by on 2 variables
bench <- system.time({
testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1)) %>%
collect()
})
thisBench <- tibble(db = "sqlite_file",
test = "group_by",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
# left_join
bench <- system.time({
refTable <- testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1))
testDF %>%
left_join(y = refTable, by = c("category1", "category2")) %>%
mutate(ratio = integer1 / groupmean) %>%
collect()
})
thisBench <- tibble(db = "sqlite_file",
test = "left_join",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
dbDisconnect(con)
### PostgreSQL ----------------------------------------------------------------------
library(RPostgreSQL)
con <- DBI::dbConnect(RPostgreSQL::PostgreSQL(), dbname = "robin")
bench <- system.time({
DBI::dbWriteTable(con, name = "testsample", value = bigSample, temporary = FALSE, overwrite = TRUE)
})
thisBench <- tibble(db = "postgresql",
test = "copy_to",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
testDF <- tbl(con, "testsample")
# group by on 2 variables
bench <- system.time({
testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1)) %>%
collect()
})
thisBench <- tibble(db = "postgresql",
test = "group_by",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
# left_join
bench <- system.time({
refTable <- testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1))
testDF %>%
left_join(y = refTable, by = c("category1", "category2")) %>%
mutate(ratio = integer1 / groupmean) %>%
collect()
})
thisBench <- tibble(db = "postgresql",
test = "left_join",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
db_drop_table(con, table = "testsample")
dbDisconnect(con)
### MonetDB ----------------------------------------------------------------------
library(MonetDBLite)
con <- dbConnect(MonetDBLite::MonetDBLite())
bench <- system.time({
copy_to(con, bigSample, "testsample",
temporary = FALSE)
})
thisBench <- tibble(db = "monetdb",
test = "copy_to",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
testDF <- tbl(con, "testsample")
# group by on 2 variables
bench <- system.time({
testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1)) %>%
collect()
})
thisBench <- tibble(db = "monetdb",
test = "group_by",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
# left_join
bench <- system.time({
refTable <- testDF %>%
group_by(category1, category2) %>%
summarise(groupmean = mean(integer1))
testDF %>%
left_join(y = refTable, by = c("category1", "category2")) %>%
mutate(ratio = integer1 / groupmean) %>%
collect()
})
thisBench <- tibble(db = "monetdb",
test = "left_join",
perftype = names(bench),
perf = bench)
benchmarks <- benchmarks %>%
bind_rows(thisBench)
DBI::dbDisconnect(con)
MonetDBLite::monetdblite_shutdown()
benchName <- tibble(db = c("dplyr", "sqlite_memory", "sqlite_file", "postgresql", "monetdb"),
DBMS = c("Dplyr (in memory)", "SQLite (in memory)", "SQLite (file)", "PostgreSQL", "MonetDBLite"))
library(hrbrthemes)
benchmarks %>%
filter(perftype == "elapsed") %>%
select(-perftype) %>%
spread(key = test, value = perf) %>%
gather(key = test, value = time, -db) %>%
left_join(benchName, by = "db") %>%
ggplot(aes(DBMS, time, fill = DBMS)) +
geom_col() +
facet_wrap(~test, ncol = 1) +
labs(title = "Benchmarking \"out-of-memory\"\ndata analysis DBMS",
subtitle = "Benchmarks run on a 50 million rows dataset using a common dplyr query",
caption = "R. Cura, 2017 @RobinCura") +
ylab("Time [seconds] (Smaller is better)") +
xlab("DataBase Management System") +
theme_ipsum() +
theme(legend.position="none")
ggsave(last_plot(), filename = "DMBS_benchmarks.png", width = 20, height = 20, units = "cm")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment