Last active
February 3, 2018 06:24
-
-
Save RCura/2d32ac3d253ea368bffadca3c9ffde0b to your computer and use it in GitHub Desktop.
Comparing the performances of various DBMS using dplyr for large-ish data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(dbplyr) | |
library(DBI) | |
# 1 - Creating the big df example | |
bigSample <- tibble(uid = 1:50E6, | |
category1 = sample(x = LETTERS, size = 50E6, replace = TRUE), | |
category2 = sample(x = sample(x = colors(), size = 10, replace = FALSE), size = 50E6, replace = TRUE), | |
integer1 = round(runif(n = 50E6, min = 1, max = 1000)) | |
) | |
# dplyr ------------------------------------------------------------------- | |
testDF <- bigSample | |
# group by on 2 variables | |
benchmarks <- tibble(db = character(0), | |
test = character(0), | |
perftype = character(0), | |
perf = double(0)) | |
bench <- system.time({ | |
testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "dplyr", | |
test = "group_by", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
# left_join | |
bench <- system.time({ | |
refTable <- testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) | |
testDF %>% | |
left_join(y = refTable, by = c("category1", "category2")) %>% | |
mutate(ratio = integer1 / groupmean) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "dplyr", | |
test = "left_join", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
## SQLite - Memory DB ---------------------------------------------------------------------- | |
library(RSQLite) | |
con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:") | |
bench <- system.time({ | |
copy_to(con, bigSample, "testsample", | |
temporary = FALSE) | |
}) | |
thisBench <- tibble(db = "sqlite_memory", | |
test = "copy_to", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
testDF <- tbl(con, "testsample") | |
# group by on 2 variables | |
bench <- system.time({ | |
testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "sqlite_memory", | |
test = "group_by", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
# left_join | |
bench <- system.time({ | |
refTable <- testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) | |
testDF %>% | |
left_join(y = refTable, by = c("category1", "category2")) %>% | |
mutate(ratio = integer1 / groupmean) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "sqlite_memory", | |
test = "left_join", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
dbDisconnect(con) | |
### SQLite - File DB ---------------------------------------------------------------------- | |
con <- DBI::dbConnect(RSQLite::SQLite(), path = "testSQLite.db") | |
bench <- system.time({ | |
copy_to(con, bigSample, "testsample", | |
temporary = FALSE) | |
}) | |
thisBench <- tibble(db = "sqlite_file", | |
test = "copy_to", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
testDF <- tbl(con, "testsample") | |
# group by on 2 variables | |
bench <- system.time({ | |
testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "sqlite_file", | |
test = "group_by", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
# left_join | |
bench <- system.time({ | |
refTable <- testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) | |
testDF %>% | |
left_join(y = refTable, by = c("category1", "category2")) %>% | |
mutate(ratio = integer1 / groupmean) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "sqlite_file", | |
test = "left_join", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
dbDisconnect(con) | |
### PostgreSQL ---------------------------------------------------------------------- | |
library(RPostgreSQL) | |
con <- DBI::dbConnect(RPostgreSQL::PostgreSQL(), dbname = "robin") | |
bench <- system.time({ | |
DBI::dbWriteTable(con, name = "testsample", value = bigSample, temporary = FALSE, overwrite = TRUE) | |
}) | |
thisBench <- tibble(db = "postgresql", | |
test = "copy_to", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
testDF <- tbl(con, "testsample") | |
# group by on 2 variables | |
bench <- system.time({ | |
testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "postgresql", | |
test = "group_by", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
# left_join | |
bench <- system.time({ | |
refTable <- testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) | |
testDF %>% | |
left_join(y = refTable, by = c("category1", "category2")) %>% | |
mutate(ratio = integer1 / groupmean) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "postgresql", | |
test = "left_join", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
db_drop_table(con, table = "testsample") | |
dbDisconnect(con) | |
### MonetDB ---------------------------------------------------------------------- | |
library(MonetDBLite) | |
con <- dbConnect(MonetDBLite::MonetDBLite()) | |
bench <- system.time({ | |
copy_to(con, bigSample, "testsample", | |
temporary = FALSE) | |
}) | |
thisBench <- tibble(db = "monetdb", | |
test = "copy_to", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
testDF <- tbl(con, "testsample") | |
# group by on 2 variables | |
bench <- system.time({ | |
testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "monetdb", | |
test = "group_by", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
# left_join | |
bench <- system.time({ | |
refTable <- testDF %>% | |
group_by(category1, category2) %>% | |
summarise(groupmean = mean(integer1)) | |
testDF %>% | |
left_join(y = refTable, by = c("category1", "category2")) %>% | |
mutate(ratio = integer1 / groupmean) %>% | |
collect() | |
}) | |
thisBench <- tibble(db = "monetdb", | |
test = "left_join", | |
perftype = names(bench), | |
perf = bench) | |
benchmarks <- benchmarks %>% | |
bind_rows(thisBench) | |
DBI::dbDisconnect(con) | |
MonetDBLite::monetdblite_shutdown() | |
benchName <- tibble(db = c("dplyr", "sqlite_memory", "sqlite_file", "postgresql", "monetdb"), | |
DBMS = c("Dplyr (in memory)", "SQLite (in memory)", "SQLite (file)", "PostgreSQL", "MonetDBLite")) | |
library(hrbrthemes) | |
benchmarks %>% | |
filter(perftype == "elapsed") %>% | |
select(-perftype) %>% | |
spread(key = test, value = perf) %>% | |
gather(key = test, value = time, -db) %>% | |
left_join(benchName, by = "db") %>% | |
ggplot(aes(DBMS, time, fill = DBMS)) + | |
geom_col() + | |
facet_wrap(~test, ncol = 1) + | |
labs(title = "Benchmarking \"out-of-memory\"\ndata analysis DBMS", | |
subtitle = "Benchmarks run on a 50 million rows dataset using a common dplyr query", | |
caption = "R. Cura, 2017 @RobinCura") + | |
ylab("Time [seconds] (Smaller is better)") + | |
xlab("DataBase Management System") + | |
theme_ipsum() + | |
theme(legend.position="none") | |
ggsave(last_plot(), filename = "DMBS_benchmarks.png", width = 20, height = 20, units = "cm") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment