Last active
February 3, 2018 06:24
-
-
Save RCura/2d32ac3d253ea368bffadca3c9ffde0b to your computer and use it in GitHub Desktop.
Comparing the performances of various DBMS using dplyr for large-ish data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(tidyverse) | |
| library(dbplyr) | |
| library(DBI) | |
| # 1 - Creating the big df example | |
| bigSample <- tibble(uid = 1:50E6, | |
| category1 = sample(x = LETTERS, size = 50E6, replace = TRUE), | |
| category2 = sample(x = sample(x = colors(), size = 10, replace = FALSE), size = 50E6, replace = TRUE), | |
| integer1 = round(runif(n = 50E6, min = 1, max = 1000)) | |
| ) | |
| # dplyr ------------------------------------------------------------------- | |
| testDF <- bigSample | |
| # group by on 2 variables | |
| benchmarks <- tibble(db = character(0), | |
| test = character(0), | |
| perftype = character(0), | |
| perf = double(0)) | |
| bench <- system.time({ | |
| testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "dplyr", | |
| test = "group_by", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| # left_join | |
| bench <- system.time({ | |
| refTable <- testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) | |
| testDF %>% | |
| left_join(y = refTable, by = c("category1", "category2")) %>% | |
| mutate(ratio = integer1 / groupmean) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "dplyr", | |
| test = "left_join", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| ## SQLite - Memory DB ---------------------------------------------------------------------- | |
| library(RSQLite) | |
| con <- DBI::dbConnect(RSQLite::SQLite(), path = ":memory:") | |
| bench <- system.time({ | |
| copy_to(con, bigSample, "testsample", | |
| temporary = FALSE) | |
| }) | |
| thisBench <- tibble(db = "sqlite_memory", | |
| test = "copy_to", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| testDF <- tbl(con, "testsample") | |
| # group by on 2 variables | |
| bench <- system.time({ | |
| testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "sqlite_memory", | |
| test = "group_by", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| # left_join | |
| bench <- system.time({ | |
| refTable <- testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) | |
| testDF %>% | |
| left_join(y = refTable, by = c("category1", "category2")) %>% | |
| mutate(ratio = integer1 / groupmean) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "sqlite_memory", | |
| test = "left_join", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| dbDisconnect(con) | |
| ### SQLite - File DB ---------------------------------------------------------------------- | |
| con <- DBI::dbConnect(RSQLite::SQLite(), path = "testSQLite.db") | |
| bench <- system.time({ | |
| copy_to(con, bigSample, "testsample", | |
| temporary = FALSE) | |
| }) | |
| thisBench <- tibble(db = "sqlite_file", | |
| test = "copy_to", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| testDF <- tbl(con, "testsample") | |
| # group by on 2 variables | |
| bench <- system.time({ | |
| testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "sqlite_file", | |
| test = "group_by", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| # left_join | |
| bench <- system.time({ | |
| refTable <- testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) | |
| testDF %>% | |
| left_join(y = refTable, by = c("category1", "category2")) %>% | |
| mutate(ratio = integer1 / groupmean) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "sqlite_file", | |
| test = "left_join", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| dbDisconnect(con) | |
| ### PostgreSQL ---------------------------------------------------------------------- | |
| library(RPostgreSQL) | |
| con <- DBI::dbConnect(RPostgreSQL::PostgreSQL(), dbname = "robin") | |
| bench <- system.time({ | |
| DBI::dbWriteTable(con, name = "testsample", value = bigSample, temporary = FALSE, overwrite = TRUE) | |
| }) | |
| thisBench <- tibble(db = "postgresql", | |
| test = "copy_to", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| testDF <- tbl(con, "testsample") | |
| # group by on 2 variables | |
| bench <- system.time({ | |
| testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "postgresql", | |
| test = "group_by", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| # left_join | |
| bench <- system.time({ | |
| refTable <- testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) | |
| testDF %>% | |
| left_join(y = refTable, by = c("category1", "category2")) %>% | |
| mutate(ratio = integer1 / groupmean) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "postgresql", | |
| test = "left_join", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| db_drop_table(con, table = "testsample") | |
| dbDisconnect(con) | |
| ### MonetDB ---------------------------------------------------------------------- | |
| library(MonetDBLite) | |
| con <- dbConnect(MonetDBLite::MonetDBLite()) | |
| bench <- system.time({ | |
| copy_to(con, bigSample, "testsample", | |
| temporary = FALSE) | |
| }) | |
| thisBench <- tibble(db = "monetdb", | |
| test = "copy_to", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| testDF <- tbl(con, "testsample") | |
| # group by on 2 variables | |
| bench <- system.time({ | |
| testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "monetdb", | |
| test = "group_by", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| # left_join | |
| bench <- system.time({ | |
| refTable <- testDF %>% | |
| group_by(category1, category2) %>% | |
| summarise(groupmean = mean(integer1)) | |
| testDF %>% | |
| left_join(y = refTable, by = c("category1", "category2")) %>% | |
| mutate(ratio = integer1 / groupmean) %>% | |
| collect() | |
| }) | |
| thisBench <- tibble(db = "monetdb", | |
| test = "left_join", | |
| perftype = names(bench), | |
| perf = bench) | |
| benchmarks <- benchmarks %>% | |
| bind_rows(thisBench) | |
| DBI::dbDisconnect(con) | |
| MonetDBLite::monetdblite_shutdown() | |
| benchName <- tibble(db = c("dplyr", "sqlite_memory", "sqlite_file", "postgresql", "monetdb"), | |
| DBMS = c("Dplyr (in memory)", "SQLite (in memory)", "SQLite (file)", "PostgreSQL", "MonetDBLite")) | |
| library(hrbrthemes) | |
| benchmarks %>% | |
| filter(perftype == "elapsed") %>% | |
| select(-perftype) %>% | |
| spread(key = test, value = perf) %>% | |
| gather(key = test, value = time, -db) %>% | |
| left_join(benchName, by = "db") %>% | |
| ggplot(aes(DBMS, time, fill = DBMS)) + | |
| geom_col() + | |
| facet_wrap(~test, ncol = 1) + | |
| labs(title = "Benchmarking \"out-of-memory\"\ndata analysis DBMS", | |
| subtitle = "Benchmarks run on a 50 million rows dataset using a common dplyr query", | |
| caption = "R. Cura, 2017 @RobinCura") + | |
| ylab("Time [seconds] (Smaller is better)") + | |
| xlab("DataBase Management System") + | |
| theme_ipsum() + | |
| theme(legend.position="none") | |
| ggsave(last_plot(), filename = "DMBS_benchmarks.png", width = 20, height = 20, units = "cm") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
