Last active
March 16, 2021 18:00
-
-
Save jthomasmock/94f4444dd3fd28340d40074e35fc87bc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(tictoc) | |
library(arrow) | |
tic() | |
con <- DBI::dbConnect(duckdb::duckdb(), "data/pbp_db.duckdb") | |
nfl_pbp <- tbl(con, "nflfastR_pbp") | |
toc() | |
# 0.123 sec elapsed | |
tic() | |
con_lite <- DBI::dbConnect(RSQLite::SQLite(), "data/pbp_db") | |
nfl_pbp_lite <- tbl(con_lite, "nflfastR_pbp") | |
toc() | |
# 0.141 sec elapsed | |
tic() | |
nfl_parq <- open_dataset("nflfastr-parquet") | |
toc() | |
# 0.077 sec elapsed | |
tic() | |
nfl_parq_shard <- open_dataset("nflfastr-shard", partitioning = schema(season = int32(), week = string())) | |
toc() | |
# 0.038 sec elapsed | |
### Here's a benchmark | |
sqlite_expr <- function(){ | |
con_lite <- DBI::dbConnect(RSQLite::SQLite(), "data/pbp_db") | |
nfl_pbp_lite <- tbl(con_lite, "nflfastR_pbp") | |
out <- nfl_pbp_lite %>% | |
select(epa, posteam, season, week) %>% | |
filter( | |
season %in% c(2017, 2018, 2019), | |
week %in% c(8:16), | |
!is.na(posteam) | |
) %>% | |
group_by(posteam) %>% | |
summarise(mean_epa = mean(epa, na.rm = TRUE)) %>% | |
arrange(desc(mean_epa)) %>% | |
ungroup() %>% | |
collect() | |
duckdb::dbDisconnect(con_lite) | |
out | |
} | |
duck_expr <- function(){ | |
con <- DBI::dbConnect(duckdb::duckdb(), "data/pbp_db.duckdb") | |
nfl_pbp <- tbl(con, "nflfastR_pbp") | |
out <- nfl_pbp %>% | |
select(epa, posteam, season, week) %>% | |
filter( | |
season %in% c(2017, 2018, 2019), | |
week %in% c(8:16), | |
!is.na(posteam) | |
) %>% | |
group_by(posteam) %>% | |
summarise(mean_epa = mean(epa,na.rm = TRUE)) %>% | |
ungroup() %>% | |
arrange(desc(mean_epa)) %>% | |
collect() | |
duckdb::dbDisconnect(con) | |
out | |
} | |
parquet_expr <- function(){ | |
nfl_parq %>% | |
select(epa, posteam, season, week) %>% | |
filter( | |
season %in% c(2017, 2018, 2019), | |
week %in% c(8:16), | |
!is.na(posteam) | |
) %>% | |
collect() %>% | |
group_by(posteam) %>% | |
summarise(mean_epa = mean(epa,na.rm = TRUE)) %>% | |
ungroup() %>% | |
arrange(desc(mean_epa)) | |
} | |
parquet_part_expr <- function(){ | |
nfl_parq_shard %>% | |
select(epa, posteam, season, week) %>% | |
filter( | |
season %in% c(2017, 2018, 2019), | |
week %in% c(8:16), | |
!is.na(posteam) | |
) %>% | |
collect() %>% | |
group_by(posteam) %>% | |
summarise(mean_epa = mean(epa,na.rm = TRUE)) %>% | |
ungroup() %>% | |
arrange(desc(mean_epa)) | |
} | |
bench::mark( | |
min_time = 0.1, | |
iterations = 10, | |
sqlite_expr(), | |
duck_expr(), | |
parquet_expr(), | |
parquet_part_expr() | |
) | |
# # A tibble: 4 x 13 | |
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time | |
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> | |
# 1 sqlite_expr() 1.15s 1.27s 0.704 3.97MB 0.176 8 2 11.36s | |
# 2 duck_expr() 192.69ms 211.45ms 3.04 434.47KB 1.30 7 3 2.31s | |
# 3 parquet_expr() 208.91ms 451.19ms 2.12 9.11MB 0.529 8 2 3.78s | |
# 4 parquet_part_expr() 228.37ms 254.39ms 3.96 5.11MB 0.440 9 1 2.27s |
Author
jthomasmock
commented
Mar 16, 2021
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment