Created
October 12, 2019 17:34
-
-
Save TysonStanley/f64ed76e4859199e22904f9a0ee849c9 to your computer and use it in GitHub Desktop.
Using the `profmem` package to understand the memory allocations behind grouped data summaries in both `data.table` and `dplyr`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages ----
library(bench)      # assess speed and memory
library(data.table) # data.table for all of its stuff
library(dplyr)      # compare it to data.table
library(profmem)    # assess the process of R functions

# Fix the RNG seed so the simulated data below is reproducible.
set.seed(84322)
# Example Data ----
# One million rows: `grp` is a two-level factor sampled with replacement from
# c(1, 2), `x` is drawn with rnorm() and `y` with runif().
d <- data.table(
  grp = sample(c(1, 2), size = 1e6, replace = TRUE) %>% factor(),
  x = rnorm(1e6),
  y = runif(1e6)
)
d
#> grp x y
#> 1: 1 0.2558379 0.2034364
#> 2: 2 -0.8886153 0.4684875
#> 3: 2 0.4724519 0.6850357
#> 4: 1 0.7360537 0.4890217
#> 5: 1 0.6855063 0.6964860
#> ---
#> 999996: 1 2.1008965 0.3624327
#> 999997: 2 2.2423628 0.2595716
#> 999998: 2 1.5314115 0.1102460
#> 999999: 2 -1.6086973 0.2679477
#> 1000000: 2 1.2518419 0.6566943
# data size and group vector size
# These two figures are the baseline for reading the allocation sizes reported
# by the profiles further down.
lobstr::obj_size(d)
#> 20,001,752 B
lobstr::obj_size(d$grp)
#> 4,000,560 B
# Copy all data for summaries
# Each container type gets its own copy of `d`: a base data.frame, a tibble
# (used by the dplyr runs) and a data.table (used by the data.table runs).
df <- copy(d) %>% as.data.frame()
tbl <- copy(d) %>% as_tibble()
dt <- copy(d)
# profile dplyr
# Record every memory allocation made while computing the grouped mean with
# dplyr, then keep only the size of each allocation and its call stack.
profmem::profmem(summarize(group_by(tbl, grp), mean(x))) %>%
  data.frame() %>%
  select(bytes, calls)
#> bytes calls
#> 1 1997104 summarize() -> group_by() -> group_by.data.frame() -> grouped_df() -> grouped_df_impl()
#> 2 2003000 summarize() -> group_by() -> group_by.data.frame() -> grouped_df() -> grouped_df_impl()
# benchmark dplyr
# Time the same grouped mean over 25 iterations and report only the median
# run time and the memory allocated.
bench::mark(
  summarize(group_by(tbl, grp), mean(x)),
  iterations = 25
) %>%
  select(median, mem_alloc)
#> # A tibble: 1 x 2
#> median mem_alloc
#> <bch:tm> <bch:byt>
#> 1 26.7ms 3.81MB
# profile data.table
# Record every memory allocation made while computing the grouped mean with
# data.table. The largest entries in the output below (rows 25-28) are
# attributed to gforce().
profmem::profmem(dt[, mean(x), by = grp]) %>%
  data.frame() %>%
  select(bytes, calls)
#> bytes calls
#> 1 280 [() -> [.data.table() -> new.env()
#> 2 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 3 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 4 4000048 [() -> [.data.table() -> forderv()
#> 5 2003000 [() -> [.data.table()
#> 6 8248 [() -> [.data.table()
#> 7 4005944 [() -> [.data.table()
#> 8 8248 [() -> [.data.table()
#> 9 280 [() -> [.data.table() -> new.env()
#> 10 248 [() -> [.data.table()
#> 11 248 [() -> [.data.table()
#> 12 248 [() -> [.data.table()
#> 13 248 [() -> [.data.table()
#> 14 248 [() -> [.data.table()
#> 15 248 [() -> [.data.table()
#> 16 248 [() -> [.data.table()
#> 17 248 [() -> [.data.table()
#> 18 248 [() -> [.data.table()
#> 19 248 [() -> [.data.table()
#> 20 248 [() -> [.data.table()
#> 21 248 [() -> [.data.table()
#> 22 248 [() -> [.data.table()
#> 23 248 [() -> [.data.table()
#> 24 248 [() -> [.data.table()
#> 25 4000056 [() -> [.data.table() -> gforce()
#> 26 2000056 [() -> [.data.table() -> gforce()
#> 27 2000056 [() -> [.data.table() -> gforce()
#> 28 16000056 [() -> [.data.table() -> gforce()
#> 29 8256 [() -> [.data.table() -> setalloccol()
#> 30 8256 [() -> [.data.table() -> setalloccol()
# benchmark with gforce
# Time the data.table grouped mean over 25 iterations and report only the
# median run time and the memory allocated.
bench::mark(
  dt[, mean(x), by = grp],
  iterations = 25
) %>%
  select(median, mem_alloc)
#> # A tibble: 1 x 2
#> median mem_alloc
#> <bch:tm> <bch:byt>
#> 1 16.8ms 32.5MB
# profile data.table without gforce
# Same grouped mean, but calling the function as `base::mean`. The profile
# below contains no gforce() frames.
# NOTE(review): whether a namespace-qualified call still bypasses GForce may
# depend on the installed data.table version -- confirm with `verbose = TRUE`.
profmem::profmem(dt[, base::mean(x), by = grp]) %>%
  data.frame() %>%
  select(bytes, calls)
#> bytes calls
#> 1 280 [() -> [.data.table() -> new.env()
#> 2 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 3 8240 [() -> [.data.table() -> null.data.table() -> setalloccol()
#> 4 4000048 [() -> [.data.table() -> forderv()
#> 5 2003000 [() -> [.data.table()
#> 6 8248 [() -> [.data.table()
#> 7 4005944 [() -> [.data.table()
#> 8 8248 [() -> [.data.table()
#> 9 2003000 [() -> [.data.table()
#> 10 8256 [() -> [.data.table() -> setalloccol()
#> 11 8256 [() -> [.data.table() -> setalloccol()
# benchmark without gforce
# Time the `base::mean` variant over 25 iterations and report only the median
# run time and the memory allocated.
bench::mark(
  dt[, base::mean(x), by = grp],
  iterations = 25
) %>%
  select(median, mem_alloc)
#> # A tibble: 1 x 2
#> median mem_alloc
#> <bch:tm> <bch:byt>
#> 1 10.5ms 11.5MB
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment