Skip to content

Instantly share code, notes, and snippets.

@iangow
Last active January 13, 2024 14:59
Show Gist options
  • Save iangow/5b2ff852ea19d7b73f92fa2d682dfb4e to your computer and use it in GitHub Desktop.
Save iangow/5b2ff852ea19d7b73f92fa2d682dfb4e to your computer and use it in GitHub Desktop.
Benchmarking approaches to summing prices for fruit baskets.

See StackOverflow for the original question.

library(data.table)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(DBI)

# Make the data
fruits_tbl <- tibble(fruit_name = c('orange', 'apple',
                                    'pear', 'kiwi', 'blueberry'), 
                     price = c(1, 1.531, 2.1, 2.25, 3.03))

basket_size <- 8L
num_baskets <- 8e6L

baskets_long_tbl <-
  tibble(id = sort(rep(1:num_baskets, basket_size)),
         item_no = rep(1:basket_size, num_baskets),
         fruit = sample(fruits_tbl$fruit_name, 
                        basket_size * num_baskets, 
                        replace = TRUE))

baskets_tbl <- 
  pivot_wider(baskets_long_tbl, 
              id_cols = "id", 
              values_from = "fruit",
              names_from = "item_no",
              names_glue = "fruit_{item_no}") |>
  select(-id)

# DuckDB
db <- dbConnect(duckdb::duckdb())

system.time({
  baskets_db <- 
    copy_to(db, baskets_tbl) |>
    mutate(basket_id = row_number())
  fruits_db <- copy_to(db, fruits_tbl)
})
#>    user  system elapsed 
#>   4.685   0.126   4.907

system.time({
  res_db <-
    baskets_db |>
    pivot_longer(cols = -basket_id, 
                 values_to = "fruit_name") |>
    inner_join(fruits_db, by = "fruit_name") |>
    group_by(basket_id) |>
    summarize(price_total = sum(price, na.rm = TRUE)) |>
    inner_join(baskets_db, by = "basket_id") |>
    arrange(basket_id) |>
    collect()
})
#>    user  system elapsed 
#>  12.954   2.730   4.649

dbDisconnect(db, shutdown = TRUE)

head(res_db)
#> # A tibble: 6 × 10
#>   basket_id price_total fruit_1  fruit_2 fruit_3 fruit_4 fruit_5 fruit_6 fruit_7
#>       <dbl>       <dbl> <chr>    <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
#> 1         1        17.4 orange   kiwi    kiwi    kiwi    kiwi    bluebe… pear   
#> 2         2        13.3 apple    orange  kiwi    apple   kiwi    apple   kiwi   
#> 3         3        16.9 pear     apple   bluebe… kiwi    apple   pear    kiwi   
#> 4         4        15.1 apple    orange  bluebe… orange  kiwi    kiwi    bluebe…
#> 5         5        15.1 pear     pear    bluebe… apple   kiwi    orange  orange 
#> 6         6        16.5 blueber… kiwi    kiwi    apple   orange  pear    pear   
#> # ℹ 1 more variable: fruit_8 <chr>

# data.table
baskets_dt <- as.data.table(baskets_tbl)
fruits_dt <- as.data.table(fruits_tbl)

## data.table per Onyambu
baskets_dt[, melt(.SD, measure.vars = patterns('fruit'),
                    value.name = 'fruit_name')
][fruits_dt, on = 'fruit_name'
][, price := sum(price), by = rowid(variable)
][, dcast(.SD, rowid(variable)+price~variable, value.var = 'fruit_name')] |>
  system.time()
#>    user  system elapsed 
#>  18.421   1.735  21.533

## data.table per B. Christian Kamgang
price_lookup <- function(x) {
  fruits_dt$price[chmatch(x, fruits_dt$fruit_name)]
}

baskets_dt[, price_total := rowSums(sapply(.SD, price_lookup)), 
           .SDcols=patterns("fruit")] |>
  system.time()
#>    user  system elapsed 
#>   0.527   0.096   0.631

## data.table using dtplyr
system.time({
  baskets_dt <- 
    as.data.table(baskets_tbl) |>
    mutate(basket_id = row_number())
})
#>    user  system elapsed 
#>   0.032   0.021   0.054

system.time({
  res_dtplyr <-
    baskets_dt |>
    pivot_longer(cols = -basket_id, 
                 values_to = "fruit_name") |>
    inner_join(fruits_dt, by = "fruit_name") |>
    group_by(basket_id) |>
    summarize(price_total = sum(price, na.rm = TRUE),
              .groups = "drop") |>
    inner_join(baskets_dt, by = "basket_id") |>
    collect()
})
#>    user  system elapsed 
#>  15.728   1.406  17.480

Created on 2024-01-13 with reprex v2.0.2

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment