See StackOverflow for the original question.
library(data.table)
library(dplyr, warn.conflicts = FALSE)
library(tidyr)
library(DBI)
# Make the data
fruits_tbl <- tibble(fruit_name = c('orange', 'apple',
'pear', 'kiwi', 'blueberry'),
price = c(1, 1.531, 2.1, 2.25, 3.03))
basket_size <- 8L
num_baskets <- 8e6L
baskets_long_tbl <-
tibble(id = sort(rep(1:num_baskets, basket_size)),
item_no = rep(1:basket_size, num_baskets),
fruit = sample(fruits_tbl$fruit_name,
basket_size * num_baskets,
replace = TRUE))
baskets_tbl <-
pivot_wider(baskets_long_tbl,
id_cols = "id",
values_from = "fruit",
names_from = "item_no",
names_glue = "fruit_{item_no}") |>
select(-id)
# DuckDB
db <- dbConnect(duckdb::duckdb())
system.time({
baskets_db <-
copy_to(db, baskets_tbl) |>
mutate(basket_id = row_number())
fruits_db <- copy_to(db, fruits_tbl)
})
#> user system elapsed
#> 4.685 0.126 4.907
system.time({
res_db <-
baskets_db |>
pivot_longer(cols = -basket_id,
values_to = "fruit_name") |>
inner_join(fruits_db, by = "fruit_name") |>
group_by(basket_id) |>
summarize(price_total = sum(price, na.rm = TRUE)) |>
inner_join(baskets_db, by = "basket_id") |>
arrange(basket_id) |>
collect()
})
#> user system elapsed
#> 12.954 2.730 4.649
dbDisconnect(db, shutdown = TRUE)
head(res_db)
#> # A tibble: 6 × 10
#> basket_id price_total fruit_1 fruit_2 fruit_3 fruit_4 fruit_5 fruit_6 fruit_7
#> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 17.4 orange kiwi kiwi kiwi kiwi bluebe… pear
#> 2 2 13.3 apple orange kiwi apple kiwi apple kiwi
#> 3 3 16.9 pear apple bluebe… kiwi apple pear kiwi
#> 4 4 15.1 apple orange bluebe… orange kiwi kiwi bluebe…
#> 5 5 15.1 pear pear bluebe… apple kiwi orange orange
#> 6 6 16.5 blueber… kiwi kiwi apple orange pear pear
#> # ℹ 1 more variable: fruit_8 <chr>
# data.table
baskets_dt <- as.data.table(baskets_tbl)
fruits_dt <- as.data.table(fruits_tbl)
## data.table per Onyambu
baskets_dt[, melt(.SD, measure.vars = patterns('fruit'),
value.name = 'fruit_name')
][fruits_dt, on = 'fruit_name'
][, price := sum(price), by = rowid(variable)
][, dcast(.SD, rowid(variable)+price~variable, value.var = 'fruit_name')] |>
system.time()
#> user system elapsed
#> 18.421 1.735 21.533
## data.table per B. Christian Kamgang
price_lookup <- function(x) {
fruits_dt$price[chmatch(x, fruits_dt$fruit_name)]
}
baskets_dt[, price_total := rowSums(sapply(.SD, price_lookup)),
.SDcols=patterns("fruit")] |>
system.time()
#> user system elapsed
#> 0.527 0.096 0.631
## data.table using dtplyr
system.time({
baskets_dt <-
as.data.table(baskets_tbl) |>
mutate(basket_id = row_number())
})
#> user system elapsed
#> 0.032 0.021 0.054
system.time({
res_dtplyr <-
baskets_dt |>
pivot_longer(cols = -basket_id,
values_to = "fruit_name") |>
inner_join(fruits_dt, by = "fruit_name") |>
group_by(basket_id) |>
summarize(price_total = sum(price, na.rm = TRUE),
.groups = "drop") |>
inner_join(baskets_dt, by = "basket_id") |>
collect()
})
#> user system elapsed
#> 15.728 1.406 17.480Created on 2024-01-13 with reprex v2.0.2