Skip to content

Instantly share code, notes, and snippets.

@iangow
Created January 10, 2024 14:51
Show Gist options
  • Save iangow/55ff3c63690a41eb5288f9b0b38a39a6 to your computer and use it in GitHub Desktop.
Save iangow/55ff3c63690a41eb5288f9b0b38a39a6 to your computer and use it in GitHub Desktop.
Benchmark of `rpolars`.
# Install tidypolars from its r-universe repository only when it is not
# already available, so that re-running the script does not trigger a fresh
# download and installation every time.
if (!requireNamespace("tidypolars", quietly = TRUE)) {
  install.packages(
    "tidypolars",
    repos = c("https://etiennebacher.r-universe.dev", getOption("repos"))
  )
}
library(tidypolars)
library(polars)
library(collapse, warn.conflicts = FALSE)
library(dplyr, warn.conflicts = FALSE)
library(DBI)
# library(bench)
# library(data.table)
# library(duckdb)
# DuckDB connection (default driver settings); used by the `DuckDB/dbplyr`
# benchmark expression.
db <- dbConnect(duckdb::duckdb())

# Build the benchmark data set by row-binding 100,000 copies of `iris`.
large_iris <- rep(list(iris), 100000) |>
  data.table::rbindlist()

# Convert the data to a polars LazyFrame and write it to a Parquet file in the
# working directory; every benchmark expression reads this file.
large_iris_pl <- as_polars_lf(large_iris)
large_iris_pl$sink_parquet("large_iris.parquet")

# Number of rows in the benchmark data, formatted with thousands separators.
format(nrow(large_iris), big.mark = ",")
# Benchmark the same pipeline across five back ends. Every expression:
#   1. reads "large_iris.parquet",
#   2. keeps the four Sepal.* / Petal.* columns,
#   3. adds `petal_type` ("long" when Petal.Length / Petal.Width > 3,
#      otherwise "large"),
#   4. keeps the rows whose Sepal.Length lies between 4.5 and 5.5.
res <- bench::mark(
  # polars expression API, materialised with collect().
  polars = {
    pl$scan_parquet("large_iris.parquet")$
      select(c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width"))$
      with_columns(
        pl$when(
          (pl$col("Petal.Length") / pl$col("Petal.Width") > 3)
        )$then(pl$lit("long"))$
          otherwise(pl$lit("large"))$
          alias("petal_type")
      )$
      filter(pl$col("Sepal.Length")$is_between(4.5, 5.5)) |>
      collect()
  },

  # dplyr-style verbs applied to the polars scan through tidypolars.
  tidypolars = {
    pl$scan_parquet("large_iris.parquet") |>
      select(starts_with(c("Sep", "Pet"))) |>
      mutate(
        petal_type = ifelse((Petal.Length / Petal.Width) > 3, "long", "large")
      ) |>
      filter(between(Sepal.Length, 4.5, 5.5)) |>
      collect()
  },

  # dplyr verbs on the data read by arrow::read_parquet().
  dplyr = {
    arrow::read_parquet("large_iris.parquet") |>
      select(starts_with(c("Sep", "Pet"))) |>
      mutate(
        petal_type = ifelse((Petal.Length / Petal.Width) > 3, "long", "large")
      ) |>
      filter(between(Sepal.Length, 4.5, 5.5))
  },

  # dplyr verbs on a DuckDB table expression over the Parquet file (via `db`),
  # materialised with collect().
  `DuckDB/dbplyr` = {
    tbl(
      db,
      "read_parquet('large_iris.parquet')",
      check_from = FALSE
    ) |>
      select(starts_with(c("Sep", "Pet"))) |>
      mutate(
        petal_type = if_else((Petal.Length / Petal.Width) > 3, "long", "large")
      ) |>
      filter(between(Sepal.Length, 4.5, 5.5)) |>
      collect()
  },

  # collapse verbs (with data.table::fifelse) on the data read by arrow.
  collapse = {
    arrow::read_parquet("large_iris.parquet") |>
      fselect(c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")) |>
      fmutate(
        petal_type = data.table::fifelse((Petal.Length / Petal.Width) > 3, "long", "large")
      ) |>
      fsubset(Sepal.Length >= 4.5 & Sepal.Length <= 5.5)
  },

  # The expressions' results are not compared with each other.
  check = FALSE,
  iterations = 40
)
# NOTE: do NOT take the "mem_alloc" results into account.
# `bench::mark()` doesn't report the accurate memory usage for packages calling
# Rust code.
# Show only the expression label, the median time and the total time.
# print() is explicit so the table is also shown when the script is run via
# source(), where top-level values are not auto-printed.
print(
  res |>
    select(expression, median, total_time)
)

# The benchmark is finished: release the DuckDB connection opened at the top
# of the script instead of leaving it for the garbage collector.
dbDisconnect(db, shutdown = TRUE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment