@arthurgailes
Created April 3, 2024 18:17
duckplyr summarizing wide data
# Load necessary libraries (hrbrthemes and ggthemes are used for plot themes below)
pacman::p_load(
  stringi, data.table, duckplyr, readr, dplyr, collapse, duckdb, dbplyr, bench,
  ggplot2, hrbrthemes, ggthemes
)
# Function to generate a dataframe chunk of random word and numeric columns
generate_data_chunk <- function(num_rows = 1000, num_cols = 100) {
  # Generate numeric columns
  numeric_cols <- replicate(n = num_cols / 2, expr = runif(num_rows, 1, 10000), simplify = FALSE)
  # Generate word columns
  word_cols <- replicate(n = num_cols / 2, expr = stri_rand_strings(num_rows, 10), simplify = FALSE)
  # Combine word and numeric columns into one data.frame
  data <- data.frame(matrix(ncol = num_cols, nrow = num_rows))
  colnames(data) <- c(
    paste0("WordColumn", 1:(num_cols / 2)), paste0("NumericColumn", 1:(num_cols / 2))
  )
  data[, paste0("WordColumn", 1:(num_cols / 2))] <- word_cols
  data[, paste0("NumericColumn", 1:(num_cols / 2))] <- numeric_cols
  return(data)
}
# Target file size in bytes (1GB)
target_file_size <- 1 * 1024^3
# File to accumulate data; make sure the output directory exists
dir.create("data", showWarnings = FALSE)
temp_file_path <- "data/large_dataset.csv"
# Write an initial chunk to establish the file with the correct headers
initial_chunk <- generate_data_chunk()
fwrite(initial_chunk, file = temp_file_path)
# Keep appending chunks until the file size exceeds the target size
while (file.info(temp_file_path)$size < target_file_size) {
  chunk <- generate_data_chunk()
  # Append data without repeating the header row
  fwrite(chunk, file = temp_file_path, append = TRUE, col.names = FALSE)
}
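# Optionally report the final on-disk size; the loop overshoots the 1GB target
# by at most one 1,000-row chunk
cat(sprintf("Generated %.2f GB at %s\n", file.info(temp_file_path)$size / 1024^3, temp_file_path))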
# Benchmark the time it takes to read the full CSV with each package
read_times <- bench::mark(
  readr = {
    readr::read_csv(temp_file_path, show_col_types = FALSE)
  },
  readr_lazy = {
    readr::read_csv(temp_file_path, lazy = TRUE, show_col_types = FALSE)
  },
  data.table = {
    data.table::fread(temp_file_path)
  },
  duckplyr = {
    duckplyr::duckplyr_df_from_csv(temp_file_path)
  },
  check = FALSE # results differ by class (tibble, data.table, duckplyr_df)
)
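# Optionally inspect the raw benchmark table (median time, memory, GC counts)
# before plotting
print(read_times)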
p <- ggplot(read_times) +
  geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
  hrbrthemes::theme_ipsum() +
  scale_y_continuous(breaks = c(1, 5, 10, 30)) +
  theme(
    axis.text = element_text(size = 14),
    plot.background = element_rect(fill = "#e6e6e6")
  ) +
  labs(
    title = "Time to read 1GB CSV",
    x = "", y = "Total time (s)"
  )
dir.create("chart", showWarnings = FALSE)
ggsave("chart/duckplyr_read_times.png", p, width = 4, height = 4, dpi = 300)
# Read the data once, then convert it to the format each package expects
data_duckplyr <- duckplyr_df_from_csv(temp_file_path)
data_dt <- as.data.table(data_duckplyr, key = "WordColumn1")
data_tbl <- qTBL(data_dt) # collapse's fast conversion to tibble
# Register the tibble with an in-memory DuckDB connection for the SQL/dbplyr runs
con <- dbConnect(duckdb())
duckdb_register(con, "data", data_tbl, overwrite = TRUE)
data_dbplyr <- tbl(con, "data")
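# Optional sanity check: the conversions should preserve dimensions
stopifnot(identical(dim(data_dt), dim(data_tbl)))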
# Benchmark group_by + summarize across the 50 numeric columns.
# Build the 50-column AVG query programmatically rather than spelling it out
avg_query <- paste0(
  "SELECT WordColumn1, ",
  paste0("AVG(NumericColumn", 1:50, ")", collapse = ", "),
  " FROM data GROUP BY WordColumn1"
)
sum_times <- bench::mark(
  dplyr = {
    data_tbl %>%
      dplyr::group_by(WordColumn1) %>%
      dplyr::summarize(across(starts_with("NumericColumn"), mean))
  },
  data.table = {
    data_dt[, lapply(.SD, mean), by = WordColumn1, .SDcols = paste0("NumericColumn", 1:50)]
  },
  collapse = {
    data_dt %>%
      collapse::fgroup_by(WordColumn1) %>%
      collapse::fsummarise(across(paste0("NumericColumn", 1:50), fmean))
  },
  duckplyr = {
    data_duckplyr %>%
      group_by(WordColumn1) %>%
      summarize(across(starts_with("NumericColumn"), mean))
  },
  duckdb = {
    dbGetQuery(con, avg_query)
  },
  dbplyr = {
    data_dbplyr %>%
      group_by(WordColumn1) %>%
      summarize(across(starts_with("NumericColumn"), mean))
  },
  check = FALSE # outputs differ in class and column naming
)
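# Optionally print the summarize benchmark table as well
print(sum_times)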
p <- ggplot(sum_times) +
  geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
  ggthemes::theme_clean() +
  scale_y_continuous(trans = "pseudo_log", breaks = c(0, 1, 5, 25)) +
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(size = 12),
    plot.subtitle = element_text(size = 10),
    plot.background = element_rect(fill = "#e6e6e6")
  ) +
  labs(
    title = "Time to group_by and summarize 1GB CSV",
    subtitle = "Grouping by 1 word column and summarizing 50 numeric columns",
    caption = "@arthurgailes",
    x = "", y = "Total time (s)"
  )
ggsave("chart/duckplyr_sum_times.png", p, width = 5, height = 3, dpi = 300)