arthurgailes · April 3, 2024 18:17
diff --git a/sum_wide.R b/sum_wide.R
 # Load necessary library
 pacman::p_load(
  stringi, data.table, duckplyr, readr, dplyr, collapse, duckdb, dbplyr, bench,
  ggplot2
 )

 # Function to generate a dataframe chunk
 generate_data_chunk <- function(num_rows = 1000, num_cols = 100) {
  # Generate numeric columns
  numeric_cols <- replicate(n = num_cols/2, expr = runif(num_rows, 1, 10000), simplify = FALSE)
  # Generate word columns
  word_cols <- replicate(n = num_cols/2, expr = stri_rand_strings(num_rows, 10), simplify = FALSE)
  # Combine word and numeric columns
  data <- data.frame(matrix(ncol = num_cols, nrow = num_rows))
  colnames(data) <- c(paste0("WordColumn", 1:(num_cols/2)), paste0("NumericColumn", 1:(num_cols/2)))
  data[,paste0("WordColumn", 1:(num_cols/2))] <- word_cols
  data[,paste0("NumericColumn", 1:(num_cols/2))] <- numeric_cols

  return(data)
 }

 # Target file size in bytes (1GB)
 target_file_size <- 1 * 1024^3

 # Temporary file to accumulate data
 temp_file_path <- "data/large_dataset.csv"

 # Write an initial chunk to establish the file with the correct headers
 initial_chunk <- generate_data_chunk()
 fwrite(initial_chunk, file = temp_file_path)

 # Keep appending data until the file size exceeds the target size
 while(file.info(temp_file_path)$size < target_file_size) {
  chunk <- generate_data_chunk()
  # Append data without header and row names
  fwrite(chunk, file = temp_file_path, append = TRUE, col.names = FALSE)
 }

 # time it takes to read, group_by, summarize
 read_times <- bench::mark(
  readr = {
    readr::read_csv(temp_file_path, show_col_types = FALSE)
  },
  readr_lazy = {
    readr::read_csv(temp_file_path, lazy = TRUE, show_col_types = FALSE)
  },
  data.table = {
    data.table::fread(temp_file_path)
  },
  duckplyr = {
    duckplyr::duckplyr_df_from_csv(temp_file_path)
  },
  check = FALSE
 )

 p <- ggplot(read_times) +
  geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
  hrbrthemes::theme_ipsum() +
  scale_y_continuous(breaks = c(1, 5, 10, 30)) +
  theme(
    axis.text = element_text(size = 14),
    plot.background = element_rect(fill = "#e6e6e6")
  ) +
  labs(
    title = "Time to read 1GB CSV",
    x = "", y = "Total time (s)"
  )

 ggsave("chart/duckplyr_read_times.png", p, width = 4, height = 4, dpi = 300)


 data_duckplyr <- duckplyr_df_from_csv(temp_file_path)
 data_dt <- as.data.table(data_duckplyr, key = "WordColumn1")
 data_tbl <- qTBL(data_dt)

 con <- dbConnect(duckdb())
 duckdb_register(con, "data", data_tbl, overwrite = TRUE)
 data_dbplyr <- tbl(con, "data")

 sum_times <- bench::mark(
  dplyr = {
    data_tbl %>%
      dplyr::group_by(WordColumn1) %>%
      dplyr::summarize(across(starts_with("NumericColumn"), mean))
  },
  data.table = {
    data_dt[, lapply(.SD, mean), by = WordColumn1, .SDcols = paste0("NumericColumn", 1:50)]
  },
  collapse = {
    data_dt %>%
      collapse::fgroup_by(WordColumn1) %>%
      collapse::fsummarise(across(paste0("NumericColumn", 1:50), fmean))
  },
  duckplyr = {
    data_duckplyr %>%
      group_by(WordColumn1) %>%
      summarize(across(starts_with("NumericColumn"), mean))
  },
  duckdb = {
    dbGetQuery(con, (
      "SELECT WordColumn1, AVG(NumericColumn1), AVG(NumericColumn2), AVG(NumericColumn3), AVG(NumericColumn4), AVG(NumericColumn5), AVG(NumericColumn6), AVG(NumericColumn7), AVG(NumericColumn8), AVG(NumericColumn9), AVG(NumericColumn10), AVG(NumericColumn11), AVG(NumericColumn12), AVG(NumericColumn13), AVG(NumericColumn14), AVG(NumericColumn15), AVG(NumericColumn16), AVG(NumericColumn17), AVG(NumericColumn18), AVG(NumericColumn19), AVG(NumericColumn20), AVG(NumericColumn21), AVG(NumericColumn22), AVG(NumericColumn23), AVG(NumericColumn24), AVG(NumericColumn25), AVG(NumericColumn26), AVG(NumericColumn27), AVG(NumericColumn28), AVG(NumericColumn29), AVG(NumericColumn30), AVG(NumericColumn31), AVG(NumericColumn32), AVG(NumericColumn33), AVG(NumericColumn34), AVG(NumericColumn35), AVG(NumericColumn36), AVG(NumericColumn37), AVG(NumericColumn38), AVG(NumericColumn39), AVG(NumericColumn40), AVG(NumericColumn41), AVG(NumericColumn42), AVG(NumericColumn43), AVG(NumericColumn44), AVG(NumericColumn45), AVG(NumericColumn46), AVG(NumericColumn47), AVG(NumericColumn48), AVG(NumericColumn49), AVG(NumericColumn50)
      FROM data GROUP BY WordColumn1"
    ))
  },
  dbplyr = {
    data_dbplyr %>%
      group_by(WordColumn1) %>%
      summarize(across(starts_with("NumericColumn"), mean))
  },
  check = FALSE
 )


 p <- ggplot(sum_times) +
  geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
  ggthemes::theme_clean() +
  scale_y_continuous(trans = "pseudo_log", breaks = c(0, 1, 5, 25)) +
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(size = 12),
    plot.subtitle = element_text(size = 10),
    plot.background = element_rect(fill = "#e6e6e6")
  ) +
  labs(
    title = "Time to group_by and summarize 1GB CSV",
    subtitle = "Grouping by 1 word column and summarizing 50 numeric columns",
    caption = "@arthurgailes",
    x = "", y = "Total time (s)"
  )

 ggsave("chart/duckplyr_sum_times.png", p, width = 5, height = 3, dpi = 300)
	# Load necessary library
	pacman::p_load(
	stringi, data.table, duckplyr, readr, dplyr, collapse, duckdb, dbplyr, bench,
	ggplot2
	)

	# Function to generate a dataframe chunk
	generate_data_chunk <- function(num_rows = 1000, num_cols = 100) {
	# Generate numeric columns
	numeric_cols <- replicate(n = num_cols/2, expr = runif(num_rows, 1, 10000), simplify = FALSE)
	# Generate word columns
	word_cols <- replicate(n = num_cols/2, expr = stri_rand_strings(num_rows, 10), simplify = FALSE)
	# Combine word and numeric columns
	data <- data.frame(matrix(ncol = num_cols, nrow = num_rows))
	colnames(data) <- c(paste0("WordColumn", 1:(num_cols/2)), paste0("NumericColumn", 1:(num_cols/2)))
	data[,paste0("WordColumn", 1:(num_cols/2))] <- word_cols
	data[,paste0("NumericColumn", 1:(num_cols/2))] <- numeric_cols

	return(data)
	}

	# Target file size in bytes (1GB)
	target_file_size <- 1 * 1024^3

	# Temporary file to accumulate data
	temp_file_path <- "data/large_dataset.csv"

	# Write an initial chunk to establish the file with the correct headers
	initial_chunk <- generate_data_chunk()
	fwrite(initial_chunk, file = temp_file_path)

	# Keep appending data until the file size exceeds the target size
	while(file.info(temp_file_path)$size < target_file_size) {
	chunk <- generate_data_chunk()
	# Append data without header and row names
	fwrite(chunk, file = temp_file_path, append = TRUE, col.names = FALSE)
	}

	# time it takes to read, group_by, summarize
	read_times <- bench::mark(
	readr = {
	readr::read_csv(temp_file_path, show_col_types = FALSE)
	},
	readr_lazy = {
	readr::read_csv(temp_file_path, lazy = TRUE, show_col_types = FALSE)
	},
	data.table = {
	data.table::fread(temp_file_path)
	},
	duckplyr = {
	duckplyr::duckplyr_df_from_csv(temp_file_path)
	},
	check = FALSE
	)

	p <- ggplot(read_times) +
	geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
	hrbrthemes::theme_ipsum() +
	scale_y_continuous(breaks = c(1, 5, 10, 30)) +
	theme(
	axis.text = element_text(size = 14),
	plot.background = element_rect(fill = "#e6e6e6")
	) +
	labs(
	title = "Time to read 1GB CSV",
	x = "", y = "Total time (s)"
	)

	ggsave("chart/duckplyr_read_times.png", p, width = 4, height = 4, dpi = 300)


	data_duckplyr <- duckplyr_df_from_csv(temp_file_path)
	data_dt <- as.data.table(data_duckplyr, key = "WordColumn1")
	data_tbl <- qTBL(data_dt)

	con <- dbConnect(duckdb())
	duckdb_register(con, "data", data_tbl, overwrite = TRUE)
	data_dbplyr <- tbl(con, "data")

	sum_times <- bench::mark(
	dplyr = {
	data_tbl %>%
	dplyr::group_by(WordColumn1) %>%
	dplyr::summarize(across(starts_with("NumericColumn"), mean))
	},
	data.table = {
	data_dt[, lapply(.SD, mean), by = WordColumn1, .SDcols = paste0("NumericColumn", 1:50)]
	},
	collapse = {
	data_dt %>%
	collapse::fgroup_by(WordColumn1) %>%
	collapse::fsummarise(across(paste0("NumericColumn", 1:50), fmean))
	},
	duckplyr = {
	data_duckplyr %>%
	group_by(WordColumn1) %>%
	summarize(across(starts_with("NumericColumn"), mean))
	},
	duckdb = {
	dbGetQuery(con, (
	"SELECT WordColumn1, AVG(NumericColumn1), AVG(NumericColumn2), AVG(NumericColumn3), AVG(NumericColumn4), AVG(NumericColumn5), AVG(NumericColumn6), AVG(NumericColumn7), AVG(NumericColumn8), AVG(NumericColumn9), AVG(NumericColumn10), AVG(NumericColumn11), AVG(NumericColumn12), AVG(NumericColumn13), AVG(NumericColumn14), AVG(NumericColumn15), AVG(NumericColumn16), AVG(NumericColumn17), AVG(NumericColumn18), AVG(NumericColumn19), AVG(NumericColumn20), AVG(NumericColumn21), AVG(NumericColumn22), AVG(NumericColumn23), AVG(NumericColumn24), AVG(NumericColumn25), AVG(NumericColumn26), AVG(NumericColumn27), AVG(NumericColumn28), AVG(NumericColumn29), AVG(NumericColumn30), AVG(NumericColumn31), AVG(NumericColumn32), AVG(NumericColumn33), AVG(NumericColumn34), AVG(NumericColumn35), AVG(NumericColumn36), AVG(NumericColumn37), AVG(NumericColumn38), AVG(NumericColumn39), AVG(NumericColumn40), AVG(NumericColumn41), AVG(NumericColumn42), AVG(NumericColumn43), AVG(NumericColumn44), AVG(NumericColumn45), AVG(NumericColumn46), AVG(NumericColumn47), AVG(NumericColumn48), AVG(NumericColumn49), AVG(NumericColumn50)
	FROM data GROUP BY WordColumn1"
	))
	},
	dbplyr = {
	data_dbplyr %>%
	group_by(WordColumn1) %>%
	summarize(across(starts_with("NumericColumn"), mean))
	},
	check = FALSE
	)


	p <- ggplot(sum_times) +
	geom_col(aes(x = expression, y = median), fill = "#ff1f51", alpha = 0.5) +
	ggthemes::theme_clean() +
	scale_y_continuous(trans = "pseudo_log", breaks = c(0, 1, 5, 25)) +
	theme(
	axis.text = element_text(size = 12),
	plot.title = element_text(size = 12),
	plot.subtitle = element_text(size = 10),
	plot.background = element_rect(fill = "#e6e6e6")
	) +
	labs(
	title = "Time to group_by and summarize 1GB CSV",
	subtitle = "Grouping by 1 word column and summarizing 50 numeric columns",
	caption = "@arthurgailes",
	x = "", y = "Total time (s)"
	)

	ggsave("chart/duckplyr_sum_times.png", p, width = 5, height = 3, dpi = 300)