Skip to content

Instantly share code, notes, and snippets.

@lwaldron
Created July 30, 2025 11:28
Show Gist options
  • Save lwaldron/e5426f35599c2b89303603a90686e0ec to your computer and use it in GitHub Desktop.
Save lwaldron/e5426f35599c2b89303603a90686e0ec to your computer and use it in GitHub Desktop.
# Title: Benchmark Query Performance on Sorted vs. Unsorted Parquet Files
# Description: This script first creates a new Parquet file sorted by the
# '# Gene Family' column. It then uses the 'microbenchmark'
# package to compare the query speed for a specific gene
# family between the original, unsorted file and the new,
# sorted file.
# --- 1. SETUP: Load necessary libraries ---
# Ensure you have these packages installed:
# install.packages(c("duckdb", "dplyr", "arrow", "microbenchmark"))
library(duckdb)
library(dplyr)
library(arrow)
library(microbenchmark)
# --- 2. PREPARATION: Define file paths and connect to DuckDB ---
# IMPORTANT: Please verify the path to your original Parquet file.
original_path <- path.expand("~/Downloads/genefamilies_unstratified.parquet")
sorted_path <- path.expand("~/Downloads/genefamilies_sorted_by_family.parquet")
# Establish a connection to an in-memory DuckDB database
con <- dbConnect(duckdb(), dbdir = ":memory:", read_only = FALSE)
# --- 3. CREATE SORTED FILE: Read, sort, and write the new Parquet file ---
# Check if the sorted file already exists to avoid re-creating it unnecessarily.
if (!file.exists(sorted_path)) {
cat("Sorted Parquet file not found. Creating it now...\n")
# Register the original Parquet file as a table in DuckDB
duck_tbl <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')")))
# Use dplyr to arrange the data by the '# Gene Family' column
# The backticks are necessary because the column name contains a space and '#'
sorted_tbl <- duck_tbl %>%
arrange(`# Gene Family`)
# Write the sorted data to a new Parquet file using the arrow package
# The collect() function brings the sorted data from DuckDB into an R data frame
write_parquet(collect(sorted_tbl), sorted_path)
cat("Successfully created sorted file at:", sorted_path, "\n")
} else {
cat("Sorted file already exists at:", sorted_path, "\n")
}
# --- 4. BENCHMARKING: Compare query times ---
cat("\nNow benchmarking query performance...\n")
# Define the gene family to query for
target_family <- "UniRef90_C9KZX3"
# Use microbenchmark to run each query multiple times for accurate results
benchmark_results <- microbenchmark(
# Query on the original file (sorted by 'filename')
original_file = {
tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')"))) %>%
filter(`# Gene Family` == target_family) %>%
collect()
},
# Query on the new file (sorted by '# Gene Family')
sorted_file = {
tbl(con, sql(paste0("SELECT * FROM read_parquet('", sorted_path, "')"))) %>%
filter(`# Gene Family` == target_family) %>%
collect()
},
times = 25L # Number of times to run each test. Adjust as needed.
)
# --- 5. RESULTS: Print the benchmark summary ---
cat("\n--- Benchmark Results ---\n")
print(benchmark_results)
# --- 6. CLEANUP: Disconnect from the database ---
dbDisconnect(con, shutdown = TRUE)
sessionInfo()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment