Created
July 30, 2025 11:28
-
-
Save lwaldron/e5426f35599c2b89303603a90686e0ec to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Title: Benchmark Query Performance on Sorted vs. Unsorted Parquet Files | |
# Description: This script first creates a new Parquet file sorted by the | |
# '# Gene Family' column. It then uses the 'microbenchmark' | |
# package to compare the query speed for a specific gene | |
# family between the original, unsorted file and the new, | |
# sorted file. | |
# --- 1. SETUP: Load necessary libraries --- | |
# Ensure you have these packages installed: | |
# install.packages(c("duckdb", "dplyr", "arrow", "microbenchmark")) | |
library(duckdb) | |
library(dplyr) | |
library(arrow) | |
library(microbenchmark) | |
# --- 2. PREPARATION: Define file paths and connect to DuckDB --- | |
# IMPORTANT: Please verify the path to your original Parquet file. | |
original_path <- path.expand("~/Downloads/genefamilies_unstratified.parquet") | |
sorted_path <- path.expand("~/Downloads/genefamilies_sorted_by_family.parquet") | |
# Establish a connection to an in-memory DuckDB database | |
con <- dbConnect(duckdb(), dbdir = ":memory:", read_only = FALSE) | |
# --- 3. CREATE SORTED FILE: Read, sort, and write the new Parquet file --- | |
# Check if the sorted file already exists to avoid re-creating it unnecessarily. | |
if (!file.exists(sorted_path)) { | |
cat("Sorted Parquet file not found. Creating it now...\n") | |
# Register the original Parquet file as a table in DuckDB | |
duck_tbl <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')"))) | |
# Use dplyr to arrange the data by the '# Gene Family' column | |
# The backticks are necessary because the column name contains a space and '#' | |
sorted_tbl <- duck_tbl %>% | |
arrange(`# Gene Family`) | |
# Write the sorted data to a new Parquet file using the arrow package | |
# The collect() function brings the sorted data from DuckDB into an R data frame | |
write_parquet(collect(sorted_tbl), sorted_path) | |
cat("Successfully created sorted file at:", sorted_path, "\n") | |
} else { | |
cat("Sorted file already exists at:", sorted_path, "\n") | |
} | |
# --- 4. BENCHMARKING: Compare query times --- | |
cat("\nNow benchmarking query performance...\n") | |
# Define the gene family to query for | |
target_family <- "UniRef90_C9KZX3" | |
# Use microbenchmark to run each query multiple times for accurate results | |
benchmark_results <- microbenchmark( | |
# Query on the original file (sorted by 'filename') | |
original_file = { | |
tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')"))) %>% | |
filter(`# Gene Family` == target_family) %>% | |
collect() | |
}, | |
# Query on the new file (sorted by '# Gene Family') | |
sorted_file = { | |
tbl(con, sql(paste0("SELECT * FROM read_parquet('", sorted_path, "')"))) %>% | |
filter(`# Gene Family` == target_family) %>% | |
collect() | |
}, | |
times = 25L # Number of times to run each test. Adjust as needed. | |
) | |
# --- 5. RESULTS: Print the benchmark summary --- | |
cat("\n--- Benchmark Results ---\n") | |
print(benchmark_results) | |
# --- 6. CLEANUP: Disconnect from the database --- | |
dbDisconnect(con, shutdown = TRUE) | |
sessionInfo() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment