lwaldron · July 30, 2025 11:28
diff --git a/parquet_sorting_benchmark.R b/parquet_sorting_benchmark.R
 # Title: Benchmark Query Performance on Sorted vs. Unsorted Parquet Files
 # Description: This script first creates a new Parquet file sorted by the
 #              '# Gene Family' column. It then uses the 'microbenchmark'
 #              package to compare the query speed for a specific gene
 #              family between the original, unsorted file and the new,
 #              sorted file.

 # --- 1. SETUP: Load necessary libraries ---
 # Ensure you have these packages installed:
 # install.packages(c("duckdb", "dplyr", "arrow", "microbenchmark"))

 library(duckdb)
 library(dplyr)
 library(arrow)
 library(microbenchmark)

 # --- 2. PREPARATION: Define file paths and connect to DuckDB ---

 # IMPORTANT: Please verify the path to your original Parquet file.
 original_path <- path.expand("~/Downloads/genefamilies_unstratified.parquet")
 sorted_path <- path.expand("~/Downloads/genefamilies_sorted_by_family.parquet")

 # Establish a connection to an in-memory DuckDB database
 con <- dbConnect(duckdb(), dbdir = ":memory:", read_only = FALSE)

 # --- 3. CREATE SORTED FILE: Read, sort, and write the new Parquet file ---

 # Check if the sorted file already exists to avoid re-creating it unnecessarily.
 if (!file.exists(sorted_path)) {
    cat("Sorted Parquet file not found. Creating it now...\n")
    
    # Register the original Parquet file as a table in DuckDB
    duck_tbl <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')")))
    
    # Use dplyr to arrange the data by the '# Gene Family' column
    # The backticks are necessary because the column name contains a space and '#'
    sorted_tbl <- duck_tbl %>%
        arrange(`# Gene Family`)
    
    # Write the sorted data to a new Parquet file using the arrow package
    # The collect() function brings the sorted data from DuckDB into an R data frame
    write_parquet(collect(sorted_tbl), sorted_path)
    
    cat("Successfully created sorted file at:", sorted_path, "\n")
 } else {
    cat("Sorted file already exists at:", sorted_path, "\n")
 }


 # --- 4. BENCHMARKING: Compare query times ---

 cat("\nNow benchmarking query performance...\n")

 # Define the gene family to query for
 target_family <- "UniRef90_C9KZX3"

 # Use microbenchmark to run each query multiple times for accurate results
 benchmark_results <- microbenchmark(
    # Query on the original file (sorted by 'filename')
    original_file = {
        tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')"))) %>%
            filter(`# Gene Family` == target_family) %>%
            collect()
    },
    # Query on the new file (sorted by '# Gene Family')
    sorted_file = {
        tbl(con, sql(paste0("SELECT * FROM read_parquet('", sorted_path, "')"))) %>%
            filter(`# Gene Family` == target_family) %>%
            collect()
    },
    times = 25L # Number of times to run each test. Adjust as needed.
 )

 # --- 5. RESULTS: Print the benchmark summary ---

 cat("\n--- Benchmark Results ---\n")
 print(benchmark_results)

 # --- 6. CLEANUP: Disconnect from the database ---
 dbDisconnect(con, shutdown = TRUE)

 sessionInfo()
	# Title: Benchmark Query Performance on Sorted vs. Unsorted Parquet Files
	# Description: This script first creates a new Parquet file sorted by the
	# '# Gene Family' column. It then uses the 'microbenchmark'
	# package to compare the query speed for a specific gene
	# family between the original, unsorted file and the new,
	# sorted file.

	# --- 1. SETUP: Load necessary libraries ---
	# Ensure you have these packages installed:
	# install.packages(c("duckdb", "dplyr", "arrow", "microbenchmark"))

	library(duckdb)
	library(dplyr)
	library(arrow)
	library(microbenchmark)

	# --- 2. PREPARATION: Define file paths and connect to DuckDB ---

	# IMPORTANT: Please verify the path to your original Parquet file.
	original_path <- path.expand("~/Downloads/genefamilies_unstratified.parquet")
	sorted_path <- path.expand("~/Downloads/genefamilies_sorted_by_family.parquet")

	# Establish a connection to an in-memory DuckDB database
	con <- dbConnect(duckdb(), dbdir = ":memory:", read_only = FALSE)

	# --- 3. CREATE SORTED FILE: Read, sort, and write the new Parquet file ---

	# Check if the sorted file already exists to avoid re-creating it unnecessarily.
	if (!file.exists(sorted_path)) {
	cat("Sorted Parquet file not found. Creating it now...\n")

	# Register the original Parquet file as a table in DuckDB
	duck_tbl <- tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')")))

	# Use dplyr to arrange the data by the '# Gene Family' column
	# The backticks are necessary because the column name contains a space and '#'
	sorted_tbl <- duck_tbl %>%
	arrange(`# Gene Family`)

	# Write the sorted data to a new Parquet file using the arrow package
	# The collect() function brings the sorted data from DuckDB into an R data frame
	write_parquet(collect(sorted_tbl), sorted_path)

	cat("Successfully created sorted file at:", sorted_path, "\n")
	} else {
	cat("Sorted file already exists at:", sorted_path, "\n")
	}


	# --- 4. BENCHMARKING: Compare query times ---

	cat("\nNow benchmarking query performance...\n")

	# Define the gene family to query for
	target_family <- "UniRef90_C9KZX3"

	# Use microbenchmark to run each query multiple times for accurate results
	benchmark_results <- microbenchmark(
	# Query on the original file (sorted by 'filename')
	original_file = {
	tbl(con, sql(paste0("SELECT * FROM read_parquet('", original_path, "')"))) %>%
	filter(`# Gene Family` == target_family) %>%
	collect()
	},
	# Query on the new file (sorted by '# Gene Family')
	sorted_file = {
	tbl(con, sql(paste0("SELECT * FROM read_parquet('", sorted_path, "')"))) %>%
	filter(`# Gene Family` == target_family) %>%
	collect()
	},
	times = 25L # Number of times to run each test. Adjust as needed.
	)

	# --- 5. RESULTS: Print the benchmark summary ---

	cat("\n--- Benchmark Results ---\n")
	print(benchmark_results)

	# --- 6. CLEANUP: Disconnect from the database ---
	dbDisconnect(con, shutdown = TRUE)

	sessionInfo()