Skip to content

Instantly share code, notes, and snippets.

@nicornk
Created November 24, 2025 20:55
Show Gist options
  • Select an option

  • Save nicornk/ce90b7f51ef8f089592e4881d28ad2f3 to your computer and use it in GitHub Desktop.

Select an option

Save nicornk/ce90b7f51ef8f089592e4881d28ad2f3 to your computer and use it in GitHub Desktop.
TPC-H benchmark at scale factor 1000 (SF-1000) with DuckDB
#!/usr/bin/env python
# coding: utf-8
# In[4]:
import duckdb
import os
import time
import psutil
from pathlib import Path
SCALE_FACTOR = 1000
# In[5]:
def setup_environment():
    """Configure the working environment for the benchmark run.

    Reads SLURM/scratch environment variables, reports the resources
    available on the node, and switches the process CWD to a working
    directory on the (fast) scratch filesystem.

    Returns:
        tuple[Path, str]: the created working directory and the SLURM job
        id (``'unknown'`` when not running under SLURM).
    """
    # Get job info; both variables fall back to sane defaults off-cluster.
    job_id = os.environ.get('SLURM_JOB_ID', 'unknown')
    scratch_dir = os.environ.get('LSCRATCH', '/tmp')
    print(f"Job ID: {job_id}")
    print(f"Scratch directory: {scratch_dir}")
    print(f"Available CPUs: {os.cpu_count()}")
    print(f"Available memory: {psutil.virtual_memory().total / (1024**3):.1f} GB")
    # Create working directory on fast SSD. parents=True guards against a
    # missing intermediate path component on a fresh scratch allocation
    # (plain mkdir(exist_ok=True) would raise FileNotFoundError there).
    work_dir = Path(scratch_dir) / "duckdb_tpch"
    work_dir.mkdir(parents=True, exist_ok=True)
    os.chdir(work_dir)
    return work_dir, job_id
# In[6]:
def create_optimized_connection(work_dir):
    """Open the DuckDB database file and apply per-session settings.

    Args:
        work_dir: directory on fast local storage used for DuckDB temp
            (spill) files.

    Returns:
        An open ``duckdb`` connection to the SF-scaled database file.
    """
    # Database file on fast SSD
    # db_path = Path("/shared/project/scicomp-hpc/nrenkamp") / f"tpch-sf{SCALE_FACTOR}.db"
    db_path = Path("/tmp") / f"tpch-sf{SCALE_FACTOR}.db"
    print(f"{db_path=}")
    # Connect to DuckDB
    conn = duckdb.connect(str(db_path))
    # Session tuning: spill files on the SSD, no progress bar noise, and
    # relaxed insertion ordering for better throughput.
    conn.execute("SET temp_directory=?", [str(work_dir)])
    conn.execute("SET enable_progress_bar=false")
    conn.execute("SET preserve_insertion_order=false")
    print("DuckDB connection configured with:")
    # Echo the effective settings back from the engine, one per line.
    for label, setting in (
        ("Memory limit", "memory_limit"),
        ("Threads", "threads"),
        ("Temp directory", "temp_directory"),
    ):
        (value,) = conn.execute(f"SELECT current_setting('{setting}')").fetchone()
        print(f"- {label}: {value}")
    return conn
# In[7]:
def monitor_resources():
    """Print a snapshot of CPU, memory, and scratch-disk utilisation."""
    print("\n📈 Current resource usage:")
    gib = 1024 ** 3
    # CPU reading is sampled over a one-second window.
    print(f" CPU: {psutil.cpu_percent(interval=1):.1f}%")
    mem = psutil.virtual_memory()
    print(
        f" Memory: {mem.used / gib:.1f}GB / {mem.total / gib:.1f}GB"
        f" ({mem.percent:.1f}%)"
    )
    # Scratch filesystem usage (falls back to /tmp outside the cluster).
    scratch = os.environ.get('LSCRATCH', '/tmp')
    usage = psutil.disk_usage(scratch)
    print(
        f" Scratch disk: {usage.used / gib:.1f}GB / {usage.total / gib:.1f}GB"
        f" ({usage.used / usage.total * 100:.1f}%)"
    )
# In[11]:
# --- Benchmark driver ---------------------------------------------------
# Set up the scratch workspace and a tuned DuckDB connection, then run
# the 22 TPC-H queries, reporting per-query and total wall-clock times.
# Setup environment
work_dir, job_id = setup_environment()
# Create optimized DuckDB connection
conn = create_optimized_connection(work_dir)
try:
    start_time = time.time()
    # Install and load TPC-H extension (provides the queries and dbgen)
    conn.execute("INSTALL tpch")
    conn.execute("LOAD tpch")
    # Generate data - uncomment below (one-off step; the .db file persists)
    #conn.execute(f"CALL dbgen(sf={SCALE_FACTOR})")
    #generation_time = time.time() - start_time
    #print(f"✅ Data generation completed in {generation_time:.2f} seconds")
    monitor_resources()
    # Run all queries (minimal): time each of the 22 TPC-H queries
    queries_start = time.time()
    for i in range(1, 23):
        start = time.time()
        conn.execute(f"PRAGMA tpch({i})").fetchall()
        print(f"Query {i}: {time.time() - start:.3f}s")
    print(f"Total time: {time.time() - queries_start:.3f}s")
    monitor_resources()
finally:
    # Close the connection so the database file is flushed cleanly even if
    # a query fails part-way through (the original leaked the connection).
    conn.close()
# In[10]:
monitor_resources()
# In[ ]:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment