Created
November 24, 2025 20:55
-
-
Save nicornk/ce90b7f51ef8f089592e4881d28ad2f3 to your computer and use it in GitHub Desktop.
tpch-1000 with duckdb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # coding: utf-8 | |
| # In[4]: | |
| import duckdb | |
| import os | |
| import time | |
| import psutil | |
| from pathlib import Path | |
| SCALE_FACTOR = 1000 | |
| # In[5]: | |
| def setup_environment(): | |
| """Configure environment for optimal performance""" | |
| # Get job info | |
| job_id = os.environ.get('SLURM_JOB_ID', 'unknown') | |
| scratch_dir = os.environ.get('LSCRATCH', '/tmp') | |
| print(f"Job ID: {job_id}") | |
| print(f"Scratch directory: {scratch_dir}") | |
| print(f"Available CPUs: {os.cpu_count()}") | |
| print(f"Available memory: {psutil.virtual_memory().total / (1024**3):.1f} GB") | |
| # Create working directory on fast SSD | |
| work_dir = Path(scratch_dir) / "duckdb_tpch" | |
| work_dir.mkdir(exist_ok=True) | |
| os.chdir(work_dir) | |
| return work_dir, job_id | |
| # In[6]: | |
| def create_optimized_connection(work_dir): | |
| """Create DuckDB connection with optimal settings""" | |
| # Database file on fast SSD | |
| # db_path = Path("/shared/project/scicomp-hpc/nrenkamp") / f"tpch-sf{SCALE_FACTOR}.db" | |
| db_path = Path("/tmp") / f"tpch-sf{SCALE_FACTOR}.db" | |
| print(f"{db_path=}") | |
| # Connect to DuckDB | |
| conn = duckdb.connect(str(db_path)) | |
| # Configure for your massive resources | |
| conn.execute("SET temp_directory=?", [str(work_dir)]) # Temp files on SSD | |
| conn.execute("SET enable_progress_bar=false") | |
| conn.execute("SET preserve_insertion_order=false") # Better performance | |
| print("DuckDB connection configured with:") | |
| # Get and print memory limit | |
| memory_result = conn.execute("SELECT current_setting('memory_limit')").fetchone() | |
| print(f"- Memory limit: {memory_result[0]}") | |
| # Get and print thread count | |
| threads_result = conn.execute("SELECT current_setting('threads')").fetchone() | |
| print(f"- Threads: {threads_result[0]}") | |
| # Get and print temp directory | |
| temp_dir_result = conn.execute("SELECT current_setting('temp_directory')").fetchone() | |
| print(f"- Temp directory: {temp_dir_result[0]}") | |
| return conn | |
| # In[7]: | |
| def monitor_resources(): | |
| """Display current resource usage""" | |
| print("\n📈 Current resource usage:") | |
| # CPU usage | |
| cpu_percent = psutil.cpu_percent(interval=1) | |
| print(f" CPU: {cpu_percent:.1f}%") | |
| # Memory usage | |
| memory = psutil.virtual_memory() | |
| print(f" Memory: {memory.used / (1024**3):.1f}GB / {memory.total / (1024**3):.1f}GB ({memory.percent:.1f}%)") | |
| # Disk usage for scratch | |
| scratch_dir = os.environ.get('LSCRATCH', '/tmp') | |
| disk = psutil.disk_usage(scratch_dir) | |
| print(f" Scratch disk: {disk.used / (1024**3):.1f}GB / {disk.total / (1024**3):.1f}GB ({disk.used/disk.total*100:.1f}%)") | |
| # In[11]: | |
| # Setup environment | |
| work_dir, job_id = setup_environment() | |
| # Create optimized DuckDB connection | |
| conn = create_optimized_connection(work_dir) | |
| start_time = time.time() | |
| # Install and load TPC-H extension | |
| conn.execute("INSTALL tpch") | |
| conn.execute("LOAD tpch") | |
| # Generate data - uncomment below | |
| #conn.execute(f"CALL dbgen(sf={SCALE_FACTOR})") | |
| #generation_time = time.time() - start_time | |
| #print(f"✅ Data generation completed in {generation_time:.2f} seconds") | |
| monitor_resources() | |
| # Run all queries (minimal) | |
| queries_start = time.time() | |
| for i in range(1, 23): | |
| start = time.time() | |
| conn.execute(f"PRAGMA tpch({i})").fetchall() | |
| print(f"Query {i}: {time.time() - start:.3f}s") | |
| print(f"Total time: {time.time() - queries_start:.3f}s") | |
| monitor_resources() | |
| # In[10]: | |
| monitor_resources() | |
| # In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment