pg_extension efficient cross-database discovery

A PL/pgSQL function that reports installed extensions across all PostgreSQL databases from a single connection, by parsing raw heap files instead of opening a connection to each database.

Background

pg_extension is a per-database catalog. There is no cross-database view of installed extensions in PostgreSQL. The standard way to collect this data in postgres_exporter is --auto-discover-databases, which opens a new connection to every database on every scrape.

At small scale this is fine. At hundreds or thousands of databases it becomes a real problem:

  • Each scrape opens and closes N connections, each with authentication, process fork, and catalog lookup overhead
  • At 1000 databases on a 60-second scrape interval, you're hammering the postmaster with connection churn continuously
  • max_connections has to be sized for burst, not steady state
  • PgBouncer helps but adds operational complexity and doesn't eliminate the per-database-per-scrape connection entirely

The goal: collect (datname, extname) rows for every installed extension across every database, issued as a single query from a single connection.

Design

pg_extension has a fixed catalog OID of 3079 in every PostgreSQL installation. Its heap file therefore always lives at:

$PGDATA/base/<db_oid>/3079

From a single connection to the postgres database, we use pg_read_binary_file() to read each database's file, then parse the heap page format directly in PL/pgSQL.
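
A minimal sketch of that read, assuming superuser or pg_read_server_files and the default relfilenode of 3079 (the full function further down adds error handling and the page parsing):

-- Minimal sketch: read every database's pg_extension heap file from one
-- connection and report its size. Parsing is shown in the full function.
SELECT d.datname,
       octet_length(pg_read_binary_file('base/' || d.oid || '/3079')) AS heap_bytes
FROM pg_database d
WHERE d.datallowconn;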

Page parsing

Each file is a sequence of 8192-byte heap pages. For each page:

  1. Read pd_lower (bytes 12–13, little-endian uint16) to find the end of the line pointer array
  2. For each 4-byte line pointer starting at offset 24:
    • Extract lp_off (bits 0–14) — offset to the tuple within the page
    • Extract lp_flags (bits 15–16) — skip anything that isn't LP_NORMAL (1)
  3. At the tuple, read t_infomask (bytes 20–21) and check:
    • HEAP_XMIN_COMMITTED (0x0100) — inserting transaction committed
    • HEAP_XMAX_INVALID (0x0800) — tuple has not been deleted
  4. Read t_hoff (byte 22) — offset from tuple start to user data
  5. In pg_extension user data (PostgreSQL 12+):
    • bytes 0–3: oid
    • bytes 4–67: extname (Name type, 64 bytes, null-padded)

Extract extname, strip null bytes, yield (datname, extname).
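
As a standalone illustration of steps 1 and 2, here is one way to decode pd_lower and the first line pointer of the first page for the connected database (assumes read privileges and the default relfilenode; get_byte() offsets are 0-based):

-- Decode pd_lower (bytes 12-13) and the first line pointer (bytes 24-27)
-- of the first 8192-byte page of this database's pg_extension file.
WITH page AS (
  SELECT pg_read_binary_file(
           'base/' || (SELECT oid FROM pg_database
                        WHERE datname = current_database()) || '/3079',
           0, 8192) AS p
)
SELECT get_byte(p, 12) + get_byte(p, 13) * 256 AS pd_lower,
       lp_raw & 32767      AS lp_off,    -- bits 0-14: tuple offset within the page
       (lp_raw >> 15) & 3  AS lp_flags   -- bits 15-16: 1 = LP_NORMAL
FROM page,
     LATERAL (SELECT get_byte(p, 24)
                   + (get_byte(p, 25) << 8)
                   + (get_byte(p, 26) << 16)
                   + (get_byte(p, 27) << 24) AS lp_raw) lp;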

Assumptions and constraints

  • PostgreSQL 12+ (OID became a regular user column in PG12; pre-12 it lives in the tuple header at a different offset)
  • pg_read_binary_file() requires superuser or the pg_read_server_files role
  • Assumes relfilenode == 3079 for pg_extension in every database — true unless VACUUM FULL was run on the catalog, which only a superuser can do and almost never happens
  • Reads from disk; pages modified since the last checkpoint may not be visible. Acceptable for monitoring — pg_extension barely changes and background checkpointing covers it within ~5 minutes

Files

File                           Purpose
get_all_extensions.sql         The function itself
bench_get_all_extensions.sql   Benchmarking wrapper that captures /proc/self/io deltas
Dockerfile                     PostgreSQL 17 image with sysstat/procps for benchmarking
postgresql-bench.conf          Tuned postgresql.conf for benchmark runs
setup_databases.sh             Creates N bench databases in parallel with extensions installed
bench.sh                       Runs the benchmark at a given scale, reports latency + I/O
run_all.sh                     Full suite: spins up Docker, runs at 100/1000/10000 databases

Usage

Install the function

-- Run against the database your exporter connects to (e.g., postgres)
psql -d postgres -f get_all_extensions.sql

-- Grant access to the exporter role
GRANT EXECUTE ON FUNCTION public.get_all_extensions() TO <exporter_role>;
GRANT pg_read_server_files TO <exporter_role>;

Query it directly

SELECT datname, extname
FROM public.get_all_extensions()
ORDER BY datname, extname;
      datname       |      extname
--------------------+--------------------
 airbyte            | plpgsql
 events_development | citext
 events_development | plpgsql
 myapp              | pg_stat_statements
 myapp              | plpgsql
 myapp              | uuid-ossp

Use with postgres_exporter

The companion query YAML (in the postgres_exporter repo at queries/pg_extensions_installed.yaml) emits one gauge per (datname, extname) pair:

pg_extensions_installed_installed{datname="myapp",extname="pg_stat_statements",server="..."} 1

Run with:

postgres_exporter --extend.query-path=queries/pg_extensions_installed.yaml

Run the benchmark suite

Requires Docker (OrbStack or Docker Desktop) and psql on the host.

# Full run: builds image, creates 100/1000/10000 databases, benchmarks each
./run_all.sh

# Keep the container running after the run (useful for further testing)
./run_all.sh --keep

# Benchmark a specific scale against an already-running container
./setup_databases.sh 1000 40   # create 1000 databases, 40 parallel workers
./bench.sh 1000 7              # 7 timed runs

Results

Benchmarked on PostgreSQL 17.9 in Docker (OrbStack, arm64 Mac), shared_buffers=256MB, autovacuum=off, track_io_timing=on. All timed runs after a warmup call (OS page cache warm). ~10% of databases have a second extension (fuzzystrmatch) installed.

Query latency

Scale        min     avg     p99     max
100 DBs      10ms    14ms    19ms    21ms
1,000 DBs    21ms    26ms    28ms    28ms
10,000 DBs   109ms   152ms   224ms   227ms

Latency scales roughly linearly. At 10k databases the function reads 78MB of heap files; at that point it becomes I/O-bound rather than CPU-bound.

Disk I/O

pg_read_binary_file() bypasses the PostgreSQL buffer pool and reads directly from the OS. I/O is measured via /proc/self/io deltas inside the postgres backend process.

Scale    rchar (all read() syscalls)   read_bytes (physical disk, page cache misses)   pg_extension files on disk
100      0.79 MB                       0 MB (warm cache)                               824 KB
1,000    7.82 MB                       0 MB (warm cache)                               7.9 MB
10,000   78.1 MB                       0 MB (warm cache)                               ~78 MB

rchar scales at exactly N × 8 KB — one read() syscall per database. read_bytes is 0 on warm-cache runs; on a cold call (post-reboot, post-PITR) it would equal rchar.

Memory

Backend RSS is flat at ~9 MB across all scales. The function reads each file, parses it, and discards the bytea; it does not accumulate state. Container memory growth with scale comes from the OS page cache filling with heap files, not from the backend.

CPU

Scale    CPU% during call   Wall time (including psql client)
100      79%                0.10s
1,000    64%                0.09s
10,000   34%                0.22s

CPU% drops as scale grows because at large N the function is I/O-bound. Absolute CPU time per call remains low.

Risks

This approach bypasses the PostgreSQL storage engine entirely. That means it bypasses all of PostgreSQL's correctness and safety guarantees. The failure mode for almost every risk below is silent wrong results with no error signal.

1. Relfilenode drift — silent wrong results

VACUUM FULL rewrites a table and assigns a new relfilenode. If someone runs VACUUM FULL on pg_extension in another database (requires superuser), the file at base/<db_oid>/3079 either doesn't exist, belongs to a different relation, or contains unrelated data. The function either skips that database silently or — worse — parses a completely different relation's heap pages and returns garbage extension names. The function verifies the connected database's relfilenode and emits a WARNING if it has drifted, but cannot check other databases.

Practical likelihood: very low. Only a superuser can run VACUUM FULL on a system catalog, and it would have to be targeted or a blanket cluster-wide VACUUM FULL.
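
The function's own check for the connected database is just a catalog lookup; the same query can be run manually in any database you can connect to:

-- relfilenode starts out equal to the catalog OID and only changes if the
-- relation is rewritten (e.g. by VACUUM FULL). Expect 3079 here.
SELECT relfilenode FROM pg_class WHERE oid = 3079;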

2. Hint bits not set — silent missing data

We check HEAP_XMIN_COMMITTED to confirm the inserting transaction committed. Hint bits are written lazily — only after a backend reads the tuple and resolves its transaction status. This means:

  • Right after pg_upgrade: hint bits may not be set on any tuple in any database. The function returns zero extensions everywhere until each database is read by a normal query.
  • Right after a PITR restore from backup: same.
  • A brand-new database before PostgreSQL has read its own pg_extension.

Result: zero rows returned for affected databases, silently.
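
One remediation after an upgrade or restore is to touch pg_extension in each database and then checkpoint, which is what setup_databases.sh does for the benchmark; a per-database sketch:

-- Run in each affected database. The scan resolves transaction status and
-- sets HEAP_XMIN_COMMITTED hint bits; CHECKPOINT flushes the hinted pages.
SELECT count(*) FROM pg_extension;
CHECKPOINT;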

3. Replica WAL replay race condition

On a streaming replica, pg_read_binary_file() reads from disk while WAL replay runs concurrently and writes to the same pages. WAL replay takes locks on buffers, not on physical files. We can read() a page mid-replay, getting a torn page that we then parse as if it were valid. Result: corrupt or missing extension names with no error.
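
One blunt way to sidestep the race, if it matters in a given deployment, is to detect standby mode and avoid the raw-file path there (not part of the function as written):

-- Returns true on a streaming replica; fall back to per-database collection
-- there, or accept the torn-page risk for monitoring purposes.
SELECT pg_is_in_recovery();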

4. Page checksums are not validated

If data_checksums is enabled, PostgreSQL validates page checksums when pages enter the buffer pool. pg_read_binary_file() bypasses the buffer pool, so no checksum validation happens. A corrupt page — torn write, bit rot, faulty storage — is parsed as a valid heap page. Silent wrong results.
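
Whether checksums are enabled at all is easy to confirm, though enabling them changes nothing here, since this code path never verifies them:

-- Cluster-wide setting; pg_read_binary_file() skips validation either way.
SHOW data_checksums;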

5. Broad filesystem read privilege

The function requires pg_read_server_files, which grants the ability to read any file the PostgreSQL process can read — not just base/*/3079. This includes postgresql.conf, pg_hba.conf, SSL private keys, WAL segments, and heap files for all relations in all databases. The SECURITY DEFINER wrapper limits what callers can do with the function body, but the owning role's privilege is permanently broad.

Symlink attack: if an attacker has OS write access to the data directory, they can replace base/<db_oid>/3079 with a symlink to any file (/etc/ssl/private/server.key, etc.). The function reads the target file into the postgres process. An attacker who understands the parser's structure (check infomask at known offsets, extract 64 bytes at lp_off + t_hoff + 4) could craft the symlink target to be a file whose bytes satisfy the visibility checks, causing the function to return 64-byte chunks of that file as "extension names" in the result set — a covert 64-bytes-at-a-time file exfiltration channel, visible in Prometheus metrics.

Prerequisites: OS write access to the data directory + EXECUTE on the function. The data directory should already be protected from non-postgres writes, but container misconfigs, shared NFS mounts, or compromised sidecars can violate this assumption.
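
A narrow, partial mitigation is to ensure only the exporter role can invoke the SECURITY DEFINER wrapper at all, since new functions are executable by PUBLIC by default (a sketch; it limits who can trigger the reads, not what the owning role can read):

-- <exporter_role> is a placeholder; substitute the role the exporter uses.
REVOKE EXECUTE ON FUNCTION public.get_all_extensions() FROM PUBLIC;
GRANT EXECUTE ON FUNCTION public.get_all_extensions() TO <exporter_role>;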

6. Hardcoded catalog layout couples to PostgreSQL internals

Tuple header field offsets and pg_extension column layout are hardcoded. These are PostgreSQL internals with no stability guarantee:

  • A future major version adding a column before extname would shift our offset and produce silent garbage
  • We never validate the page format version (pd_pagesize_version) — a page from a version with a different layout would be silently misread (a possible check is sketched after this list)
  • Post-pg_upgrade, pages written by the old version use the old format until they are rewritten by normal operations
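
If that gap warrants a guard, the page format version can be checked before parsing; an illustrative standalone query (the constant 8196 assumes 8 kB pages and the current page layout version 4):

-- pd_pagesize_version is bytes 18-19 of the page header and encodes
-- BLCKSZ | layout version; 8192 | 4 = 8196 on current releases.
SELECT get_byte(p, 18) + get_byte(p, 19) * 256 = 8196 AS page_format_ok
FROM (SELECT pg_read_binary_file(
               'base/' || (SELECT oid FROM pg_database
                            WHERE datname = current_database()) || '/3079',
               0, 8192) AS p) s;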

7. Snapshot inconsistency across the scan

Reading 10,000 files sequentially takes ~150ms. Between reading the first and last database, extensions can be installed, removed, or databases dropped. The result set has no consistent point-in-time. For monitoring this is usually acceptable; for compliance enforcement it is not.

8. Multi-segment heap files

PostgreSQL splits heap files into 1GB segments: 3079, 3079.1, 3079.2. We only read 3079. A database with enough extensions to overflow a segment (millions) would have the rest silently omitted. Not a real concern for pg_extension specifically, but a latent correctness bug if this pattern is adapted for larger catalogs.

9. MVCC approximation errors

Our visibility filter is a heuristic, not real MVCC:

  • False negatives: a deleted extension whose HEAP_XMAX_INVALID hint bit hasn't been set yet (aborted delete, unresolved xmax) gets incorrectly excluded — a live extension appears missing.
  • False positives: a deleted extension whose xmax is a committed transaction but whose HEAP_XMAX_INVALID bit hasn't been cleared yet appears as still installed.

Both windows are very short for pg_extension given its near-zero churn, but they exist.


Summary

Risk                                         Likelihood                             Impact                       Detectable?
Relfilenode drift (VACUUM FULL on catalog)   Very low                               Silent wrong results         No
Hint bits unset (upgrade / restore)          Low–medium                             Silent missing data          No
Replica WAL replay race                      Medium                                 Corrupt results              No
Checksum bypass                              Low–medium                             Corrupt results              No
Symlink → file exfiltration                  Low (requires data dir write access)   Security breach              No
pg_read_server_files scope                   Depends on threat model                Security breach              N/A
Hardcoded layout vs future PG version        Low                                    Silent wrong results         No
Snapshot inconsistency                       Certain                                Stale data (by design)       By design
Multi-segment heap                           Near zero                              Silent truncation            No
MVCC approximation                           Very low                               ±1 extension, short window   No

The theme throughout: bypassing the database engine means bypassing its correctness guarantees. The failure mode is almost always a silent wrong answer. For a read-only monitoring function over nearly-static data, this tradeoff may be acceptable — but go in with clear eyes.

#!/usr/bin/env bash
# bench.sh <n_databases> [n_runs]
#
# Benchmarks get_all_extensions() at a given database count using
# bench_get_all_extensions() which captures /proc/self/io deltas.
#
# Metrics per run:
# wall_ms - total wall-clock time (ms)
# rchar - bytes read via read() syscalls (page-cache-inclusive)
# read_bytes - bytes fetched from storage (page-cache misses only)
# n_rows - extension rows returned
#
# Also captures container-level CPU and memory via docker stats.
set -euo pipefail
N=${1:-100}
RUNS=${2:-5}
PSQL="psql -h 127.0.0.1 -p 5433 -U postgres postgres -t -A"
CONTAINER="pg_bench"
echo "========================================================"
echo " get_all_extensions() benchmark"
echo " databases : $N"
echo " runs : $RUNS"
echo "========================================================"
# ── Container baseline stats ──────────────────────────────────────────────────
echo ""
MEM_BEFORE=$(/usr/local/bin/docker stats --no-stream \
--format '{{.MemUsage}}' "$CONTAINER" 2>/dev/null | awk '{print $1}')
echo "── Container memory before: $MEM_BEFORE"
# ── Warmup run (not counted) ─────────────────────────────────────────────────
echo "── Warmup run (excluded from results)..."
$PSQL -c "SELECT wall_ms FROM public.bench_get_all_extensions();" > /dev/null
# ── Timed runs ───────────────────────────────────────────────────────────────
echo ""
echo "── Timed runs ───────────────────────────────────────────────────────────"
printf " %-4s %-10s %-8s %-14s %-14s %-6s\n" \
"run" "wall_ms" "n_rows" "rchar" "read_bytes"
declare -a WALL_TIMES=()
declare -a RCHARS=()
declare -a READ_BYTES=()
for i in $(seq 1 $RUNS); do
ROW=$($PSQL -F'|' -c "SELECT n_rows, wall_ms, rchar, read_bytes FROM public.bench_get_all_extensions();")
N_ROWS=$(echo "$ROW" | awk -F'|' '{print $1}')
WALL=$(echo "$ROW" | awk -F'|' '{print $2}')
RCHAR=$(echo "$ROW" | awk -F'|' '{print $3}')
RBYTES=$(echo "$ROW" | awk -F'|' '{print $4}')
WALL_TIMES+=("$WALL")
RCHARS+=("$RCHAR")
READ_BYTES+=("$RBYTES")
# Format bytes human-readable
RCHAR_MB=$(echo "scale=1; $RCHAR / 1048576" | bc 2>/dev/null || echo "?")
RBYTES_MB=$(echo "scale=1; $RBYTES / 1048576" | bc 2>/dev/null || echo "?")
printf " %-4d %-10s %-8s %-8s MB %-8s MB\n" \
"$i" "${WALL}ms" "$N_ROWS" "$RCHAR_MB" "$RBYTES_MB"
done
# ── Container stats after ────────────────────────────────────────────────────
MEM_AFTER=$(/usr/local/bin/docker stats --no-stream \
--format '{{.MemUsage}}' "$CONTAINER" 2>/dev/null | awk '{print $1}')
# ── CPU: one more run under /usr/bin/time inside container ───────────────────
echo ""
echo "── CPU (1 run via /usr/bin/time inside container) ──────────────────────"
/usr/local/bin/docker exec "$CONTAINER" \
/usr/bin/time -v \
psql -U postgres postgres -t -A \
-c "SELECT n_rows, wall_ms, rchar, read_bytes FROM public.bench_get_all_extensions();" \
2>&1 | grep -E "wall clock|Maximum resident|Percent of CPU|n_rows|wall_ms|\|"
# ── Summary statistics ────────────────────────────────────────────────────────
echo ""
echo "── Summary ──────────────────────────────────────────────────────────────"
# Compute min/max/avg for wall_ms (values may be decimals; use awk)
printf '%s\n' "${WALL_TIMES[@]}" | sort -n > /tmp/bench_wall.txt
WALL_MIN=$(head -1 /tmp/bench_wall.txt)
WALL_MAX=$(tail -1 /tmp/bench_wall.txt)
WALL_AVG=$(awk '{s+=$1; c++} END{printf "%.0f", s/c}' /tmp/bench_wall.txt)
P99_IDX=$(echo "($RUNS * 99 / 100)" | bc 2>/dev/null || echo 1)
[ "$P99_IDX" -lt 1 ] && P99_IDX=1
WALL_P99=$(sed -n "${P99_IDX}p" /tmp/bench_wall.txt)
# Avg rchar and read_bytes
RCHAR_AVG=$(printf '%s\n' "${RCHARS[@]}" | awk '{s+=$1; c++} END{printf "%.0f", s/c}')
RBYTES_AVG=$(printf '%s\n' "${READ_BYTES[@]}" | awk '{s+=$1; c++} END{printf "%.0f", s/c}')
printf " wall_ms : min=%-8s avg=%-8s p99=%-8s max=%s\n" \
"${WALL_MIN}ms" "${WALL_AVG}ms" "${WALL_P99:-$WALL_MAX}ms" "${WALL_MAX}ms"
printf " rchar : avg=%.2f MB (all read() syscalls, incl page cache)\n" \
"$(echo "scale=2; $RCHAR_AVG / 1048576" | bc 2>/dev/null || echo 0)"
printf " read_bytes: avg=%.2f MB (physical disk I/O, page cache misses)\n" \
"$(echo "scale=2; $RBYTES_AVG / 1048576" | bc 2>/dev/null || echo 0)"
printf " memory : before=%-12s after=%s\n" "$MEM_BEFORE" "$MEM_AFTER"
echo ""
# ── OS page cache effect: expected disk reads per DB ────────────────────────
EXPECTED_MB=$(echo "scale=2; $N * 8192 / 1048576" | bc 2>/dev/null || echo "?")
echo "── Expected data volume ─────────────────────────────────────────────────"
echo " $N databases × 8 KB/file = ${EXPECTED_MB} MB expected rchar (cold)"
echo ""
-- bench_get_all_extensions.sql
--
-- Benchmarking wrapper for get_all_extensions().
-- Captures /proc/self/io (Linux kernel I/O accounting for the postgres backend)
-- before and after the call to measure actual filesystem read bytes,
-- independent of the OS page cache state.
--
-- Returns one row with:
-- n_rows - number of (datname, extname) pairs returned
-- wall_ms - wall-clock time of the call in milliseconds
-- rchar - bytes read via read()/pread() syscalls (incl. page cache hits)
-- wchar - bytes written
-- read_bytes - bytes actually fetched from storage (page cache misses only)
-- write_bytes - bytes actually written to storage
--
-- rchar vs read_bytes:
-- rchar counts all read() calls regardless of page cache.
-- read_bytes counts only physical I/O (page cache misses).
-- For a cold run: rchar ≈ read_bytes ≈ n_databases * 8192.
-- For a warm run: rchar stays high (syscalls still happen), read_bytes → 0.
CREATE OR REPLACE FUNCTION public.bench_get_all_extensions()
RETURNS TABLE (
n_rows bigint,
wall_ms numeric,
rchar bigint,
wchar bigint,
read_bytes bigint,
write_bytes bigint
)
LANGUAGE plpgsql
SECURITY DEFINER
AS $$
DECLARE
io_before text;
io_after text;
ts_before timestamptz;
ts_after timestamptz;
cnt bigint;
-- Parse a single field from /proc/self/io text
-- Format is "fieldname: value\n" per line
rchar_before bigint; rchar_after bigint;
wchar_before bigint; wchar_after bigint;
rb_before bigint; rb_after bigint;
wb_before bigint; wb_after bigint;
BEGIN
-- Snapshot /proc/self/io before
io_before := pg_read_file('/proc/self/io');
ts_before := clock_timestamp();
-- Run the function; count rows to prevent result from being optimised away
SELECT count(*) INTO cnt FROM public.get_all_extensions();
ts_after := clock_timestamp();
io_after := pg_read_file('/proc/self/io');
-- Helper: extract numeric value for a given key from /proc/self/io text
-- /proc/self/io format: "rchar: 12345\nwchar: 67890\n..."
rchar_before := (regexp_match(io_before, 'rchar: (\d+)'))[1]::bigint;
rchar_after := (regexp_match(io_after, 'rchar: (\d+)'))[1]::bigint;
wchar_before := (regexp_match(io_before, 'wchar: (\d+)'))[1]::bigint;
wchar_after := (regexp_match(io_after, 'wchar: (\d+)'))[1]::bigint;
rb_before := (regexp_match(io_before, 'read_bytes: (\d+)'))[1]::bigint;
rb_after := (regexp_match(io_after, 'read_bytes: (\d+)'))[1]::bigint;
wb_before := (regexp_match(io_before, 'write_bytes: (\d+)'))[1]::bigint;
wb_after := (regexp_match(io_after, 'write_bytes: (\d+)'))[1]::bigint;
n_rows := cnt;
wall_ms := ROUND(EXTRACT(EPOCH FROM (ts_after - ts_before)) * 1000, 2);
rchar := rchar_after - rchar_before;
wchar := wchar_after - wchar_before;
read_bytes := rb_after - rb_before;
write_bytes := wb_after - wb_before;
RETURN NEXT;
END;
$$;
COMMENT ON FUNCTION public.bench_get_all_extensions() IS
'Benchmarking wrapper: calls get_all_extensions() and returns row count, wall time,
and /proc/self/io deltas (rchar, wchar, read_bytes, write_bytes). Linux only.';
# Dockerfile: PostgreSQL 17 base image plus tools used by the benchmark scripts
FROM postgres:17
# Install sysstat (pidstat, iostat) and procps (free, top) for benchmarking
RUN apt-get update && apt-get install -y --no-install-recommends \
sysstat \
procps \
time \
bc \
&& rm -rf /var/lib/apt/lists/*
# Custom postgresql.conf overrides for benchmarking
COPY postgresql-bench.conf /etc/postgresql/postgresql-bench.conf
ENV POSTGRES_DB=postgres
ENV POSTGRES_HOST_AUTH_METHOD=trust
-- get_all_extensions.sql
--
-- A PL/pgSQL function that reports installed extensions across ALL databases
-- from a single connection, by parsing the raw heap file for pg_extension
-- (catalog OID 3079) in each database's data directory.
--
-- Why raw heap parsing instead of dblink / auto-discover-databases?
-- Opening a connection per database is expensive at scale (hundreds of DBs).
-- pg_extension's catalog OID is fixed at 3079, so its heap file is always at
-- base/<db_oid>/3079 relative to the data directory. We read those files
-- directly with pg_read_binary_file() and parse the page format in PL/pgSQL.
--
-- Requirements:
-- - PostgreSQL 12+ (pg_extension OID is a regular user column in PG12+)
-- - Superuser or pg_read_server_files role for pg_read_binary_file()
-- - Run this in the database the exporter connects to (e.g., "postgres")
--
-- Caveats:
-- - Assumes relfilenode == 3079 for pg_extension in every database.
-- This holds unless someone ran VACUUM FULL on pg_extension (extremely
-- unlikely for a catalog that barely changes). The connected database's
-- relfilenode is verified at runtime; other databases are assumed.
-- - MVCC visibility is approximated: we check HEAP_XMIN_COMMITTED (0x0100)
-- and HEAP_XMAX_INVALID (0x0800) in t_infomask. Good enough for a catalog
-- with near-zero churn.
-- - Reads the on-disk file directly via pg_read_binary_file(), bypassing the
-- shared buffer pool. Very recently installed/removed extensions (committed
-- after the last checkpoint) may not be visible until the next checkpoint
-- flushes those pages. This is acceptable for monitoring — pg_extension
-- barely changes and background checkpointing runs every ~5 minutes.
-- - template0 is excluded because it is not connectable (datallowconn is
-- false), so its extensions are not relevant here.
CREATE OR REPLACE FUNCTION public.get_all_extensions()
RETURNS TABLE (datname name, extname name)
LANGUAGE plpgsql
SECURITY DEFINER
AS $$
DECLARE
-- Page and struct constants
PAGE_SIZE CONSTANT int := 8192;
PAGE_HDR_SIZE CONSTANT int := 24; -- bytes before line pointer array
LP_SIZE CONSTANT int := 4; -- bytes per line pointer
TUPLE_HDR_SIZE CONSTANT int := 24; -- minimum HeapTupleHeaderData size
NAMEDATALEN CONSTANT int := 64;
-- infomask flags
HEAP_XMIN_COMMITTED CONSTANT int := 256; -- 0x0100
HEAP_XMAX_INVALID CONSTANT int := 2048; -- 0x0800
-- pg_extension user-data column offsets (PG12+, no system OID in header)
-- User data starts at lp_off + t_hoff
-- [0..3] oid (int4)
-- [4..67] extname (Name, NAMEDATALEN=64 bytes, null-padded)
EXTNAME_OFF CONSTANT int := 4;
r RECORD;
raw bytea;
file_path text;
n_pages int;
page_off int; -- byte offset of current page in file
pd_lower int;
n_lp int;
lp_idx int;
lp_raw int; -- raw uint32 of line pointer
lp_off int; -- tuple offset within page
lp_flags int;
t_infomask int;
t_hoff int;
user_data bytea;
ext_raw bytea;
ext_text text;
-- Relfilenode check for the connected database
local_relfilenode oid;
BEGIN
-- Warn if the local relfilenode for pg_extension has drifted from OID 3079.
-- (VACUUM FULL would change it; exceedingly rare for a system catalog.)
SELECT relfilenode INTO local_relfilenode
FROM pg_class
WHERE oid = 3079; -- pg_extension OID
IF local_relfilenode <> 3079 THEN
RAISE WARNING
'pg_extension relfilenode in connected DB is %, not 3079. '
'VACUUM FULL may have been run on pg_extension. '
'Cross-database results may be inaccurate for databases '
'where the same drift occurred.',
local_relfilenode;
END IF;
-- Iterate over all connectable databases, excluding template0.
FOR r IN
SELECT d.oid AS db_oid, d.datname AS db_name
FROM pg_database d
WHERE d.datallowconn
AND d.datname <> 'template0'
ORDER BY d.datname
LOOP
file_path := 'base/' || r.db_oid::text || '/3079';
-- Read the entire heap file. pg_read_binary_file returns bytea.
BEGIN
raw := pg_read_binary_file(file_path);
EXCEPTION WHEN OTHERS THEN
-- File might not exist (freshly created DB from template0 that
-- hasn't been written yet) or permission denied. Skip silently.
RAISE DEBUG 'get_all_extensions: cannot read % for database %: %',
file_path, r.db_name, SQLERRM;
CONTINUE;
END;
-- Handle empty file (shouldn't happen, but be defensive).
IF raw IS NULL OR octet_length(raw) = 0 THEN
CONTINUE;
END IF;
n_pages := octet_length(raw) / PAGE_SIZE;
FOR page_idx IN 0 .. n_pages - 1 LOOP
page_off := page_idx * PAGE_SIZE;
-- pd_lower: bytes 12-13 of page header (little-endian uint16).
-- pd_lower points to the end of the line pointer array.
-- get_byte() uses 0-based byte offsets:
-- get_byte(raw, page_off + 12) = low byte
-- get_byte(raw, page_off + 13) = high byte
pd_lower := get_byte(raw, page_off + 12)
+ get_byte(raw, page_off + 13) * 256;
-- Number of line pointers = (pd_lower - PAGE_HDR_SIZE) / LP_SIZE
-- pd_lower must be > PAGE_HDR_SIZE for any tuples to exist.
IF pd_lower <= PAGE_HDR_SIZE THEN
CONTINUE;
END IF;
n_lp := (pd_lower - PAGE_HDR_SIZE) / LP_SIZE;
FOR lp_idx IN 0 .. n_lp - 1 LOOP
-- Line pointer starts at page_off + PAGE_HDR_SIZE + lp_idx*LP_SIZE.
-- Read 4 bytes as a little-endian uint32.
lp_raw := get_byte(raw, page_off + PAGE_HDR_SIZE + lp_idx * LP_SIZE)
+ (get_byte(raw, page_off + PAGE_HDR_SIZE + lp_idx * LP_SIZE + 1) << 8)
+ (get_byte(raw, page_off + PAGE_HDR_SIZE + lp_idx * LP_SIZE + 2) << 16)
+ (get_byte(raw, page_off + PAGE_HDR_SIZE + lp_idx * LP_SIZE + 3) << 24);
-- lp_off = bits 0-14 (right-shift 0, mask 0x7FFF -> 15 bits)
-- lp_flags = bits 15-16 (right-shift 15, mask 0x3)
lp_off := lp_raw & 32767; -- 0x7FFF
lp_flags := (lp_raw >> 15) & 3;
-- Skip if not LP_NORMAL (1).
IF lp_flags <> 1 OR lp_off = 0 THEN
CONTINUE;
END IF;
-- lp_off is relative to the start of the page.
lp_off := page_off + lp_off;
-- HeapTupleHeaderData byte layout (offsets from tuple start):
-- bytes 0-3: t_xmin
-- bytes 4-7: t_xmax
-- bytes 8-11: t_field3 (cid/xvac)
-- bytes 12-15: t_ctid.ip_blkid (BlockIdData: bi_hi + bi_lo, each uint16)
-- bytes 16-17: t_ctid.ip_posid (OffsetNumber, uint16)
-- bytes 18-19: t_infomask2
-- bytes 20-21: t_infomask <-- visibility flags
-- byte 22: t_hoff <-- offset to user data (8-byte aligned, min 24)
-- byte 23: start of null bitmap (if HEAP_HASNULL set in t_infomask)
t_infomask := get_byte(raw, lp_off + 20)
+ get_byte(raw, lp_off + 21) * 256;
t_hoff := get_byte(raw, lp_off + 22);
-- Basic visibility check:
-- HEAP_XMIN_COMMITTED must be set (tuple was inserted by a
-- committed xact) AND HEAP_XMAX_INVALID must be set (tuple
-- has not been deleted).
IF (t_infomask & HEAP_XMIN_COMMITTED) = 0 THEN
CONTINUE;
END IF;
IF (t_infomask & HEAP_XMAX_INVALID) = 0 THEN
CONTINUE;
END IF;
-- Sanity: t_hoff must be at least TUPLE_HDR_SIZE (24) and
-- a multiple of 8.
IF t_hoff < TUPLE_HDR_SIZE OR t_hoff % 8 <> 0 THEN
CONTINUE;
END IF;
-- User data starts at lp_off + t_hoff.
-- pg_extension user columns (PG12+):
-- [0..3] oid
-- [4..67] extname (Name, 64 bytes, null-padded)
user_data := substr(raw, lp_off + t_hoff + 1); -- substr is 1-based
IF octet_length(user_data) < EXTNAME_OFF + NAMEDATALEN THEN
CONTINUE;
END IF;
-- Extract extname: 64 bytes starting at offset EXTNAME_OFF,
-- strip trailing null bytes, interpret as UTF-8.
ext_raw := substr(user_data, EXTNAME_OFF + 1, NAMEDATALEN);
-- Find the first null byte and truncate there.
ext_text := convert_from(
substr(ext_raw, 1,
CASE WHEN position(E'\\x00'::bytea IN ext_raw) > 0
THEN position(E'\\x00'::bytea IN ext_raw) - 1
ELSE NAMEDATALEN
END
),
'UTF8'
);
IF ext_text IS NOT NULL AND ext_text <> '' THEN
datname := r.db_name;
extname := ext_text::name;
RETURN NEXT;
END IF;
END LOOP; -- lp_idx
END LOOP; -- page_idx
END LOOP; -- databases
RETURN;
END;
$$;
-- Grant execute to the exporter role (adjust role name as needed).
-- GRANT EXECUTE ON FUNCTION public.get_all_extensions() TO postgres_exporter;
COMMENT ON FUNCTION public.get_all_extensions() IS
'Returns (datname, extname) for every installed extension across all connectable
databases by parsing the raw heap file for pg_extension (catalog OID 3079) via
pg_read_binary_file(). Requires superuser or pg_read_server_files. PostgreSQL 12+.
Assumes relfilenode == 3079 for pg_extension in each database (holds unless
VACUUM FULL was run on the catalog, which is exceedingly rare).';
# postgresql-bench.conf
# Tuned for benchmark: many databases, raw file reads, controlled I/O
# Allow enough connections for parallel DB creation
max_connections = 300
# Lock table must be large enough for 10000+ databases
# Lock table size = max_connections * max_locks_per_transaction * ~250 bytes
# For 10000 databases we need at least 10000 entries during CREATE DATABASE
max_locks_per_transaction = 128
# Shared buffers: 256MB — enough to cache pg_extension heap files for smaller
# runs, but intentionally not huge so disk reads are visible at larger scales
shared_buffers = 256MB
# Disable autovacuum so it doesn't interfere with benchmark I/O
autovacuum = off
# Track I/O statistics (pg_stat_io available in PG16+)
track_io_timing = on
# Checkpoint tuning: setup_databases.sh issues CHECKPOINT explicitly, so set
# a long checkpoint_timeout to avoid background checkpoints during benchmarks
checkpoint_timeout = 1h
checkpoint_completion_target = 0.9
# WAL level: minimal to reduce WAL overhead during bulk DB creation
wal_level = minimal
max_wal_senders = 0
# Logging: log slow queries for debugging
log_min_duration_statement = 5000
#!/usr/bin/env bash
# run_all.sh
#
# Full benchmark driver:
# 1. Builds and starts a fresh PostgreSQL 17 container
# 2. Installs get_all_extensions()
# 3. For each of N=100, 1000, 10000:
# a. Creates N databases (with hint bits set and CHECKPOINT done)
# b. Runs bench.sh
# 4. Tears down container
#
# Usage:
# ./run_all.sh
# ./run_all.sh --keep # don't remove container at end
#
# Output goes to stdout and benchmark/results_<timestamp>.txt
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WORKTREE="$(dirname "$SCRIPT_DIR")"
DOCKER=/usr/local/bin/docker
CONTAINER="pg_bench"
PG_PORT=5433
KEEP=0
[[ "${1:-}" == "--keep" ]] && KEEP=1
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
RESULTS="$SCRIPT_DIR/results_${TIMESTAMP}.txt"
exec > >(tee "$RESULTS") 2>&1
echo "================================================================"
echo " get_all_extensions() benchmark suite $(date)"
echo " Results: $RESULTS"
echo "================================================================"
echo ""
# ── 1. Build image ────────────────────────────────────────────────────────────
echo "==> Building benchmark Docker image..."
$DOCKER build -t pg_bench_image "$SCRIPT_DIR" --quiet
# ── 2. Start container ───────────────────────────────────────────────────────
echo "==> Starting PostgreSQL 17 container on port $PG_PORT..."
$DOCKER rm -f "$CONTAINER" 2>/dev/null || true
$DOCKER run -d \
--name "$CONTAINER" \
-p "${PG_PORT}:5432" \
-e POSTGRES_HOST_AUTH_METHOD=trust \
-e POSTGRES_USER=postgres \
-e POSTGRES_DB=postgres \
pg_bench_image \
postgres \
-c max_connections=300 \
-c max_locks_per_transaction=128 \
-c shared_buffers=256MB \
-c autovacuum=off \
-c track_io_timing=on \
-c checkpoint_timeout=1h \
-c wal_level=minimal \
-c max_wal_senders=0
echo -n "==> Waiting for PostgreSQL to be ready..."
for i in $(seq 1 30); do
psql -h 127.0.0.1 -p $PG_PORT -U postgres postgres -c "SELECT 1" > /dev/null 2>&1 && break
sleep 1
echo -n "."
done
echo " ready."
# ── 3. Install functions ──────────────────────────────────────────────────────
echo "==> Installing get_all_extensions() and bench_get_all_extensions()..."
psql -h 127.0.0.1 -p $PG_PORT -U postgres postgres \
-f "$WORKTREE/sql/get_all_extensions.sql" 2>/dev/null
psql -h 127.0.0.1 -p $PG_PORT -U postgres postgres \
-f "$WORKTREE/sql/bench_get_all_extensions.sql" 2>/dev/null
echo " Functions installed."
# ── 4. Show container config ──────────────────────────────────────────────────
echo ""
echo "==> PostgreSQL version and key settings:"
psql -h 127.0.0.1 -p $PG_PORT -U postgres postgres -c "SELECT version();" 2>/dev/null | grep PostgreSQL
psql -h 127.0.0.1 -p $PG_PORT -U postgres postgres -t -A -c "
SELECT name || ' = ' || setting || ' ' || unit
FROM pg_settings
WHERE name IN ('max_connections','max_locks_per_transaction','shared_buffers','track_io_timing')
ORDER BY name;" 2>/dev/null
echo ""
echo "==> Container resources:"
$DOCKER stats --no-stream --format \
'CPUs: {{.CPUPerc}} Mem: {{.MemUsage}} NetIO: {{.NetIO}} BlockIO: {{.BlockIO}}' \
"$CONTAINER" 2>/dev/null
# ── 5. Benchmark loop ─────────────────────────────────────────────────────────
for N in 100 1000 10000; do
echo ""
echo "################################################################"
echo "# SCALE: $N databases"
echo "################################################################"
# Pick parallelism: 40 for small, 60 for large
PARALLEL=40
[ "$N" -ge 1000 ] && PARALLEL=60
# Create databases
echo ""
echo "==> Setting up $N databases..."
SETUP_T0=$(python3 -c "import time; print(int(time.time()*1000))")
"$SCRIPT_DIR/setup_databases.sh" "$N" "$PARALLEL"
SETUP_T1=$(python3 -c "import time; print(int(time.time()*1000))")
echo " Total setup time: $((SETUP_T1 - SETUP_T0))ms"
# Confirm database count
DB_COUNT=$(psql -h 127.0.0.1 -p $PG_PORT -U postgres postgres -t -A \
-c "SELECT count(*) FROM pg_database WHERE datname NOT IN ('template0','template1','postgres') AND NOT datistemplate;" 2>/dev/null)
echo " Total non-system databases: $DB_COUNT"
# Run benchmark
echo ""
RUNS=7
[ "$N" -ge 10000 ] && RUNS=5
"$SCRIPT_DIR/bench.sh" "$N" "$RUNS"
echo ""
echo "==> Disk usage of base/ directory:"
$DOCKER exec "$CONTAINER" du -sh /var/lib/postgresql/data/base/ 2>/dev/null
echo ""
echo "==> pg_extension heap files disk total (base/**/3079):"
$DOCKER exec "$CONTAINER" bash -c \
"find /var/lib/postgresql/data/base -name '3079' 2>/dev/null | \
xargs du -c 2>/dev/null | tail -1" || echo " (unavailable)"
done
# ── 6. Teardown ───────────────────────────────────────────────────────────────
if [ "$KEEP" -eq 0 ]; then
echo ""
echo "==> Removing container..."
$DOCKER rm -f "$CONTAINER" > /dev/null
echo " Done."
else
echo ""
echo "==> Container '$CONTAINER' left running on port $PG_PORT (--keep specified)."
fi
echo ""
echo "================================================================"
echo " Benchmark complete. Results saved to: $RESULTS"
echo "================================================================"
#!/usr/bin/env bash
# setup_databases.sh <n_databases> [parallel_workers]
#
# Creates n_databases bench_NNNNNN databases as fast as possible using
# parallel psql connections. A random ~10% of them get a second extension
# (fuzzystrmatch) to make the data more realistic.
#
# Usage:
# ./setup_databases.sh 100
# ./setup_databases.sh 1000 40
# ./setup_databases.sh 10000 50
set -euo pipefail
N=${1:-100}
PARALLEL=${2:-40}
PSQL="psql -h 127.0.0.1 -p 5433 -U postgres postgres"
echo "[setup] Creating $N databases with $PARALLEL parallel workers..."
# Drop existing bench_ databases (ignore errors)
echo "[setup] Dropping existing bench_ databases..."
$PSQL -t -A -c "
SELECT 'DROP DATABASE IF EXISTS ' || quote_ident(datname) || ';'
FROM pg_database
WHERE datname LIKE 'bench_%'
" | $PSQL 2>/dev/null || true
# Generate list of database names
seq -f "bench_%06g" 1 $N > /tmp/bench_db_names.txt
echo "[setup] Creating databases..."
T0=$(date +%s)
# Fan out CREATE DATABASE via xargs parallel
cat /tmp/bench_db_names.txt | \
xargs -P "$PARALLEL" -I{} \
psql -h 127.0.0.1 -p 5433 -U postgres postgres \
-c "CREATE DATABASE \"{}\";" 2>/dev/null
T1=$(date +%s)
echo "[setup] Created $N databases in $((T1 - T0))s"
# Install a second extension in ~10% of databases to vary the data
SAMPLE=$(( N / 10 ))
if [ "$SAMPLE" -gt 0 ]; then
echo "[setup] Installing fuzzystrmatch in $SAMPLE databases..."
shuf -n "$SAMPLE" /tmp/bench_db_names.txt | \
xargs -P "$PARALLEL" -I{} \
psql -h 127.0.0.1 -p 5433 -U postgres \
-d "{}" \
-c "CREATE EXTENSION IF NOT EXISTS fuzzystrmatch;" 2>/dev/null
fi
echo "[setup] Touching pg_extension in all databases to set hint bits..."
cat /tmp/bench_db_names.txt | \
xargs -P "$PARALLEL" -I{} \
psql -h 127.0.0.1 -p 5433 -U postgres \
-d "{}" \
-c "SELECT count(*) FROM pg_extension;" 2>/dev/null > /dev/null
echo "[setup] Running CHECKPOINT to flush all dirty pages to disk..."
psql -h 127.0.0.1 -p 5433 -U postgres postgres -c "CHECKPOINT;" 2>/dev/null
echo "[setup] Done. Verifying..."
psql -h 127.0.0.1 -p 5433 -U postgres postgres -t -A \
-c "SELECT count(*) FROM pg_database WHERE datname LIKE 'bench_%'"
echo "databases created."