Skip to content

Instantly share code, notes, and snippets.

@jmcarp
Created May 9, 2026 04:05
Show Gist options
  • Select an option

  • Save jmcarp/eeef6f8b3852fbb7ba8c32d0e22f0a61 to your computer and use it in GitHub Desktop.

Select an option

Save jmcarp/eeef6f8b3852fbb7ba8c32d0e22f0a61 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
#
# Empirical test: which settings actually make ALTER TABLE ... MODIFY TTL
# return without waiting for the materialization mutation, on the v23.8 fork
# of ClickHouse used by oximeter.
#
# Predictions from reading work/src/src/Storages/StorageMergeTree.cpp:
# - defaults: ALTER blocks until mutation finishes
# - SETTINGS mutations_sync = 0: identical to defaults (no-op)
# - SETTINGS alter_sync = 0: identical to defaults (no-op in v23.8)
# - SETTINGS materialize_ttl_after_modify = 0:
# ALTER returns instantly; no mutation;
# old rows survive past the new horizon
#
# Each variant runs against a freshly-populated table so we don't get the
# "second run is fast because parts already materialized" artifact.
set -euo pipefail
CH_BIN=${CH_BIN:-/root/code/garbage-compactor/clickhouse/work/src/build/programs/clickhouse}
WORK_DIR=${WORK_DIR:-/var/tmp/ch-ttl-test}
PORT_TCP=${PORT_TCP:-9300}
PORT_HTTP=${PORT_HTTP:-8323}
PORT_INTER=${PORT_INTER:-9309}
ROWS=${ROWS:-100000000}
SPAN_DAYS=${SPAN_DAYS:-60}
OLD_TTL_DAYS=${OLD_TTL_DAYS:-30}
NEW_TTL_DAYS=${NEW_TTL_DAYS:-7}
ch_q() {
"$CH_BIN" client --port "$PORT_TCP" -q "$1"
}
ch_q_silent() {
"$CH_BIN" client --port "$PORT_TCP" -q "$1" >/dev/null
}
start_server() {
rm -rf "$WORK_DIR"
mkdir -p "$WORK_DIR/data" "$WORK_DIR/logs" "$WORK_DIR/tmp" \
"$WORK_DIR/user_files" "$WORK_DIR/format_schemas"
cat > "$WORK_DIR/config.xml" <<EOF
<clickhouse>
<logger>
<level>information</level>
<log>$WORK_DIR/logs/server.log</log>
<errorlog>$WORK_DIR/logs/server.err.log</errorlog>
<size>100M</size>
<count>3</count>
</logger>
<tcp_port>$PORT_TCP</tcp_port>
<http_port>$PORT_HTTP</http_port>
<interserver_http_port>$PORT_INTER</interserver_http_port>
<listen_host>127.0.0.1</listen_host>
<path>$WORK_DIR/data/</path>
<tmp_path>$WORK_DIR/tmp/</tmp_path>
<user_files_path>$WORK_DIR/user_files/</user_files_path>
<format_schema_path>$WORK_DIR/format_schemas/</format_schema_path>
<default_profile>default</default_profile>
<default_database>default</default_database>
<timezone>UTC</timezone>
<mark_cache_size>5368709120</mark_cache_size>
<max_concurrent_queries>100</max_concurrent_queries>
<max_connections>100</max_connections>
<users>
<default>
<password></password>
<networks><ip>::/0</ip></networks>
<profile>default</profile>
<quota>default</quota>
<access_management>1</access_management>
</default>
</users>
<profiles>
<default/>
</profiles>
<quotas>
<default/>
</quotas>
</clickhouse>
EOF
"$CH_BIN" server --config-file="$WORK_DIR/config.xml" \
--pid-file="$WORK_DIR/server.pid" \
--daemon
for _ in $(seq 1 30); do
if ch_q "SELECT 1" >/dev/null 2>&1; then
echo "server up on port $PORT_TCP (pid $(cat "$WORK_DIR/server.pid"))"
return 0
fi
sleep 1
done
echo "server failed to start; see $WORK_DIR/logs/" >&2
return 1
}
stop_server() {
if [[ -f "$WORK_DIR/server.pid" ]]; then
local pid
pid=$(cat "$WORK_DIR/server.pid")
kill "$pid" 2>/dev/null || true
for _ in $(seq 1 15); do
kill -0 "$pid" 2>/dev/null || break
sleep 1
done
kill -9 "$pid" 2>/dev/null || true
fi
}
populate_table() {
local table=$1
ch_q_silent "DROP TABLE IF EXISTS $table"
ch_q_silent "
CREATE TABLE $table (
ts DateTime,
id UInt64,
value Float64,
payload String
)
ENGINE = MergeTree
PARTITION BY toYYYYMMDD(ts)
ORDER BY (id, ts)
TTL ts + INTERVAL $OLD_TTL_DAYS DAY
"
ch_q_silent "
INSERT INTO $table
SELECT
now() - toIntervalSecond(rand() % ($SPAN_DAYS * 86400)) AS ts,
number AS id,
rand() / 0xFFFFFFFF AS value,
toString(number) AS payload
FROM numbers_mt($ROWS)
"
ch_q_silent "OPTIMIZE TABLE $table FINAL"
}
run_variant() {
local name=$1
local settings=$2
local table="t_${name}"
echo
echo "============================================================"
echo "variant: $name"
echo " settings clause: ${settings:-<none>}"
echo "============================================================"
echo "[$(date +%T)] populating $table with $ROWS rows..."
populate_table "$table"
local rows_before parts_before
rows_before=$(ch_q "SELECT count() FROM $table")
parts_before=$(ch_q "SELECT count() FROM system.parts WHERE table = '$table' AND active")
echo " rows before: $rows_before parts: $parts_before"
local alter_sql="ALTER TABLE $table MODIFY TTL ts + INTERVAL $NEW_TTL_DAYS DAY ${settings}"
echo "[$(date +%T)] running ALTER..."
# --time makes clickhouse client print server-side query elapsed seconds
# (e.g. "12.345") to stderr at the end of the query. Capture stderr to
# a file so we can extract the timing line without losing real errors.
local stderr_file
stderr_file=$(mktemp -t ch-ttl-test.XXXXXX)
local t0_ns t1_ns
t0_ns=$(date +%s%N)
if ! "$CH_BIN" client --port "$PORT_TCP" --time -q "$alter_sql" 2>"$stderr_file"; then
echo "ALTER failed; clickhouse client stderr:" >&2
cat "$stderr_file" >&2
rm -f "$stderr_file"
return 1
fi
t1_ns=$(date +%s%N)
local wall_ms=$(( (t1_ns - t0_ns) / 1000000 ))
local server_seconds
server_seconds=$(grep -E '^[0-9]+(\.[0-9]+)?$' "$stderr_file" | tail -1)
rm -f "$stderr_file"
echo "[$(date +%T)] ALTER server-side: ${server_seconds:-?}s wall-clock: ${wall_ms}ms"
# Mutation state immediately after ALTER returns. If the wait was honored,
# is_done = 1 here. If the ALTER bypassed the wait, is_done is likely 0.
# If no mutation was created at all, the row count is 0.
echo " system.mutations immediately after ALTER:"
ch_q "
SELECT mutation_id, is_done, parts_to_do,
formatDateTime(create_time, '%T') AS created,
formatDateTime(latest_fail_time, '%T') AS last_fail
FROM system.mutations
WHERE table = '$table'
ORDER BY create_time DESC
LIMIT 3
FORMAT PrettyCompactMonoBlock
"
# Wait for any in-flight mutation to settle so post-state checks are valid.
echo " waiting for any in-flight mutation to finish..."
for _ in $(seq 1 600); do
local pending
pending=$(ch_q "SELECT count() FROM system.mutations WHERE table = '$table' AND is_done = 0")
[[ "$pending" == "0" ]] && break
sleep 1
done
local rows_after old_rows
rows_after=$(ch_q "SELECT count() FROM $table")
old_rows=$(ch_q "SELECT count() FROM $table WHERE ts < now() - toIntervalDay($NEW_TTL_DAYS)")
echo " rows after mutation settled: $rows_after"
echo " rows older than ${NEW_TTL_DAYS}d (0 means new TTL applied to old parts): $old_rows"
}
trap stop_server EXIT
echo "starting clickhouse $($CH_BIN client --version 2>/dev/null || true)"
start_server
# Each variant runs on its own fresh table.
run_variant "defaults" ""
run_variant "mutations_sync_0" "SETTINGS mutations_sync = 0"
run_variant "alter_sync_0" "SETTINGS alter_sync = 0"
run_variant "materialize_off" "SETTINGS materialize_ttl_after_modify = 0"
echo
echo "============================================================"
echo "done."
echo " data/logs preserved at $WORK_DIR (server stopped on exit)"
echo "============================================================"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment