Created
June 5, 2026 17:43
-
-
Save pdp7/ede9ca442ea2bc1d4033a9899414542c to your computer and use it in GitHub Desktop.
/home/pdp7/bin/update-semcode.sh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| # Refresh a semcode database: source code (current HEAD or branches), | |
| # lore archives + new shards, then regenerate vector embeddings. | |
| # | |
| # Usage: update-semcode.sh [options] | |
| # | |
| # Options: | |
| # -s, --source <dir> source tree to index (default: cwd) | |
| # -d, --database <dir> db dir or its parent (default: same as --source) | |
| # --branches <list> comma-separated branches to update (uses --update-branches) | |
| # --all-branches index every local branch that has moved | |
| # --no-code skip the source-tree indexing step | |
| # --no-lore skip the lore refresh step | |
| # --no-new-shards skip auto-adding newly-published lore shards | |
| # --no-vectors skip vector regeneration | |
| # -h, --help show this help | |
| # | |
| # Without --branches/--all-branches, the source step indexes the current HEAD only. | |
| # Env: SEMCODE_INDEX, LORE_LOG_DIR, SEMCODE_LORE_ARCHIVES (comma-separated) | |
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Path to the semcode-index binary. The default lives under the invoking user's
# home directory, so the script is usable from any account; the original
# /home/pdp7 location is kept as the fallback when HOME is unset. Override with
# SEMCODE_INDEX.
SEMCODE_INDEX="${SEMCODE_INDEX:-${HOME:-/home/pdp7}/dev/semcode/target/release/semcode-index}"

# Archives the resctrl/CBQRI workflow expects to be present. On a fresh DB any
# missing ones are cloned before the refresh phase. Override with
# SEMCODE_LORE_ARCHIVES=a,b,c.
DEFAULT_LORE_ARCHIVES="lkml,linux-riscv,linux-acpi,linux-arm-kernel,linux-devicetree,linux-doc,acpica-devel,qemu-devel"
LORE_ARCHIVES="${SEMCODE_LORE_ARCHIVES:-$DEFAULT_LORE_ARCHIVES}"

# Option state, overwritten by the command-line parser. Empty directory values
# are resolved to their defaults once parsing is done.
SOURCE_DIR=""
DB_DIR=""
BRANCHES=""
ALL_BRANCHES=0

# Phase switches: 1 = run the phase, 0 = skip it (set by the --no-* options).
DO_CODE=1
DO_LORE=1
DO_NEW_SHARDS=1
DO_VECTORS=1
# Print the usage text: the comment block that directly follows the shebang of
# this script, with the leading "# " removed. Reading up to the first
# non-comment line (instead of a fixed line range) keeps the help complete when
# the header grows or shrinks.
show_help() {
  awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "$0"
}

# Parse the command line into the global option variables.
# Arguments: the script's arguments ("$@").
# Exits 0 after printing help; exits 2 on an unknown argument or when an
# option that takes a value is given without one.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      -s|--source|-d|--database|--branches)
        # Without this check a missing value surfaces as a bare
        # "unbound variable" error from `set -u`.
        if [ $# -lt 2 ]; then
          echo "option $1 requires an argument" >&2
          exit 2
        fi
        case "$1" in
          -s|--source) SOURCE_DIR="$2" ;;
          -d|--database) DB_DIR="$2" ;;
          --branches) BRANCHES="$2" ;;
        esac
        shift 2
        ;;
      --all-branches) ALL_BRANCHES=1; shift ;;
      --no-code) DO_CODE=0; shift ;;
      --no-lore) DO_LORE=0; shift ;;
      --no-new-shards) DO_NEW_SHARDS=0; shift ;;
      --no-vectors) DO_VECTORS=0; shift ;;
      -h|--help)
        show_help
        exit 0
        ;;
      *)
        echo "unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
# Fail early when the indexer binary is missing or not executable.
if [ ! -x "$SEMCODE_INDEX" ]; then
  echo "semcode-index not executable: $SEMCODE_INDEX" >&2
  exit 1
fi

# The source tree defaults to the current directory; the database location
# defaults to the source tree.
SOURCE_DIR="${SOURCE_DIR:-$PWD}"
DB_DIR="${DB_DIR:-$SOURCE_DIR}"

# Every phase tees its output into a log named after this run's start time.
# Create the log directory up front: tee cannot create missing parent
# directories, and under pipefail its failure would abort the run after the
# first phase had already done its work.
LOG_DIR="${LORE_LOG_DIR:-/tmp}"
mkdir -p -- "$LOG_DIR"
TS="$(date +%Y%m%d-%H%M%S)"
CODE_LOG="$LOG_DIR/semcode-code-$TS.log"
REFRESH_LOG="$LOG_DIR/semcode-lore-refresh-$TS.log"
ADD_LOG="$LOG_DIR/semcode-lore-add-$TS.log"
VECTOR_LOG="$LOG_DIR/semcode-vectors-$TS.log"
# Echo a command line prefixed with "+ ", then execute it with its arguments
# intact; the exit status is that of the executed command.
run() {
  printf '+ %s\n' "$*"
  "$@"
}

# Announce the directories this run operates on.
printf '%s\n' "==> source : $SOURCE_DIR" "==> db : $DB_DIR"
# Source phase: index the source tree into the database unless --no-code was
# given. Output is shown on the terminal and copied to $CODE_LOG.
if [[ "$DO_CODE" == 1 ]]; then
  echo "==> Indexing source tree (log: $CODE_LOG)"

  # Build the full command line as one array; branch flags are appended only
  # when requested, and --all-branches takes precedence over --branches.
  index_cmd=("$SEMCODE_INDEX" -s "$SOURCE_DIR" -d "$DB_DIR")
  if [[ "$ALL_BRANCHES" == 1 ]]; then
    index_cmd+=(--all-branches --update-branches)
  elif [[ -n "$BRANCHES" ]]; then
    index_cmd+=(--branches "$BRANCHES" --update-branches)
  fi

  run "${index_cmd[@]}" 2>&1 | tee "$CODE_LOG"
fi
# Print the archives from a comma-separated list that have no entry under
# <db>/.semcode.db/lore or <db>/lore (an entry is any path starting with the
# archive name).
# Arguments: $1 - database directory
#            $2 - comma-separated archive names
# Outputs:   the missing names joined with commas (empty when none are missing)
missing_lore_archives() {
  local db_dir=$1 wanted=$2
  local name missing=""

  # Names are read line by line instead of via an unquoted expansion, so a
  # name containing glob characters is never pathname-expanded by the shell.
  while IFS= read -r name; do
    [ -n "$name" ] || continue
    # The whole pattern is quoted so that only compgen performs the glob.
    if ! compgen -G "$db_dir/.semcode.db/lore/${name}*" >/dev/null &&
       ! compgen -G "$db_dir/lore/${name}*" >/dev/null; then
      missing="${missing:+$missing,}$name"
    fi
  done < <(tr -s ', \t' '\n' <<<"$wanted")

  printf '%s\n' "$missing"
}

# Extract the shards announced in the "New archives available" report of a
# lore refresh log.
# Arguments: $1 - path to the refresh log
# Outputs:   sorted, de-duplicated "archive/N" entries joined with commas
new_lore_shards() {
  # The report looks like:
  #   New archives available on lore.kernel.org:
  #   lkml: archive(s) 19 20
  # It is read until the first blank line (or end of file); every purely
  # numeric field on a report line is taken as a shard number.
  awk '
    /^New archives available on lore\.kernel\.org:/ { in_block=1; next }
    in_block && /^[[:space:]]*$/ { in_block=0 }
    in_block {
      name=$1; sub(":", "", name)
      for (i=1; i<=NF; i++) if ($i ~ /^[0-9]+$/) print name"/"$i
    }
  ' "$1" | sort -u | paste -sd, -
}

# Lore phase (skipped by --no-lore): clone missing archives, refresh all
# archives, then optionally add newly published shards.
if [ "$DO_LORE" = 1 ]; then
  # Clone any expected archives that aren't yet present.
  missing=$(missing_lore_archives "$DB_DIR" "$LORE_ARCHIVES")
  if [ -n "$missing" ]; then
    ENSURE_LOG="$LOG_DIR/semcode-lore-ensure-$TS.log"
    echo "==> Cloning missing archives: $missing (log: $ENSURE_LOG)"
    run "$SEMCODE_INDEX" -d "$DB_DIR" --lore "$missing" 2>&1 | tee "$ENSURE_LOG"
  fi

  echo "==> Refreshing lore archives (log: $REFRESH_LOG)"
  run "$SEMCODE_INDEX" -d "$DB_DIR" --lore 2>&1 | tee "$REFRESH_LOG"

  if [ "$DO_NEW_SHARDS" = 1 ]; then
    # The refresh output was teed into $REFRESH_LOG; its trailing report
    # names the shards that are not indexed yet.
    new_shards=$(new_lore_shards "$REFRESH_LOG")
    if [ -n "$new_shards" ]; then
      echo "==> Adding new shards: $new_shards (log: $ADD_LOG)"
      run "$SEMCODE_INDEX" -d "$DB_DIR" --lore "$new_shards" 2>&1 | tee "$ADD_LOG"
    else
      echo "==> No new shards to add."
    fi
  fi
fi
# Vector phase: regenerate embeddings unless --no-vectors was given. Output is
# shown on the terminal and copied to $VECTOR_LOG.
if [[ "$DO_VECTORS" == 1 ]]; then
  echo "==> Regenerating vector embeddings (log: $VECTOR_LOG)"
  vector_cmd=("$SEMCODE_INDEX" -s "$SOURCE_DIR" -d "$DB_DIR" --vectors)
  run "${vector_cmd[@]}" 2>&1 | tee "$VECTOR_LOG"
fi
# Print " <label>: <path>" for a log file, but only when that file exists.
# Arguments: $1 - label; $2 - log path (may be empty)
# Returns:   0 in every case. This is written with if/fi rather than
#            `[ -f ... ] && echo ...` because the latter yields status 1 for a
#            missing file; as the last command of the script that status would
#            become the exit code of an otherwise successful run (for example
#            with --no-vectors).
report_log() {
  local label=$1 path=$2
  if [ -n "$path" ] && [ -f "$path" ]; then
    echo " $label: $path"
  fi
}

# Final summary: list the logs that this run actually produced.
echo "==> Done."
report_log code "${CODE_LOG:-}"
report_log ensure "${ENSURE_LOG:-}"
report_log lore "${REFRESH_LOG:-}"
report_log shards "${ADD_LOG:-}"
report_log vectors "${VECTOR_LOG:-}"
pdp7
commented
Jun 5, 2026
Author
Author
Prerequisites
Joel needs three things in place first:
1. The semcode-index binary built — e.g. ~/dev/semcode/target/release/semcode-index
2. A copy of update-semcode.sh — yours lives at /home/pdp7/bin/update-semcode.sh; send it to him (it's generic: every path can be set via
flags or environment variables, and the unit file below sets SEMCODE_INDEX explicitly)
3. A source tree + DB dir — e.g. his own ~/dev/linux
Unit files
~/.config/systemd/user/update-semcode.service:
[Unit]
Description=Refresh semcode database (source code, lore archives, vectors)
Wants=network-online.target
After=network-online.target
[Service]
Type=oneshot
Environment=SEMCODE_INDEX=%h/dev/semcode/target/release/semcode-index
WorkingDirectory=%h/dev/linux
ExecStart=%h/bin/update-semcode.sh -s %h/dev/linux -d %h/dev/linux
StandardOutput=append:%h/.cache/semcode-update/run.log
StandardError=append:%h/.cache/semcode-update/run.log
Nice=10
IOSchedulingClass=best-effort
IOSchedulingPriority=7
[Install]
WantedBy=default.target
~/.config/systemd/user/update-semcode.timer:
[Unit]
Description=Nightly semcode database refresh
[Timer]
OnCalendar=*-*-* 03:17:00
RandomizedDelaySec=10m
Persistent=true
Unit=update-semcode.service
[Install]
WantedBy=timers.target
%h expands to Joel's home, so this is portable as long as his layout matches ~/dev/semcode, ~/dev/linux, ~/bin. If his paths differ,
hardcode them instead of %h/....
Enable it
# log dir for the run.log append target
mkdir -p ~/.cache/semcode-update
# pick up the new units
systemctl --user daemon-reload
# enable + start the timer (NOT the service)
systemctl --user enable --now update-semcode.timer
# CRITICAL: let user services run without an active login session,
# this needs sudo (root writes /var/lib/systemd/linger/<user>)
sudo loginctl enable-linger "$USER"
The one gotcha to stress
enable-linger is the part people miss. Without it, a user timer only runs while Joel has an active login session — so an unattended 03:17
nightly job silently never fires. Your account has Linger=yes, which is why yours works headless. Make sure Joel runs the loginctl
enable-linger line.
Verify
systemctl --user list-timers update-semcode.timer # shows NEXT/LAST
systemctl --user status update-semcode.service # last result
loginctl show-user "$USER" -p Linger # expect Linger=yes
# optional: run once now to confirm the whole pipeline works
systemctl --user start update-semcode.service
tail -f ~/.cache/semcode-update/run.log
Author
● Yes — lore indexing is a core part of the job, and yesterday's run (Jun 5 03:19) did it. Here's the breakdown.
The script (update-semcode.sh) runs four phases, three of which touch lore:
1. Code — index the source tree HEAD
2. Lore refresh — semcode-index --lore across all tracked archives
3. New shards — auto-detects and adds newly-published lore shards (e.g. when lkml rolls over to a new archive number)
4. Vectors — regenerates embeddings, including lore email vectors
Tracked archives (8): lkml, linux-riscv, linux-acpi, linux-arm-kernel, linux-devicetree, linux-doc, acpica-devel, qemu-devel — exactly the
set your resctrl/CBQRI workflow needs.
Jun 5 run actually pulled new mail — 158,373 new emails indexed:
┌────────────────────┬────────────┬─────────┐
│ Archive │ New emails │ Total │
├────────────────────┼────────────┼─────────┤
│ lkml/18 │ 44,832 │ 219,045 │
├────────────────────┼────────────┼─────────┤
│ lkml/19 │ 22,029 │ 63,896 │
├────────────────────┼────────────┼─────────┤
│ linux-arm-kernel/3 │ 30,478 │ 164,245 │
├────────────────────┼────────────┼─────────┤
│ linux-riscv/0 │ 17,537 │ 103,038 │
├────────────────────┼────────────┼─────────┤
│ linux-doc/0 │ 16,272 │ 180,831 │
├────────────────────┼────────────┼─────────┤
│ linux-devicetree/3 │ 15,907 │ 28,888 │
├────────────────────┼────────────┼─────────┤
│ linux-acpi/0 │ 8,668 │ 160,809 │
├────────────────────┼────────────┼─────────┤
│ acpica-devel/0 │ 2,291 │ 5,919 │
├────────────────────┼────────────┼─────────┤
│ qemu-devel/3 │ 359 │ 180,098 │
└────────────────────┴────────────┴─────────┘
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment