Skip to content

Instantly share code, notes, and snippets.

@boxabirds
Last active April 28, 2026 13:00
Show Gist options
  • Select an option

  • Save boxabirds/13a54abc3314afca59b0bb87e9216b44 to your computer and use it in GitHub Desktop.

Select an option

Save boxabirds/13a54abc3314afca59b0bb87e9216b44 to your computer and use it in GitHub Desktop.
Run DeepSeek v4 on your (128GB) Mac
#!/usr/bin/env bash
# bootstrap-dsv4.sh — recreate the pi.dev + DeepSeek-V4-Flash + llama.cpp stack
# from a clean machine. Sister script to install.sh (which targets the Qwen3.6
# + mlx-lm path). Idempotent: safe to re-run; only missing pieces are added.
#
# What this stack is, with sources:
#
# pi.dev (the agent)
# npm @mariozechner/pi-coding-agent
# repo https://github.com/badlogic/pi-mono
# tui dep @mariozechner/pi-tui (same repo, packages/tui)
#
# llama.cpp (the inference server)
# fork https://github.com/antirez/llama.cpp-deepseek-v4-flash
# why upstream llama.cpp doesn't yet have DeepSeek-V4-Flash; this
# fork adds the model + a hand-written V4 chat template that
# actually wires up tool-calling.
#
# GGUF weights
# repo https://huggingface.co/antirez/deepseek-v4-gguf
# file DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf
# size ~87 GB
# quant Q2 routed experts, Q4/Q8 attention/output (antirez recipe)
#
# Repo wrappers (this repo, bin/)
# pi-local health-checks server + auto-starts before pi
# mlxlm-start launches the active serve.sh, registers PID
# mlxlm-stop kills server cleanly
# mlxlm-idle-watcher tails server log; kills after N idle seconds
#
# Config (~/.pi/)
# agent/models.json pi's provider config (llamacpp baseUrl + model)
# ailocal.conf runtime selector (AILOCAL_SERVE_SCRIPT, idle secs)
# mlxlm-active-script auto-written marker of which serve.sh is active
#
# Usage:
# ./bootstrap-dsv4.sh # full bootstrap
# ./bootstrap-dsv4.sh --skip-model # skip 87 GB download (assume present)
# ./bootstrap-dsv4.sh --skip-build # skip llama.cpp build (assume built)
# ./bootstrap-dsv4.sh --skip-smoke # skip end-to-end smoke test
#
# NOTE: the -h handler prints lines 2-40 of this file as the help text
# (sed -n '2,40p'); keep the usage block above within that range when
# editing this header.
#
# NOTE(review): -e is not set — the script guards every critical command
# with an explicit `|| fail`, and the smoke test deliberately captures a
# failing pipeline's output to report it. Confirm before adding -e.
set -uo pipefail
# ---------- Constants ----------
# Repo layout: everything llama.cpp-related lives under pocs/07-llamacpp-dsv4.
readonly REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
readonly POC_DIR="$REPO_ROOT/pocs/07-llamacpp-dsv4"
readonly LLAMA_DIR="$POC_DIR/llama.cpp"
readonly LLAMA_BIN="$LLAMA_DIR/build/bin/llama-server"
readonly LLAMA_REPO_URL="https://github.com/antirez/llama.cpp-deepseek-v4-flash.git"
# Chat template shipped by the fork; required for tool-calling (checked later).
readonly TEMPLATE_PATH="$LLAMA_DIR/models/templates/deepseek-ai-DeepSeek-V4.jinja"
# Model weights: downloaded from Hugging Face into ~/models.
readonly MODELS_DIR="$HOME/models/deepseek-v4-gguf"
readonly GGUF_FILE="DeepSeek-V4-Flash-IQ2XXS-w2Q2K-AProjQ8-SExpQ8-OutQ8-chat-v2.gguf"
readonly GGUF_PATH="$MODELS_DIR/$GGUF_FILE"
readonly GGUF_HF_REPO="antirez/deepseek-v4-gguf"
# pi.dev configuration files under ~/.pi.
readonly PI_CONFIG_DIR="$HOME/.pi"
readonly PI_MODELS_JSON="$PI_CONFIG_DIR/agent/models.json"
readonly AILOCAL_CONF="$PI_CONFIG_DIR/ailocal.conf"
readonly LOCAL_BIN="$HOME/.local/bin"
# Hardware floor for the 87 GB mmap'd GGUF.
readonly REQUIRED_MACOS_MAJOR=14
readonly MIN_RAM_GB=120 # 87 GB weights + KV cache + headroom
readonly MIN_FREE_DISK_GB=120 # 87 GB GGUF + 5 GB build artifacts + slack
readonly HEALTH_URL="http://127.0.0.1:8080/v1/models"
# ---------- Args ----------
# Flag parsing: each --skip-* flips one toggle; anything unknown is fatal.
SKIP_MODEL=0
SKIP_BUILD=0
SKIP_SMOKE=0
for opt in "$@"; do
  case "$opt" in
    -h|--help)
      # The header comment block (file lines 2-40) doubles as the help text.
      sed -n '2,40p' "$0"
      exit 0
      ;;
    --skip-build) SKIP_BUILD=1 ;;
    --skip-model) SKIP_MODEL=1 ;;
    --skip-smoke) SKIP_SMOKE=1 ;;
    *)
      echo "unknown arg: $opt" >&2
      exit 2
      ;;
  esac
done
# ---------- Logging ----------
# Default to no color; enable ANSI escapes only when stdout is a terminal.
C_RED=; C_GRN=; C_YLW=; C_BLU=; C_DIM=; C_RST=
if [ -t 1 ]; then
  C_RED=$'\033[31m'
  C_GRN=$'\033[32m'
  C_YLW=$'\033[33m'
  C_BLU=$'\033[34m'
  C_DIM=$'\033[2m'
  C_RST=$'\033[0m'
fi
# step/ok/warn/note write to stdout; fail writes to stderr and aborts (exit 1).
step() {
  printf "%s==>%s %s\n" "$C_BLU" "$C_RST" "$1"
}
ok() {
  printf "%s ✓%s %s\n" "$C_GRN" "$C_RST" "$1"
}
warn() {
  printf "%s ⚠%s %s\n" "$C_YLW" "$C_RST" "$1"
}
fail() {
  printf "%s ✗%s %s\n" "$C_RED" "$C_RST" "$1" >&2
  exit 1
}
note() {
  printf " %s%s%s\n" "$C_DIM" "$1" "$C_RST"
}
# ---------- 1. Prereqs ----------
# Hard requirements: Apple Silicon, macOS >= 14, 120 GB RAM, 120 GB free disk,
# Xcode CLT, cmake, git, node >= 20, npm, and the Hugging Face CLI.
step "Checking prerequisites"
if [ "$(uname -s)" != "Darwin" ] || [ "$(uname -m)" != "arm64" ]; then
  fail "Apple Silicon Mac required (got $(uname -s)/$(uname -m))"
fi
ok "Apple Silicon ($(uname -m))"
macos_major=$(sw_vers -productVersion | cut -d. -f1)
if [ "$macos_major" -lt "$REQUIRED_MACOS_MAJOR" ]; then
  fail "macOS $REQUIRED_MACOS_MAJOR or newer required (got $(sw_vers -productVersion))"
fi
ok "macOS $(sw_vers -productVersion)"
# hw.memsize is bytes; integer-divide down to whole GB.
ram_gb=$(( $(sysctl -n hw.memsize) / 1024 / 1024 / 1024 ))
if [ "$ram_gb" -lt "$MIN_RAM_GB" ]; then
  fail "Need at least ${MIN_RAM_GB} GB RAM, have ${ram_gb} GB. The GGUF is 87 GB and is mmap'd into wired Metal pages — under-spec'd machines will swap or fail to allocate."
fi
ok "RAM: ${ram_gb} GB"
# Column 4 of the last df line is available space in GB (-g).
free_gb=$(df -g "$HOME" | awk 'END { print $4 }')
if [ "$free_gb" -lt "$MIN_FREE_DISK_GB" ]; then
  fail "Need at least ${MIN_FREE_DISK_GB} GB free in \$HOME, have ${free_gb} GB. GGUF is 87 GB; plus llama.cpp build artifacts."
fi
ok "Free disk on \$HOME: ${free_gb} GB"
if ! xcode-select -p >/dev/null 2>&1; then
  fail "Xcode Command Line Tools missing. Run: sudo softwareupdate -i \"Command Line Tools for Xcode 26.4-26.4.1\" --verbose"
fi
ok "Xcode CLT at $(xcode-select -p)"
if ! command -v cmake >/dev/null 2>&1; then
  fail "cmake missing. Install: brew install cmake"
fi
ok "cmake $(cmake --version | head -1 | awk '{print $3}')"
command -v git >/dev/null 2>&1 || fail "git missing"
ok "git $(git --version | awk '{print $3}')"
if ! command -v node >/dev/null 2>&1; then
  fail "node missing. Install via nvm: https://github.com/nvm-sh/nvm — needs node ≥20"
fi
# node -v prints e.g. "v20.11.1"; strip the leading v and everything past
# the first dot to get the major version.
node_version=$(node -v)
node_major=${node_version#v}
node_major=${node_major%%.*}
[ "$node_major" -ge 20 ] || fail "node ≥20 required, have $(node -v)"
ok "node $(node -v)"
command -v npm >/dev/null 2>&1 || fail "npm missing"
ok "npm $(npm -v)"
# hf CLI: install via uv if absent; without uv we cannot self-install.
if ! command -v hf >/dev/null 2>&1; then
  if ! command -v uv >/dev/null 2>&1; then
    fail "hf CLI missing and uv not present. Install uv first: curl -LsSf https://astral.sh/uv/install.sh | sh"
  fi
  step "Installing huggingface_hub CLI via uv"
  uv tool install --upgrade "huggingface_hub[cli]" || fail "hf CLI install failed"
fi
ok "hf CLI present"
# ---------- 2. pi.dev ----------
# Install the pi coding agent globally via npm if it is not already on PATH.
step "Checking pi.dev"
command -v pi >/dev/null 2>&1 || {
  step "Installing @mariozechner/pi-coding-agent globally"
  npm install -g @mariozechner/pi-coding-agent || fail "pi.dev install failed"
}
# Best-effort version probe; "?" when pi won't report one.
PI_VERSION=$(pi --version 2>/dev/null || echo "?")
ok "pi.dev v$PI_VERSION"
# ---------- 3. llama.cpp (antirez fork) ----------
# Shallow-clone the fork if missing, verify the V4 chat template it ships,
# then build llama-server with Metal (unless --skip-build).
step "Checking llama.cpp (antirez fork)"
mkdir -p "$POC_DIR"
if [ ! -d "$LLAMA_DIR/.git" ]; then
  step "Cloning $LLAMA_REPO_URL"
  git clone --depth 1 "$LLAMA_REPO_URL" "$LLAMA_DIR" || fail "git clone failed"
fi
ok "Source at $LLAMA_DIR"
[ -f "$TEMPLATE_PATH" ] || fail "Chat template missing at $TEMPLATE_PATH — fork may have moved it. Without this, tool calls don't work."
ok "V4 jinja template present"
if [ "$SKIP_BUILD" -ne 0 ]; then
  warn "Skipping build (--skip-build)"
else
  if [ ! -x "$LLAMA_BIN" ]; then
    step "Building llama.cpp with Metal (this takes a few minutes)"
    cmake -S "$LLAMA_DIR" -B "$LLAMA_DIR/build" \
      -DGGML_METAL=ON \
      -DLLAMA_CURL=OFF \
      -DCMAKE_BUILD_TYPE=Release \
      || fail "cmake configure failed"
    cmake --build "$LLAMA_DIR/build" --target llama-server -j \
      || fail "cmake build failed"
  fi
  [ -x "$LLAMA_BIN" ] || fail "Build did not produce $LLAMA_BIN"
  ok "llama-server built at $LLAMA_BIN"
fi
# ---------- 4. GGUF weights ----------
# Download the ~87 GB GGUF from Hugging Face unless --skip-model, then sanity
# check the on-disk size to catch truncated downloads.
step "Checking GGUF weights"
mkdir -p "$MODELS_DIR"
if [ "$SKIP_MODEL" -ne 0 ]; then
  warn "Skipping model download (--skip-model)"
  [ -f "$GGUF_PATH" ] || warn "GGUF not present — server will fail to start"
else
  if [ ! -f "$GGUF_PATH" ]; then
    step "Downloading $GGUF_FILE (~87 GB) from $GGUF_HF_REPO"
    note "URL: https://huggingface.co/$GGUF_HF_REPO"
    note "This will take a while. Resume-friendly — re-run if it dies."
    hf download "$GGUF_HF_REPO" "$GGUF_FILE" \
      --local-dir "$MODELS_DIR" \
      || fail "hf download failed"
  fi
  [ -f "$GGUF_PATH" ] || fail "GGUF still missing at $GGUF_PATH"
  # du -g prints "<GB>\t<path>"; a short count means a partial download.
  size_gb=$(du -g "$GGUF_PATH" | cut -f1)
  [ "$size_gb" -ge 80 ] || fail "GGUF size $size_gb GB is suspiciously small (expected ~87 GB) — partial download?"
  ok "GGUF at $GGUF_PATH ($size_gb GB)"
fi
# ---------- 5. Wrapper scripts ----------
# Symlink the repo's bin/ wrappers into ~/.local/bin (idempotent).
step "Linking wrapper scripts to $LOCAL_BIN"
mkdir -p "$LOCAL_BIN"
# Best-effort: mark wrappers and serve.sh executable; ignore missing files.
chmod +x "$REPO_ROOT"/bin/* "$POC_DIR"/serve.sh 2>/dev/null || true
for wrapper in pi-local mlxlm-start mlxlm-stop mlxlm-idle-watcher; do
  src="$REPO_ROOT/bin/$wrapper"
  dst="$LOCAL_BIN/$wrapper"
  [ -x "$src" ] || fail "Missing wrapper: $src"
  # Re-link only when the symlink is absent or points somewhere else.
  if [ ! -L "$dst" ] || [ "$(readlink "$dst")" != "$src" ]; then
    ln -sfn "$src" "$dst"
  fi
done
ok "Symlinks: pi-local, mlxlm-start, mlxlm-stop, mlxlm-idle-watcher"
# Warn (don't edit shell rc ourselves) if ~/.local/bin isn't on PATH.
case ":$PATH:" in
  *":$LOCAL_BIN:"*) ;;
  *) warn "$LOCAL_BIN not on \$PATH. Add to your shell rc: export PATH=\"$LOCAL_BIN:\$PATH\"" ;;
esac
# ---------- 6. pi config ----------
# Write pi's provider config pointing at the local llama-server. This file is
# fully managed by the bootstrap: `cat >` overwrites it on every run.
# The heredoc delimiter is unquoted so $GGUF_PATH expands into the JSON.
# NOTE(review): the expanded path is not JSON-escaped — a path containing a
# double quote or backslash would produce invalid JSON; confirm acceptable.
step "Writing pi.dev config"
mkdir -p "$(dirname "$PI_MODELS_JSON")"
cat > "$PI_MODELS_JSON" <<JSON
{
"providers": {
"llamacpp": {
"baseUrl": "http://127.0.0.1:8080/v1",
"api": "openai-completions",
"apiKey": "EMPTY",
"authHeader": true,
"models": [
{
"id": "$GGUF_PATH",
"name": "$GGUF_PATH",
"reasoning": false,
"input": ["text"],
"cost": { "input": 0, "output": 0, "cacheRead": 0, "cacheWrite": 0 },
"contextWindow": 262144,
"maxTokens": 16384
}
]
}
}
}
JSON
ok "Wrote $PI_MODELS_JSON"
# ailocal.conf — preserve any existing settings; only set the keys we manage
mkdir -p "$PI_CONFIG_DIR"
touch "$AILOCAL_CONF"
# set_conf KEY VALUE — idempotently set KEY="VALUE" in $AILOCAL_CONF:
# update the existing line in place if KEY is present, else append.
# KEY must be a plain identifier (it is interpolated unescaped into the
# grep/sed patterns). VALUE is escaped for the sed replacement so values
# containing '\', '&' or our '|' delimiter don't corrupt the file.
set_conf() {
  local key="$1" val="$2"
  if grep -q "^$key=" "$AILOCAL_CONF"; then
    # Escape sed-replacement metacharacters: backslash first, then the
    # '&' back-reference and the '|' delimiter.
    local esc=${val//\\/\\\\}
    esc=${esc//&/\\&}
    esc=${esc//|/\\|}
    # macOS sed needs explicit empty -i ''
    sed -i '' "s|^$key=.*|$key=\"$esc\"|" "$AILOCAL_CONF"
  else
    printf '%s="%s"\n' "$key" "$val" >> "$AILOCAL_CONF"
  fi
}
set_conf AILOCAL_SERVE_SCRIPT "$POC_DIR/serve.sh"
# Only seed the idle timeout if the user hasn't set one themselves.
grep -q "^MLXLM_IDLE_SECONDS=" "$AILOCAL_CONF" || echo 'MLXLM_IDLE_SECONDS=300' >> "$AILOCAL_CONF"
ok "Wrote $AILOCAL_CONF (AILOCAL_SERVE_SCRIPT → POC 7)"
# ---------- 7. Smoke test ----------
# Start the server if needed, wait for health, run one pi round-trip, and
# stop the server again if we were the ones who started it.
if [ "$SKIP_SMOKE" -eq 0 ]; then
  step "Smoke test: start server, hit /v1/models, stop"
  if curl -fsS -m 1 "$HEALTH_URL" >/dev/null 2>&1; then
    note "server already running — leaving it up after smoke test"
    ALREADY_UP=1
  else
    ALREADY_UP=0
    "$LOCAL_BIN/mlxlm-start" >/dev/null || fail "mlxlm-start failed"
  fi
  # Poll up to 60 s for the server to come up (mmap'ing 87 GB takes a while).
  for _ in $(seq 1 60); do
    curl -fsS -m 1 "$HEALTH_URL" >/dev/null 2>&1 && break
    sleep 1
  done
  curl -fsS -m 2 "$HEALTH_URL" >/dev/null 2>&1 \
    || fail "Server didn't respond at $HEALTH_URL within 60s. Check ~/Library/Logs/mlxlm.log"
  ok "Server healthy at $HEALTH_URL"
  step "Round-trip: pi -p 'reply with TEST OK'"
  # BUGFIX: GNU `timeout` is NOT part of stock macOS (it comes from brew's
  # coreutils), so unconditionally invoking it died with "command not found"
  # on exactly the clean machines this script targets. Use it when present;
  # otherwise run pi unbounded and say so.
  if command -v timeout >/dev/null 2>&1; then
    out=$(timeout 60 pi -p --no-session "Reply with exactly: TEST OK" 2>&1 | tail -5)
  else
    warn "timeout not found (brew install coreutils) — running pi without a 60s cap"
    out=$(pi -p --no-session "Reply with exactly: TEST OK" 2>&1 | tail -5)
  fi
  echo "$out" | grep -q "TEST OK" \
    || { echo "$out"; fail "pi round-trip didn't return expected phrase"; }
  ok "pi round-trip OK"
  if [ "$ALREADY_UP" -eq 0 ]; then
    "$LOCAL_BIN/mlxlm-stop" >/dev/null 2>&1 || true
    note "stopped server (was started by smoke test)"
  fi
else
  warn "Skipping smoke test (--skip-smoke)"
fi
# ---------- Done ----------
# Final summary: what the user runs next.
printf '\n'
ok "Bootstrap complete."
printf '\n%s\n' "Next steps:"
printf '%s\n' " pi-local # interactive (TUI — see iTerm2 caveats)"
printf '%s\n' " pi-local -p 'hello' # non-interactive (recommended in iTerm2)"
printf '%s\n' " mlxlm-stop # free RAM when done"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment