kibotu · July 17, 2026 07:05 · nikopuf · Mar 16, 2026 · kibotu · Mar 17, 2026
diff --git a/INSTALL.md b/INSTALL.md
diff --git a/install.sh b/install.sh
 #!/usr/bin/env bash
 set -euo pipefail

 # ---------------------------------------------------------------------------
 # install.sh — Sets up Claude Code + Qwen 3.5 local inference on Apple Silicon
 #
 # Safe to run multiple times. Checks for existing installations, skips what's
 # already done, and won't duplicate entries in .zshrc or settings.json.
 # ---------------------------------------------------------------------------

 # Port and model id of the local endpoint.
 # NOTE(review): neither variable is referenced again in the visible part of
 # this script (the settings and .zshrc heredocs hard-code both) — confirm
 # before relying on them.
 PORT=8131
 MODEL='unsloth/qwen3.5-35b-a3b'

 # Files and directories this installer creates or appends to.
 ZSHRC="${HOME}/.zshrc"
 CLAUDE_SETTINGS_DIR="${HOME}/.claude"
 CLAUDE_SETTINGS="${CLAUDE_SETTINGS_DIR}/settings.json"

 # --- Colors (because we're not savages) ---
 # ANSI escape sequences, kept as unexpanded text; they are turned into real
 # escape characters at print time.
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[0;33m'
 BLUE='\033[0;34m'
 BOLD='\033[1m'
 RESET='\033[0m'

 # Status helpers. Each takes the message as $1 and prints "<tag>  <message>"
 # on one line; backslash escapes inside the message are expanded.
 info()    { printf '%b\n' "${BLUE}[info]${RESET}  $1"; }
 ok()      { printf '%b\n' "${GREEN}[ok]${RESET}    $1"; }
 skip()    { printf '%b\n' "${YELLOW}[skip]${RESET}  $1"; }
 warn()    { printf '%b\n' "${YELLOW}[warn]${RESET}  $1"; }
 # fail reports on stderr, so the error stays visible when stdout is
 # redirected or captured, then ends the script with exit status 1.
 fail()    { printf '%b\n' "${RED}[fail]${RESET}  $1" >&2; exit 1; }
 # header prints an empty line followed by the section title in bold.
 header()  { printf '%b\n' "\n${BOLD}$1${RESET}"; }

 # ---------------------------------------------------------------------------
 # Preflight checks
 # ---------------------------------------------------------------------------
 header "Preflight checks"

 # Hard requirement: macOS. Anything else stops the install here.
 case "$(uname -s)" in
    Darwin) ;;
    *) fail "This script is for macOS only. You're on $(uname -s). Godspeed." ;;
 esac

 # Soft requirement: arm64. Other architectures only get a warning.
 [[ "$(uname -m)" == "arm64" ]] \
    || warn "You're on $(uname -m). This guide targets Apple Silicon (arm64). Things might still work, but no promises."

 # Installed memory in whole GiB (hw.memsize is in bytes; 1073741824 = 2^30).
 RAM_BYTES=$(sysctl -n hw.memsize)
 RAM_GB=$(( RAM_BYTES / 1073741824 ))
 if (( RAM_GB >= 24 )); then
    ok "${RAM_GB} GB unified memory — plenty of room."
 elif (( RAM_GB >= 16 )); then
    warn "${RAM_GB} GB RAM detected. You'll need to use -c 32768 instead of -c 131072. The run script handles this automatically."
 else
    fail "You have ${RAM_GB} GB of RAM. The model needs at least 16 GB (24 GB recommended). Sorry."
 fi

 # Fourth column of the second line of `df -g` output for $HOME.
 DISK_FREE_GB=$(df -g "$HOME" | awk 'NR==2 {print $4}')
 if (( DISK_FREE_GB >= 25 )); then
    ok "${DISK_FREE_GB} GB free disk space."
 else
    warn "Only ${DISK_FREE_GB} GB free disk space. Model download is ~20 GB. It'll be tight."
 fi

 # ---------------------------------------------------------------------------
 # 1. Homebrew
 # ---------------------------------------------------------------------------
 header "1/6 — Homebrew"

 if ! command -v brew &>/dev/null; then
    info "Installing Homebrew..."
    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"

    # Load brew's environment into this shell when it landed in /opt/homebrew,
    # so the `brew` calls later in this run can find it.
    [[ ! -f /opt/homebrew/bin/brew ]] || eval "$(/opt/homebrew/bin/brew shellenv)"
    ok "Homebrew installed."
 else
    # Second field of the first line of `brew --version` is the version number.
    ok "Homebrew $(brew --version | head -1 | awk '{print $2}') already installed."
 fi

 # ---------------------------------------------------------------------------
 # 2. Node.js
 # ---------------------------------------------------------------------------
 header "2/6 — Node.js"

 if command -v node &>/dev/null; then
    # Reduce e.g. "v20.11.1" to its major number "20".
    NODE_VERSION=$(node --version)
    NODE_VERSION=${NODE_VERSION#v}
    NODE_VERSION=${NODE_VERSION%%.*}
    if (( NODE_VERSION >= 18 )); then
        ok "Node.js $(node --version) already installed."
    else
        info "Node.js $(node --version) is too old (need 18+). Upgrading..."
        # `brew upgrade` fails for a formula Homebrew did not install itself
        # (node from nvm, a .pkg installer, ...), which would abort the script
        # under `set -e`. Install it through Homebrew in that case.
        if brew list --formula node &>/dev/null; then
            brew upgrade node
        else
            brew install node
        fi
        # Drop bash's cached path so the next lookup can find the new binary.
        hash -r
        ok "Node.js upgraded to $(node --version)."
    fi
 else
    info "Installing Node.js..."
    brew install node
    ok "Node.js $(node --version) installed."
 fi

 # ---------------------------------------------------------------------------
 # 3. llama.cpp
 # ---------------------------------------------------------------------------
 header "3/6 — llama.cpp"

 if command -v llama-server &>/dev/null; then
    ok "llama-server already in PATH."
 else
    info "Installing llama.cpp via Homebrew..."
    brew install llama.cpp
    ok "llama.cpp installed."
 fi

 # Best-effort version banner: a failure here must never abort the install
 # under `set -e`, which `|| true` states explicitly.
 llama-server --version 2>/dev/null || true

 # ---------------------------------------------------------------------------
 # 4. Claude Code
 # ---------------------------------------------------------------------------
 header "4/6 — Claude Code"

 if ! command -v claude &>/dev/null; then
    info "Installing Claude Code via Homebrew..."

    # `claude` did not resolve on PATH, so a file still sitting at this path is
    # treated as a leftover and removed before the cask is installed.
    stale_bin=/opt/homebrew/bin/claude
    if [[ -f "$stale_bin" ]]; then
        warn "Stale binary found at ${stale_bin} — removing it."
        rm -f "$stale_bin"
    fi

    brew install --cask claude-code
    ok "Claude Code installed."
 else
    # Falls back to a placeholder when `claude --version` itself fails.
    ok "Claude Code $(claude --version 2>/dev/null || echo '(version unknown)') already installed."
 fi

 # ---------------------------------------------------------------------------
 # 5. Claude Code settings.json
 # ---------------------------------------------------------------------------
 header "5/6 — Claude Code settings (~/.claude/settings.json)"

 # Make sure the settings directory exists (no error if it already does).
 mkdir -p "$CLAUDE_SETTINGS_DIR"

 # The settings payload, captured verbatim into DESIRED_SETTINGS.
 # The heredoc delimiter is quoted ('SETTINGS_EOF'), so nothing inside is
 # expanded: the JSON key "$schema" stays literal, and the port (8131) and
 # model id are hard-coded here rather than taken from $PORT / $MODEL.
 #   env          — endpoint, token, model ids and DISABLE_* switches
 #   permissions  — "allow" lists pre-approved tools and shell command
 #                  patterns; "deny" blocks reads of .env files and ./secrets
 DESIRED_SETTINGS=$(cat << 'SETTINGS_EOF'
 {
  "$schema": "https://json.schemastore.org/claude-code-settings.json",
  "env": {
    "ANTHROPIC_BASE_URL": "http://127.0.0.1:8131",
    "ANTHROPIC_AUTH_TOKEN": "local",
    "ANTHROPIC_MODEL": "unsloth/qwen3.5-35b-a3b",
    "ANTHROPIC_DEFAULT_OPUS_MODEL": "unsloth/qwen3.5-35b-a3b",
    "ANTHROPIC_DEFAULT_SONNET_MODEL": "unsloth/qwen3.5-35b-a3b",
    "ANTHROPIC_DEFAULT_HAIKU_MODEL": "unsloth/qwen3.5-35b-a3b",
    "CLAUDE_CODE_SUBAGENT_MODEL": "unsloth/qwen3.5-35b-a3b",
    "CLAUDE_CODE_MAX_OUTPUT_TOKENS": "128000",
    "DISABLE_PROMPT_CACHING": "1",
    "DISABLE_AUTOUPDATER": "1",
    "DISABLE_TELEMETRY": "1",
    "DISABLE_ERROR_REPORTING": "1",
    "DISABLE_NON_ESSENTIAL_MODEL_CALLS": "1"
  },
  "permissions": {
    "allow": [
      "Bash(git *)",
      "Bash(npm *)",
      "Bash(npx *)",
      "Bash(node *)",
      "Bash(python *)",
      "Bash(python3 *)",
      "Bash(pip *)",
      "Bash(pip3 *)",
      "Bash(brew *)",
      "Bash(ls *)",
      "Bash(cat *)",
      "Bash(head *)",
      "Bash(tail *)",
      "Bash(find *)",
      "Bash(grep *)",
      "Bash(rg *)",
      "Bash(mkdir *)",
      "Bash(cp *)",
      "Bash(mv *)",
      "Bash(rm *)",
      "Bash(echo *)",
      "Bash(curl *)",
      "Bash(which *)",
      "Bash(env *)",
      "Bash(cd *)",
      "Bash(pwd)",
      "Bash(wc *)",
      "Bash(sort *)",
      "Bash(uniq *)",
      "Bash(diff *)",
      "Bash(chmod *)",
      "Bash(touch *)",
      "Bash(sed *)",
      "Bash(awk *)",
      "Bash(xargs *)",
      "Bash(tee *)",
      "Read",
      "Edit",
      "Write",
      "Glob",
      "Grep",
      "WebFetch",
      "WebSearch"
    ],
    "deny": [
      "Read(./.env)",
      "Read(./.env.*)",
      "Read(./secrets/**)"
    ]
  }
 }
 SETTINGS_EOF
 )

 # Install DESIRED_SETTINGS without silently destroying user configuration:
 #   - no file, or an effectively empty one  -> write it
 #   - file already sets ANTHROPIC_BASE_URL  -> leave it untouched
 #   - anything else                         -> back it up, then REPLACE it
 #     (the old content is not merged into the new file)
 if [[ -f "$CLAUDE_SETTINGS" ]]; then
    EXISTING=$(< "$CLAUDE_SETTINGS")
    # Compare with all whitespace removed so "{ }" or a blank file with stray
    # newlines also counts as empty.
    EXISTING_COMPACT=${EXISTING//[[:space:]]/}
    if [[ -z "$EXISTING_COMPACT" || "$EXISTING_COMPACT" == "{}" ]]; then
        info "Empty settings.json found — writing local model configuration..."
        printf '%s\n' "$DESIRED_SETTINGS" > "$CLAUDE_SETTINGS"
        ok "Settings written to $CLAUDE_SETTINGS."
    elif grep -q '"ANTHROPIC_BASE_URL"' <<< "$EXISTING"; then
        skip "settings.json already contains ANTHROPIC_BASE_URL — not overwriting."
        info "Review manually: ${BOLD}$CLAUDE_SETTINGS${RESET}"
    else
        # Timestamped name so repeated runs never overwrite an earlier backup.
        SETTINGS_BACKUP="$CLAUDE_SETTINGS.backup.$(date +%s)"
        warn "settings.json exists with custom content. Backing up and replacing it..."
        cp "$CLAUDE_SETTINGS" "$SETTINGS_BACKUP"
        printf '%s\n' "$DESIRED_SETTINGS" > "$CLAUDE_SETTINGS"
        ok "Settings written. Previous version backed up to $SETTINGS_BACKUP."
    fi
 else
    info "Creating ~/.claude/settings.json..."
    printf '%s\n' "$DESIRED_SETTINGS" > "$CLAUDE_SETTINGS"
    ok "Settings written to $CLAUDE_SETTINGS."
 fi

 # ---------------------------------------------------------------------------
 # 6. Shell configuration (.zshrc)
 # ---------------------------------------------------------------------------
 header "6/6 — Shell configuration (.zshrc)"

 # MARKER is the start of the comment line the heredoc below appends, so a
 # later run finds it with a fixed-string grep and skips the block.
 MARKER="# Claude Code + Local Qwen 3.5"

 # grep's stderr is discarded, so a missing $ZSHRC just takes the else branch,
 # where `>>` creates the file. The heredoc delimiter is quoted ('EOF'):
 # "$LM_MODEL" is written literally and only expanded when .zshrc is sourced.
 if grep -qF "$MARKER" "$ZSHRC" 2>/dev/null; then
    skip "Environment variables already in $ZSHRC."
 else
    info "Adding environment variables to $ZSHRC..."
    cat << 'EOF' >> "$ZSHRC"

 # Claude Code + Local Qwen 3.5 — because localhost > cloud
 export LM_MODEL="unsloth/qwen3.5-35b-a3b"
 export ANTHROPIC_BASE_URL="http://127.0.0.1:8131"
 export ANTHROPIC_AUTH_TOKEN="local"
 export CLAUDE_CODE_MAX_OUTPUT_TOKENS=128000
 export ANTHROPIC_MODEL="$LM_MODEL"
 export ANTHROPIC_DEFAULT_OPUS_MODEL="$LM_MODEL"
 export ANTHROPIC_DEFAULT_SONNET_MODEL="$LM_MODEL"
 export ANTHROPIC_DEFAULT_HAIKU_MODEL="$LM_MODEL"
 export CLAUDE_CODE_SUBAGENT_MODEL="$LM_MODEL"
 EOF
    ok "Environment variables added to $ZSHRC."
 fi

 # The function's own definition line doubles as its "already installed" marker.
 CCLOCAL_MARKER="cclocal()"

 # Appends the cclocal helper once. As written in the heredoc, the function
 # takes an optional all-digit first argument as the port (default 8131),
 # shifts it away, and runs `claude` with the remaining arguments and
 # ANTHROPIC_BASE_URL / ANTHROPIC_AUTH_TOKEN set for that one invocation.
 if grep -qF "$CCLOCAL_MARKER" "$ZSHRC" 2>/dev/null; then
    skip "cclocal() helper already in $ZSHRC."
 else
    info "Adding cclocal() helper to $ZSHRC..."
    cat << 'FUNC' >> "$ZSHRC"

 # Quick launcher for Claude Code against any local llama-server port
 cclocal() {
    local port=8131
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        port="$1"
        shift
    fi
    ANTHROPIC_BASE_URL="http://127.0.0.1:${port}" \
    ANTHROPIC_AUTH_TOKEN="local" \
    claude "$@"
 }
 FUNC
    ok "cclocal() helper added."
 fi

 # ---------------------------------------------------------------------------
 # Done
 # ---------------------------------------------------------------------------
 # Closing summary: what was touched and what to do next. printf's %b expands
 # the color escapes; every argument is printed on its own line.
 printf '\n'
 header "Installation complete."
 printf '\n'
 printf '%b\n' \
    "What was configured:" \
    "  ${GREEN}~/.claude/settings.json${RESET}  — env vars, permissions, telemetry settings" \
    "  ${GREEN}~/.zshrc${RESET}                 — env vars (fallback) + cclocal() helper" \
    "" \
    "Next steps:" \
    "  1. Reload your shell:  ${BOLD}source ~/.zshrc${RESET}" \
    "  2. Start the server:   ${BOLD}./run.sh${RESET}" \
    "  3. In another terminal: ${BOLD}claude${RESET}  (or ${BOLD}cclocal${RESET})" \
    "" \
    "First run downloads ~20 GB of model weights. Go make coffee." \
    ""
diff --git a/run.sh b/run.sh
 #!/usr/bin/env bash
 set -euo pipefail

 # ---------------------------------------------------------------------------
 # run.sh — Start (or restart) the Qwen 3.5 llama-server for Claude Code
 #
 # Safe to run multiple times. Kills any existing llama-server on the target
 # port before starting a new one. Auto-detects your chip and RAM.
 # ---------------------------------------------------------------------------

 # Port the server is started on, and the model reference handed to
 # llama-server's -hf flag.
 PORT=8131
 MODEL_HF="unsloth/Qwen3.5-35B-A3B-GGUF:Q4_K_M"

 # --- Colors ---
 # ANSI escape sequences, kept as unexpanded text; they are turned into real
 # escape characters at print time.
 RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[0;33m'
 BLUE='\033[0;34m'
 BOLD='\033[1m'
 RESET='\033[0m'

 # Status helpers. Each takes the message as $1 and prints "<tag>  <message>"
 # on one line; backslash escapes inside the message are expanded.
 info()    { printf '%b\n' "${BLUE}[info]${RESET}  $1"; }
 ok()      { printf '%b\n' "${GREEN}[ok]${RESET}    $1"; }
 warn()    { printf '%b\n' "${YELLOW}[warn]${RESET}  $1"; }
 # fail reports on stderr, so the error stays visible when stdout is
 # redirected or captured, then ends the script with exit status 1.
 fail()    { printf '%b\n' "${RED}[fail]${RESET}  $1" >&2; exit 1; }
 # header prints an empty line followed by the section title in bold.
 header()  { printf '%b\n' "\n${BOLD}$1${RESET}"; }

 # ---------------------------------------------------------------------------
 # Preflight
 # ---------------------------------------------------------------------------
 header "Preflight"

 # Nothing below can work without the server binary, so stop right away when
 # it is not on PATH.
 command -v llama-server &>/dev/null \
    || fail "llama-server not found. Run ./install.sh first."

 # ---------------------------------------------------------------------------
 # Auto-detect CPU threads
 # ---------------------------------------------------------------------------
 # Prints the thread count to hand to llama-server's -t flag.
 # Outputs: one number on stdout.
 #   1. the value of sysctl hw.perflevel0.logicalcpu, when the query succeeds
 #      and yields a non-empty result;
 #   2. otherwise a guess from the CPU brand string:
 #      "Ultra" -> 16, "Max" or "Pro" -> 8, anything else (or no answer) -> 4.
 detect_threads() {
    local perf_cpus chip_name

    perf_cpus=$(sysctl -n hw.perflevel0.logicalcpu 2>/dev/null || true)

    if [[ -z "$perf_cpus" ]]; then
        chip_name=$(sysctl -n machdep.cpu.brand_string 2>/dev/null || true)
        # "Ultra" is tested first, matching the precedence of the lookup.
        if [[ "$chip_name" == *Ultra* ]]; then
            perf_cpus=16
        elif [[ "$chip_name" == *Max* || "$chip_name" == *Pro* ]]; then
            perf_cpus=8
        else
            perf_cpus=4
        fi
    fi

    printf '%s\n' "$perf_cpus"
 }

 THREADS=$(detect_threads)
 info "Detected ${THREADS} performance cores — using -t ${THREADS}."

 # ---------------------------------------------------------------------------
 # Pick the context size (-c) from installed RAM, counted in whole GiB:
 #   below 24 -> 32768    24 to 47 -> 65536    48 and up -> 131072
 # ---------------------------------------------------------------------------
 RAM_BYTES=$(sysctl -n hw.memsize)
 RAM_GB=$(( RAM_BYTES / 1073741824 ))
 if (( RAM_GB >= 48 )); then
    CONTEXT=131072
    info "${RAM_GB} GB RAM — using full context (-c ${CONTEXT})."
 elif (( RAM_GB >= 24 )); then
    CONTEXT=65536
    info "${RAM_GB} GB RAM — using moderate context (-c ${CONTEXT})."
 else
    CONTEXT=32768
    warn "${RAM_GB} GB RAM — using reduced context (-c ${CONTEXT})."
 fi

 # ---------------------------------------------------------------------------
 # Kill existing llama-server on our port (if any)
 # ---------------------------------------------------------------------------
 header "Port ${PORT}"

 # Prints the PIDs of processes LISTENING on $PORT, one per line (nothing when
 # the port is free). -sTCP:LISTEN leaves out processes that merely hold a
 # client connection to the port, so they are never killed by mistake.
 # lsof exits non-zero when nothing matches, hence the `|| true`.
 port_listener_pids() {
    lsof -nP -t -iTCP:"$PORT" -sTCP:LISTEN 2>/dev/null || true
 }

 # lsof can report several PIDs. Keep them as separate array elements so each
 # one reaches `kill` as its own argument; a single multi-line string would
 # make kill reject the whole argument.
 EXISTING_PIDS=()
 while IFS= read -r pid; do
    if [[ -n "$pid" ]]; then
        EXISTING_PIDS+=("$pid")
    fi
 done < <(port_listener_pids)

 if (( ${#EXISTING_PIDS[@]} > 0 )); then
    warn "Port ${PORT} is in use (PID ${EXISTING_PIDS[*]}). Killing it..."
    # SIGTERM first; a process that already exited is not an error.
    kill "${EXISTING_PIDS[@]}" 2>/dev/null || true
    sleep 2

    if [[ -n "$(port_listener_pids)" ]]; then
        warn "Still alive. Sending SIGKILL..."
        kill -9 "${EXISTING_PIDS[@]}" 2>/dev/null || true
        sleep 1
    fi

    # Only report success when the port really is free; otherwise the new
    # server would fail to bind with a far less obvious error.
    if [[ -n "$(port_listener_pids)" ]]; then
        fail "Port ${PORT} is still in use. Free it manually and run this script again."
    fi

    ok "Port ${PORT} cleared."
 else
    ok "Port ${PORT} is free."
 fi

 # ---------------------------------------------------------------------------
 # Start llama-server
 # ---------------------------------------------------------------------------
 header "Starting llama-server"

 info "Model:   ${MODEL_HF}"
 info "Port:    ${PORT}"
 info "Threads: ${THREADS}"
 info "Context: ${CONTEXT}"
 echo ""

 # Server command line, one option per array element so every value reaches
 # llama-server as exactly one argument.
 SERVER_ARGS=(
    -hf "$MODEL_HF"
    --port "$PORT"
    -ngl 999              # offload every layer to the GPU
    -t "$THREADS"         # CPU threads for non-offloaded work
    -c "$CONTEXT"         # context size chosen from RAM
    -b 512                # prompt batch size
    -ub 1024
    --parallel 1
    -fa on                # flash attention
    --jinja
    --keep 1024           # keep the system prompt pinned in cache
    --cache-type-k q8_0   # quantized KV cache (keys)
    --cache-type-v q8_0   # quantized KV cache (values)
    --swa-full
    --no-context-shift    # goes together with --swa-full
    --chat-template-kwargs '{"enable_thinking": false}'
    --mlock               # lock the model in RAM
    --no-mmap             # do not memory-map the model file
 )

 # Run in the background so this script can poll for readiness; remember the
 # PID for the liveness check and the final `wait`.
 llama-server "${SERVER_ARGS[@]}" &

 SERVER_PID=$!

 # ---------------------------------------------------------------------------
 # Wait for the server to be ready
 # ---------------------------------------------------------------------------
 info "Waiting for server to be ready (PID ${SERVER_PID})..."

 # Seconds to wait before giving up. Defaults to 120; override through the
 # environment when the first run still has to download the weights, e.g.
 #   MAX_WAIT=3600 ./run.sh
 MAX_WAIT="${MAX_WAIT:-120}"
 [[ "$MAX_WAIT" =~ ^[0-9]+$ ]] \
    || fail "MAX_WAIT must be a whole number of seconds (got '${MAX_WAIT}')."

 WAITED=0
 READY=0
 while (( WAITED < MAX_WAIT )); do
    # Ready as soon as the models endpoint answers with a success status.
    if curl -sf "http://localhost:${PORT}/v1/models" &>/dev/null; then
        READY=1
        break
    fi

    # kill -0 sends no signal; it only checks that the process still exists.
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
        fail "llama-server exited unexpectedly. Check the output above for errors."
    fi

    sleep 2
    WAITED=$((WAITED + 2))
    # \r redraws the counter on one line. Colors go through %b as data rather
    # than being spliced into the format string.
    printf '\r%b  Waiting... %ds / %ds' "${BLUE}[info]${RESET}" "$WAITED" "$MAX_WAIT"
 done
 echo ""

 # NOTE: the server was started in the background and is not stopped here, so
 # it is left running when this script gives up.
 if (( ! READY )); then
    fail "Server didn't respond within ${MAX_WAIT}s. First run downloads ~20 GB — try again once the download completes."
 fi

 ok "Server is ready on http://localhost:${PORT}"
 echo ""

 # ---------------------------------------------------------------------------
 # Summary
 # ---------------------------------------------------------------------------
 header "You're good to go."
 # Usage hints. printf's %b expands the color escapes; every argument is
 # printed on its own line.
 printf '%b\n' \
    "" \
    "  The model is serving on ${BOLD}http://localhost:${PORT}${RESET} (PID ${SERVER_PID})." \
    "" \
    "  Open a new terminal and run:" \
    "" \
    "    ${BOLD}claude${RESET}                          # if you ran install.sh (env vars are in .zshrc)" \
    "    ${BOLD}cclocal${RESET}                         # same, using the shell helper" \
    "    ${BOLD}cclocal ${PORT}${RESET}                     # explicitly specify this port" \
    "" \
    "  Or the fully explicit one-liner:" \
    "" \
    "    ${BOLD}ANTHROPIC_BASE_URL=http://127.0.0.1:${PORT} ANTHROPIC_AUTH_TOKEN=local claude${RESET}" \
    "" \
    "  To stop the server:  ${BOLD}kill ${SERVER_PID}${RESET}  or  ${BOLD}pkill llama-server${RESET}  or  ${BOLD}Ctrl+C${RESET} here." \
    ""

 # Block on the background server so the script stays in the foreground until
 # the server exits (Ctrl+C here is meant to stop it).
 wait "$SERVER_PID"
What	Minimum	Ideal
Mac	Apple Silicon (M1 / M2 / M3 / M4)	Any Pro / Max variant
Unified Memory	24 GB (16 GB works, with the context reduced to 32K)	64 GB
Free Disk	~25 GB	~40 GB (room to try other models)
macOS	13.0+ (Ventura)	Latest
Node.js	18+	22+
Homebrew	Installed	You're self-hosting LLMs, of course you have Homebrew
Patience	First model download is ~20 GB	Go make coffee
Your Mac	Set `-t` to
M1 / M2 / M3 / M4 (base)	`4`
M1 / M2 / M3 / M4 Pro / Max	`8`
M2 / M3 Ultra	`16`
Flag	Why it matters
`-ngl 999`	Offload every layer to Metal GPU. This is the single biggest speedup. Without it, your CPU does all the work and your GPU sits there looking pretty.
`-t 4`	CPU threads for non-offloaded work. Match to your P-core count (table above).
`-b 512`	Prompt batch size. 512 beats 2048 on M2 base in benchmarks. Counterintuitive, but true.
`--swa-full`	The hidden performance flag. Qwen 3.5 uses sliding window attention. Without this, every follow-up request reprocesses the entire prompt from scratch. With it, prompt caching works. The difference is ~10x on follow-up latency.
`--no-context-shift`	Required when using `--swa-full`. Context shifting is incompatible with SWA.
`--chat-template-kwargs '{"enable_thinking": false}'`	Disables the model's internal chain-of-thought. In agentic workflows, those thinking tokens are wasted — Claude Code manages its own reasoning.
`--cache-type-k/v q8_0`	Quantize the KV cache. Near-zero quality loss, measurable throughput improvement. Free lunch.
`--keep 1024`	Pin the system prompt in cache. Claude Code sends a chunky system prompt — no point re-processing it every turn.
`--mlock`	Lock the model in RAM. Prevents macOS from deciding your model weights are a great candidate for swap.
`--no-mmap`	Don't memory-map the model file. More stable on macOS, especially under memory pressure.
`-fa on`	Flash attention. Faster prompt evaluation.
Setting	Why
`env.ANTHROPIC_BASE_URL`	Points Claude Code at your local llama-server instead of Anthropic's cloud.
`env.ANTHROPIC_AUTH_TOKEN`	Any non-empty string. Satisfies the auth check without a real API key.
`env.*_MODEL`	Maps every model tier (Opus, Sonnet, Haiku, subagents) to your local Qwen 3.5.
`env.CLAUDE_CODE_MAX_OUTPUT_TOKENS`	Allows longer responses. Default is 32K — local models have no billing, so crank it.
`env.DISABLE_PROMPT_CACHING`	Prompt caching is an Anthropic API feature. Your local server handles its own caching via `--swa-full`.
`env.DISABLE_AUTOUPDATER`	You're running local. Auto-updates would just add network calls you don't need.
`env.DISABLE_TELEMETRY`	Running local to keep things private? Then don't phone home.
`env.DISABLE_ERROR_REPORTING`	Same reasoning. Your errors, your business.
`env.DISABLE_NON_ESSENTIAL_MODEL_CALLS`	Stops Claude Code from making extra model calls for things like spinner text. Every token counts on local inference.
`permissions.allow`	Pre-approves common shell commands and all file tools. Without this, Claude Code asks for permission on every single tool call. Gets old fast.
`permissions.deny`	Keeps `.env` and secrets off-limits, because even local models shouldn't read your credentials.
Model	Port	Size	Good For	Command
Qwen3-Coder-30B-A3B	8127	~30 GB	Pure coding. If you only write code, this might be better than Qwen 3.5.	`llama-server --fim-qwen-30b-default --port 8127`
GLM-4.7-Flash	8129	~18 GB	Lighter weight, still capable. Good if you're on 24 GB.	`llama-server -hf unsloth/GLM-4.7-Flash-GGUF:UD-Q4_K_XL --port 8129 -c 131072 -b 2048 -ub 1024 --parallel 1 -fa on --jinja --chat-template-file ~/llama.cpp/models/templates/glm-4.jinja`
GPT-OSS-20B	8123	~20 GB	Fast baseline. 17–38 tok/s on M1 Max.	`llama-server --gpt-oss-20b-default --port 8123`
Qwen3-Coder-Next-80B-A3B	8130	~46 GB	SOTA coder. Needs 64 GB RAM. Worth it if you have the metal.	`llama-server -hf unsloth/Qwen3-Coder-Next-GGUF:UD-Q4_K_XL --port 8130 -c 131072 -b 2048 -ub 1024 --parallel 1 -fa on --jinja`
Symptom	Likely Cause	Fix
`connection refused` on curl	Server still loading	Wait 30s, retry. First load is slow.
Painfully slow responses	Missing `--swa-full` or `-ngl 999`	Restart llama-server with both flags. This is the #1 gotcha.
Claude Code uses the cloud model	Env vars not set	Re-export them in the terminal you're using. Check with `echo $ANTHROPIC_BASE_URL`.
`failed to find a memory slot`	Context too large for available RAM	Use `--parallel 1` and/or reduce `-c` to `32768`.
Auth errors from Claude Code	Missing auth token	`export ANTHROPIC_AUTH_TOKEN=local` — any non-empty string works.
Wrong model responding	Model ID mismatch	Run `curl localhost:8131/v1/models` and compare with `echo $LM_MODEL`.
First request takes forever	Cold start, model loading into memory	Normal. 10–30 seconds. Subsequent requests are fast.
System swapping / beachball	Model doesn't fit in RAM	Reduce `-c`, or try a smaller model like GLM-4.7-Flash (~18 GB).
Homebrew binary conflict	Previous install left `/opt/homebrew/bin/claude`	`rm /opt/homebrew/bin/claude && brew reinstall --cask claude-code`
	#!/usr/bin/env bash
	set -euo pipefail

	# ---------------------------------------------------------------------------
	# install.sh — Sets up Claude Code + Qwen 3.5 local inference on Apple Silicon
	#
	# Safe to run multiple times. Checks for existing installations, skips what's
	# already done, and won't duplicate entries in .zshrc or settings.json.
	# ---------------------------------------------------------------------------

	# Port and model id of the local endpoint.
	# NOTE(review): neither variable is referenced again in the visible part of
	# this script (the settings and .zshrc heredocs hard-code both) — confirm
	# before relying on them.
	PORT=8131
	MODEL="unsloth/qwen3.5-35b-a3b"
	# Files and directories this installer creates or appends to.
	ZSHRC="$HOME/.zshrc"
	CLAUDE_SETTINGS_DIR="$HOME/.claude"
	CLAUDE_SETTINGS="$CLAUDE_SETTINGS_DIR/settings.json"

	# --- Colors (because we're not savages) ---
	# ANSI escape sequences, kept as unexpanded text; they are turned into real
	# escape characters at print time.
	RED='\033[0;31m'
	GREEN='\033[0;32m'
	YELLOW='\033[0;33m'
	BLUE='\033[0;34m'
	BOLD='\033[1m'
	RESET='\033[0m'

	# Status helpers. Each takes the message as $1 and prints "<tag> <message>"
	# on one line; backslash escapes inside the message are expanded.
	info() { printf '%b\n' "${BLUE}[info]${RESET} $1"; }
	ok() { printf '%b\n' "${GREEN}[ok]${RESET} $1"; }
	skip() { printf '%b\n' "${YELLOW}[skip]${RESET} $1"; }
	warn() { printf '%b\n' "${YELLOW}[warn]${RESET} $1"; }
	# fail reports on stderr, so the error stays visible when stdout is
	# redirected or captured, then ends the script with exit status 1.
	fail() { printf '%b\n' "${RED}[fail]${RESET} $1" >&2; exit 1; }
	# header prints an empty line followed by the section title in bold.
	header() { printf '%b\n' "\n${BOLD}$1${RESET}"; }

	# ---------------------------------------------------------------------------
	# Preflight checks
	# ---------------------------------------------------------------------------
	header "Preflight checks"

	# Hard requirement: macOS. Anything else stops the install here.
	if [[ "$(uname -s)" != "Darwin" ]]; then
	    fail "This script is for macOS only. You're on $(uname -s). Godspeed."
	fi

	# Soft requirement: arm64. Other architectures only get a warning.
	if [[ "$(uname -m)" != "arm64" ]]; then
	    warn "You're on $(uname -m). This guide targets Apple Silicon (arm64). Things might still work, but no promises."
	fi

	# Installed memory in whole GiB (hw.memsize is in bytes; 1073741824 = 2^30).
	RAM_GB=$(( $(sysctl -n hw.memsize) / 1073741824 ))
	if (( RAM_GB < 16 )); then
	    fail "You have ${RAM_GB} GB of RAM. The model needs at least 16 GB (24 GB recommended). Sorry."
	elif (( RAM_GB < 24 )); then
	    warn "${RAM_GB} GB RAM detected. You'll need to use -c 32768 instead of -c 131072. The run script handles this automatically."
	else
	    ok "${RAM_GB} GB unified memory — plenty of room."
	fi

	# Fourth column of the second line of `df -g` output for $HOME. This has to
	# be a real pipe: an escaped one would hand "|" and "awk" to df as arguments.
	DISK_FREE_GB=$(df -g "$HOME" | awk 'NR==2 {print $4}')
	if (( DISK_FREE_GB < 25 )); then
	    warn "Only ${DISK_FREE_GB} GB free disk space. Model download is ~20 GB. It'll be tight."
	else
	    ok "${DISK_FREE_GB} GB free disk space."
	fi

	# ---------------------------------------------------------------------------
	# 1. Homebrew
	# ---------------------------------------------------------------------------
	header "1/6 — Homebrew"

	if command -v brew &>/dev/null; then
	    # Second field of the first line of `brew --version` is the version
	    # number. The stages are joined by real pipes, not escaped ones.
	    ok "Homebrew $(brew --version | head -1 | awk '{print $2}') already installed."
	else
	    info "Installing Homebrew..."
	    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"

	    # Load brew's environment into this shell when it landed in
	    # /opt/homebrew, so the `brew` calls later in this run can find it.
	    if [[ -f /opt/homebrew/bin/brew ]]; then
	        eval "$(/opt/homebrew/bin/brew shellenv)"
	    fi
	    ok "Homebrew installed."
	fi

	# ---------------------------------------------------------------------------
	# 2. Node.js
	# ---------------------------------------------------------------------------
	header "2/6 — Node.js"

	if command -v node &>/dev/null; then
	    # Reduce e.g. "v20.11.1" to its major number "20".
	    NODE_VERSION=$(node --version)
	    NODE_VERSION=${NODE_VERSION#v}
	    NODE_VERSION=${NODE_VERSION%%.*}
	    if (( NODE_VERSION >= 18 )); then
	        ok "Node.js $(node --version) already installed."
	    else
	        info "Node.js $(node --version) is too old (need 18+). Upgrading..."
	        # `brew upgrade` fails for a formula Homebrew did not install itself
	        # (node from nvm, a .pkg installer, ...), which would abort the
	        # script under `set -e`. Install it through Homebrew in that case.
	        if brew list --formula node &>/dev/null; then
	            brew upgrade node
	        else
	            brew install node
	        fi
	        # Drop bash's cached path so the next lookup can find the new binary.
	        hash -r
	        ok "Node.js upgraded to $(node --version)."
	    fi
	else
	    info "Installing Node.js..."
	    brew install node
	    ok "Node.js $(node --version) installed."
	fi

	# ---------------------------------------------------------------------------
	# 3. llama.cpp
	# ---------------------------------------------------------------------------
	header "3/6 — llama.cpp"

	if command -v llama-server &>/dev/null; then
	    ok "llama-server already in PATH."
	else
	    info "Installing llama.cpp via Homebrew..."
	    brew install llama.cpp
	    ok "llama.cpp installed."
	fi

	# Best-effort version banner: a failure here must never abort the install
	# under `set -e`, which `|| true` states explicitly.
	llama-server --version 2>/dev/null || true

	# ---------------------------------------------------------------------------
	# 4. Claude Code
	# ---------------------------------------------------------------------------
	header "4/6 — Claude Code"

	if command -v claude &>/dev/null; then
	    # Falls back to a placeholder when `claude --version` itself fails; this
	    # needs a real `||`, not an escaped one.
	    ok "Claude Code $(claude --version 2>/dev/null || echo '(version unknown)') already installed."
	else
	    info "Installing Claude Code via Homebrew..."

	    # `claude` did not resolve on PATH, so a file still sitting at this path
	    # is treated as a leftover and removed before the cask is installed.
	    if [[ -f /opt/homebrew/bin/claude ]]; then
	        warn "Stale binary found at /opt/homebrew/bin/claude — removing it."
	        rm -f /opt/homebrew/bin/claude
	    fi

	    brew install --cask claude-code
	    ok "Claude Code installed."
	fi

	# ---------------------------------------------------------------------------
	# 5. Claude Code settings.json
	# ---------------------------------------------------------------------------
	header "5/6 — Claude Code settings (~/.claude/settings.json)"

	# Make sure the settings directory exists (no error if it already does).
	mkdir -p "$CLAUDE_SETTINGS_DIR"

	# The settings payload, captured verbatim into DESIRED_SETTINGS.
	# The heredoc delimiter is quoted ('SETTINGS_EOF'), so nothing inside is
	# expanded: the JSON key "$schema" stays literal, and the port (8131) and
	# model id are hard-coded here rather than taken from $PORT / $MODEL.
	#   env          — endpoint, token, model ids and DISABLE_* switches
	#   permissions  — "allow" lists pre-approved tools and shell command
	#                  patterns; "deny" blocks reads of .env files and ./secrets
	DESIRED_SETTINGS=$(cat << 'SETTINGS_EOF'
	{
	"$schema": "https://json.schemastore.org/claude-code-settings.json",
	"env": {
	"ANTHROPIC_BASE_URL": "http://127.0.0.1:8131",
	"ANTHROPIC_AUTH_TOKEN": "local",
	"ANTHROPIC_MODEL": "unsloth/qwen3.5-35b-a3b",
	"ANTHROPIC_DEFAULT_OPUS_MODEL": "unsloth/qwen3.5-35b-a3b",
	"ANTHROPIC_DEFAULT_SONNET_MODEL": "unsloth/qwen3.5-35b-a3b",
	"ANTHROPIC_DEFAULT_HAIKU_MODEL": "unsloth/qwen3.5-35b-a3b",
	"CLAUDE_CODE_SUBAGENT_MODEL": "unsloth/qwen3.5-35b-a3b",
	"CLAUDE_CODE_MAX_OUTPUT_TOKENS": "128000",
	"DISABLE_PROMPT_CACHING": "1",
	"DISABLE_AUTOUPDATER": "1",
	"DISABLE_TELEMETRY": "1",
	"DISABLE_ERROR_REPORTING": "1",
	"DISABLE_NON_ESSENTIAL_MODEL_CALLS": "1"
	},
	"permissions": {
	"allow": [
	"Bash(git *)",
	"Bash(npm *)",
	"Bash(npx *)",
	"Bash(node *)",
	"Bash(python *)",
	"Bash(python3 *)",
	"Bash(pip *)",
	"Bash(pip3 *)",
	"Bash(brew *)",
	"Bash(ls *)",
	"Bash(cat *)",
	"Bash(head *)",
	"Bash(tail *)",
	"Bash(find *)",
	"Bash(grep *)",
	"Bash(rg *)",
	"Bash(mkdir *)",
	"Bash(cp *)",
	"Bash(mv *)",
	"Bash(rm *)",
	"Bash(echo *)",
	"Bash(curl *)",
	"Bash(which *)",
	"Bash(env *)",
	"Bash(cd *)",
	"Bash(pwd)",
	"Bash(wc *)",
	"Bash(sort *)",
	"Bash(uniq *)",
	"Bash(diff *)",
	"Bash(chmod *)",
	"Bash(touch *)",
	"Bash(sed *)",
	"Bash(awk *)",
	"Bash(xargs *)",
	"Bash(tee *)",
	"Read",
	"Edit",
	"Write",
	"Glob",
	"Grep",
	"WebFetch",
	"WebSearch"
	],
	"deny": [
	"Read(./.env)",
	"Read(./.env.*)",
	"Read(./secrets/**)"
	]
	}
	}
	SETTINGS_EOF
	)

	# Install DESIRED_SETTINGS without silently destroying user configuration:
	#   - no file, or an effectively empty one  -> write it
	#   - file already sets ANTHROPIC_BASE_URL  -> leave it untouched
	#   - anything else                         -> back it up, then REPLACE it
	#     (the old content is not merged into the new file)
	if [[ -f "$CLAUDE_SETTINGS" ]]; then
	    EXISTING=$(< "$CLAUDE_SETTINGS")
	    # Compare with all whitespace removed so "{ }" or a blank file with
	    # stray newlines also counts as empty.
	    EXISTING_COMPACT=${EXISTING//[[:space:]]/}
	    if [[ -z "$EXISTING_COMPACT" || "$EXISTING_COMPACT" == "{}" ]]; then
	        info "Empty settings.json found — writing local model configuration..."
	        printf '%s\n' "$DESIRED_SETTINGS" > "$CLAUDE_SETTINGS"
	        ok "Settings written to $CLAUDE_SETTINGS."
	    elif grep -q '"ANTHROPIC_BASE_URL"' <<< "$EXISTING"; then
	        skip "settings.json already contains ANTHROPIC_BASE_URL — not overwriting."
	        info "Review manually: ${BOLD}$CLAUDE_SETTINGS${RESET}"
	    else
	        # Timestamped name so repeated runs never overwrite an earlier backup.
	        SETTINGS_BACKUP="$CLAUDE_SETTINGS.backup.$(date +%s)"
	        warn "settings.json exists with custom content. Backing up and replacing it..."
	        cp "$CLAUDE_SETTINGS" "$SETTINGS_BACKUP"
	        printf '%s\n' "$DESIRED_SETTINGS" > "$CLAUDE_SETTINGS"
	        ok "Settings written. Previous version backed up to $SETTINGS_BACKUP."
	    fi
	else
	    info "Creating ~/.claude/settings.json..."
	    printf '%s\n' "$DESIRED_SETTINGS" > "$CLAUDE_SETTINGS"
	    ok "Settings written to $CLAUDE_SETTINGS."
	fi

	# ---------------------------------------------------------------------------
	# 6. Shell configuration (.zshrc)
	# ---------------------------------------------------------------------------
	header "6/6 — Shell configuration (.zshrc)"

	# MARKER is the start of the comment line the heredoc below appends, so a
	# later run finds it with a fixed-string grep and skips the block.
	MARKER="# Claude Code + Local Qwen 3.5"

	# grep's stderr is discarded, so a missing $ZSHRC just takes the else branch,
	# where `>>` creates the file. The heredoc delimiter is quoted ('EOF'):
	# "$LM_MODEL" is written literally and only expanded when .zshrc is sourced.
	if grep -qF "$MARKER" "$ZSHRC" 2>/dev/null; then
	skip "Environment variables already in $ZSHRC."
	else
	info "Adding environment variables to $ZSHRC..."
	cat << 'EOF' >> "$ZSHRC"

	# Claude Code + Local Qwen 3.5 — because localhost > cloud
	export LM_MODEL="unsloth/qwen3.5-35b-a3b"
	export ANTHROPIC_BASE_URL="http://127.0.0.1:8131"
	export ANTHROPIC_AUTH_TOKEN="local"
	export CLAUDE_CODE_MAX_OUTPUT_TOKENS=128000
	export ANTHROPIC_MODEL="$LM_MODEL"
	export ANTHROPIC_DEFAULT_OPUS_MODEL="$LM_MODEL"
	export ANTHROPIC_DEFAULT_SONNET_MODEL="$LM_MODEL"
	export ANTHROPIC_DEFAULT_HAIKU_MODEL="$LM_MODEL"
	export CLAUDE_CODE_SUBAGENT_MODEL="$LM_MODEL"
	EOF
	ok "Environment variables added to $ZSHRC."
	fi

	# The function's own definition line doubles as its "already installed" marker.
	CCLOCAL_MARKER="cclocal()"

	# Appends the cclocal helper once. As written in the heredoc, the function
	# takes an optional all-digit first argument as the port (default 8131),
	# shifts it away, and runs `claude` with the remaining arguments and
	# ANTHROPIC_BASE_URL / ANTHROPIC_AUTH_TOKEN set for that one invocation.
	if grep -qF "$CCLOCAL_MARKER" "$ZSHRC" 2>/dev/null; then
	skip "cclocal() helper already in $ZSHRC."
	else
	info "Adding cclocal() helper to $ZSHRC..."
	cat << 'FUNC' >> "$ZSHRC"

	# Quick launcher for Claude Code against any local llama-server port
	cclocal() {
	local port=8131
	if [[ "$1" =~ ^[0-9]+$ ]]; then
	port="$1"
	shift
	fi
	ANTHROPIC_BASE_URL="http://127.0.0.1:${port}" \
	ANTHROPIC_AUTH_TOKEN="local" \
	claude "$@"
	}
	FUNC
	ok "cclocal() helper added."
	fi

	# ---------------------------------------------------------------------------
	# Done
	# ---------------------------------------------------------------------------
	# Closing summary: what was touched and what to do next. printf's %b expands
	# the color escapes; every argument is printed on its own line.
	printf '\n'
	header "Installation complete."
	printf '\n'
	printf '%b\n' \
	    "What was configured:" \
	    " ${GREEN}~/.claude/settings.json${RESET} — env vars, permissions, telemetry settings" \
	    " ${GREEN}~/.zshrc${RESET} — env vars (fallback) + cclocal() helper" \
	    "" \
	    "Next steps:" \
	    " 1. Reload your shell: ${BOLD}source ~/.zshrc${RESET}" \
	    " 2. Start the server: ${BOLD}./run.sh${RESET}" \
	    " 3. In another terminal: ${BOLD}claude${RESET} (or ${BOLD}cclocal${RESET})" \
	    "" \
	    "First run downloads ~20 GB of model weights. Go make coffee." \
	    ""