Skip to content

Instantly share code, notes, and snippets.

@c4software
Last active June 11, 2026 19:32
Show Gist options
  • Select an option

  • Save c4software/e527955657085f401d92c6f1678c0256 to your computer and use it in GitHub Desktop.

Select an option

Save c4software/e527955657085f401d92c6f1678c0256 to your computer and use it in GitHub Desktop.
Setup-LLM - Simple script to set up an LLM server on a Ryzen Strix Halo with 128 GB
{
"$schema": "https://opencode.ai/config.json",
"model": "llamaswap/balanced",
"small_model": "llamaswap/flash",
"provider": {
"llamaswap": {
"name": "llama-swap (Strix Halo)",
"npm": "@ai-sdk/openai-compatible",
"options": {
"baseURL": "http://100.64.0.4:8009/v1"
},
"models": {
"balanced": {
"name": "balanced"
},
"balanced-nothink": {
"name": "balanced-nothink"
},
"flash": {
"name": "flash"
},
"gemma4": {
"name": "gemma4"
},
"pro": {
"name": "pro"
},
"pro-nothink": {
"name": "pro-nothink"
}
}
}
},
"agent": {
"build": {
"model": "llamaswap/balanced",
"permission": {
"edit": "allow",
"bash": {
"*": "ask"
}
}
},
"plan": {
"model": "llamaswap/balanced-nothink",
"description": "Analyse et planning rapide — design, UI, refactoring simple",
"permission": {
"edit": "deny",
"bash": {
"*": "deny"
}
}
},
"plan-pro": {
"model": "llamaswap/pro-nothink",
"description": "Analyse approfondie — architecture, sécurité, refactoring complexe",
"mode": "primary",
"permission": {
"edit": "deny",
"bash": {
"*": "deny"
}
}
},
"explore": {
"model": "llamaswap/balanced-nothink",
"permission": {
"edit": "deny",
"bash": {
"*": "deny"
}
}
},
"general": {
"model": "llamaswap/balanced-nothink"
},
"title": {
"model": "llamaswap/flash"
},
"summary": {
"model": "llamaswap/flash"
},
"compaction": {
"model": "llamaswap/balanced-nothink"
}
}
}
#!/usr/bin/env bash
set -euo pipefail
# =============================================================================
# setup-llm.sh — llama-swap
# =============================================================================
# =============================================================================
# MODÈLES (repo + fichier)
# =============================================================================
# Model catalogue: one <NAME>_REPO / <NAME>_FILE pair per model. cmd_setup
# hands each pair to the download helpers (_dl / _dl_shard), which pass them
# to `hf download` as the repository and the file (or include glob) to fetch.
MODEL_MINI_REPO="unsloth/Qwen3.5-2B-GGUF"
MODEL_MINI_FILE="Qwen3.5-2B-UD-Q4_K_XL.gguf"
MODEL_FLASH_REPO="unsloth/Qwen3.5-9B-GGUF"
MODEL_FLASH_FILE="Qwen3.5-9B-UD-Q6_K_XL.gguf"
MODEL_BALANCED_REPO="unsloth/Qwen3.6-35B-A3B-GGUF"
MODEL_BALANCED_FILE="Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf"
# Qwen3-Coder-Next: 80B MoE, natively non-reasoning, optimized for agentic coding
MODEL_PRO_REPO="unsloth/Qwen3-Coder-Next-GGUF"
MODEL_PRO_FILE="Qwen3-Coder-Next-UD-Q4_K_XL.gguf"
# Qwen3.6-27B: dense 27B hybrid-thinking model, general reasoning + coding
# UD-Q6_K_XL: 25.6 GB — single file
MODEL_THINKER_REPO="unsloth/Qwen3.6-27B-GGUF"
MODEL_THINKER_FILE="Qwen3.6-27B-UD-Q6_K_XL.gguf"
MODEL_GEMMA_REPO="unsloth/gemma-4-31B-it-GGUF"
MODEL_GEMMA_FILE="gemma-4-31B-it-Q4_K_M.gguf"
# gpt-oss is the only sharded model here: the glob is what gets downloaded
# (via --include), and the entry is the first shard (00001-of-00002), which is
# the path later given to --model.
MODEL_GPTOSS_REPO="unsloth/gpt-oss-120b-GGUF"
MODEL_GPTOSS_FILE_GLOB="UD-Q4_K_XL/*"
MODEL_GPTOSS_FILE_ENTRY="UD-Q4_K_XL/gpt-oss-120b-UD-Q4_K_XL-00001-of-00002.gguf"
# Qwen3.6-27B-MTP: MTP variant of the thinker (same weights, built-in draft)
# UD-Q4_K_XL: 17.9 GB — single file
MODEL_THINKER_MTP_REPO="unsloth/Qwen3.6-27B-MTP-GGUF"
MODEL_THINKER_MTP_FILE="Qwen3.6-27B-UD-Q4_K_XL.gguf"
# Qwen3.6-35B-A3B-MTP: MTP variant of balanced (same weights, built-in draft)
MODEL_BALANCED_MTP_REPO="unsloth/Qwen3.6-35B-A3B-MTP-GGUF"
MODEL_BALANCED_MTP_FILE="Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf"
# =============================================================================
# CHEMINS
# =============================================================================
# Root directory holding one sub-directory per downloaded model.
MODELS_BASE="$HOME/models"
# Parent of the per-model directories passed to llama-server --slot-save-path.
CACHE_DIR="$HOME/.cache/llama-slots"
# Directory where llama-swap-config.yaml is written and read; currently the
# same location as MODELS_BASE.
CONFIG_DIR="$HOME/models"
# Absolute path of each model file; these are the values given to --model.
MINI_PATH="$MODELS_BASE/mini/$MODEL_MINI_FILE"
FLASH_PATH="$MODELS_BASE/flash/$MODEL_FLASH_FILE"
BALANCED_PATH="$MODELS_BASE/balanced/$MODEL_BALANCED_FILE"
PRO_PATH="$MODELS_BASE/pro/$MODEL_PRO_FILE"
THINKER_PATH="$MODELS_BASE/thinker/$MODEL_THINKER_FILE"
GEMMA_PATH="$MODELS_BASE/gemma/$MODEL_GEMMA_FILE"
# Points at the first shard; the entry already contains the UD-Q4_K_XL/ sub-directory.
GPTOSS_PATH="$MODELS_BASE/gpt-oss/$MODEL_GPTOSS_FILE_ENTRY"
THINKER_MTP_PATH="$MODELS_BASE/thinker-mtp/$MODEL_THINKER_MTP_FILE"
BALANCED_MTP_PATH="$MODELS_BASE/balanced-mtp/$MODEL_BALANCED_MTP_FILE"
# =============================================================================
# FLAGS PAR MODÈLE
# =============================================================================
# Per-model llama-server flags, keyed by the llama-swap model name.
# Each value is one string (the backslash-newlines only continue the line);
# build_llama_server_cmd combines it with the flags shared by all models.
# The --port of every entry must match the proxy URL declared for the same key
# in the llama-swap config generated by cmd_setup.
# Several keys can point at the same model file with different flags
# (e.g. balanced / balanced-nothink, thinker / thinker-nothink).
declare -A MODEL_FLAGS
MODEL_FLAGS[mini]="--model $MINI_PATH \
--port 8081 \
--ctx-size 32768 \
--cache-ram 2048 \
--temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \
-np 4"
MODEL_FLAGS[flash]="--model $FLASH_PATH \
--port 8089 \
--ctx-size 32768 \
--cache-ram 2048 \
--temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \
-np 4"
MODEL_FLAGS[balanced]="--model $BALANCED_PATH \
--port 8082 \
--ctx-size 393216 \
--cache-ram 6144 \
--temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \
-np 3"
# NOTE(review): this entry and thinker-nothink use --reasoning off on Qwen3.6
# models, while the MTP entries below state that Qwen3.6 needs
# --chat-template-kwargs instead — confirm which one actually disables thinking.
MODEL_FLAGS[balanced-nothink]="--model $BALANCED_PATH \
--port 8084 \
--ctx-size 524288 \
--cache-ram 12288 \
--reasoning off \
--temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \
-np 2"
# Qwen3-Coder-Next — hybrid-attention MoE, cache-reuse disabled (incompatible)
MODEL_FLAGS[pro]="--model $PRO_PATH \
--port 8083 \
--ctx-size 131072 \
--cache-ram 4096 \
--temp 1.0 --top-k 40 --top-p 0.95 --min-p 0.01 \
--cache-reuse 0 \
-np 1"
# Qwen3.6-27B thinking — reasoning enabled, jinja tool-calling, presence_penalty recommended
MODEL_FLAGS[thinker]="--model $THINKER_PATH \
--port 8090 \
--ctx-size 131072 \
--cache-ram 4096 \
--temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \
--presence-penalty 1.5 \
--jinja \
-np 1"
# Qwen3.6-27B nothink — direct answers without chain-of-thought
MODEL_FLAGS[thinker-nothink]="--model $THINKER_PATH \
--port 8091 \
--ctx-size 131072 \
--cache-ram 4096 \
--reasoning off \
--temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \
--presence-penalty 1.5 \
-np 1"
MODEL_FLAGS[gemma4]="--model $GEMMA_PATH \
--port 8086 \
--ctx-size 512000 \
--cache-ram 4096 \
--reasoning off \
--temp 1.0 --top-k 64 --top-p 0.95 --min-p 0.01 \
-np 2"
MODEL_FLAGS[gpt-oss]="--model $GPTOSS_PATH \
--port 8087 \
--ctx-size 131072 \
--cache-ram 8192 \
--temp 1.0 --top-k 0 --top-p 1.0 --min-p 0.0 \
-np 1"
# Qwen3.6-27B-MTP nothink — MTP speculation, cache-reuse disabled (incompatible with MTP)
# Uses --chat-template-kwargs (not --reasoning off) to disable thinking on Qwen3.6
# The \" inside the double-quoted string yields a plain " so the stored value
# contains '{"enable_thinking":false}' wrapped in single quotes.
MODEL_FLAGS[thinker-mtp-nothink]="--model $THINKER_MTP_PATH \
--port 8092 \
--ctx-size 131072 \
--cache-ram 4096 \
--temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \
--presence-penalty 1.5 \
--chat-template-kwargs '{\"enable_thinking\":false}' \
--cache-reuse 0 \
--spec-type draft-mtp --spec-draft-n-max 2 \
--jinja \
-np 1"
# Qwen3.6-35B-A3B-MTP nothink — MTP speculation, cache-reuse disabled (incompatible with MTP)
MODEL_FLAGS[balanced-mtp-nothink]="--model $BALANCED_MTP_PATH \
--port 8093 \
--ctx-size 131072 \
--cache-ram 6144 \
--temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \
--presence-penalty 1.5 \
--chat-template-kwargs '{\"enable_thinking\":false}' \
--cache-reuse 0 \
--spec-type draft-mtp --spec-draft-n-max 2 \
--jinja \
-np 1"
# =============================================================================
# Construction de la commande
# =============================================================================
#######################################
# Build the complete llama-server command line for one model.
# Globals:   MODEL_FLAGS (read) - per-model flag strings
#            CACHE_DIR   (read) - parent of the per-model slot-save directories
# Arguments: $1 - model key, must exist in MODEL_FLAGS
# Outputs:   the command line as a single line on stdout
# Returns:   does not return when the key is unknown: error() exits
#######################################
build_llama_server_cmd() {
  local name="$1"
  local flags="${MODEL_FLAGS[$name]:-}"
  if [[ -z "$flags" ]]; then
    error "Aucun flag défini pour le modèle '$name'"
  fi

  # Defaults shared by every model.
  local -a common=(
    --n-gpu-layers 99
    --cache-reuse 4096
    --cache-type-k q4_0
    --cache-type-v q4_0
    --flash-attn on
    --prio 2
    --metrics
    --slot-prompt-similarity 0.5
    --presence-penalty 0.0
  )

  # Order matters: this script relies on llama-server keeping the LAST
  # occurrence of a duplicated flag. The shared defaults therefore go first and
  # the model-specific flags after them, so that a per-model "--cache-reuse 0"
  # (pro, *-mtp-nothink) or "--presence-penalty 1.5" (thinker*) actually wins.
  # Appending the defaults after the model flags silently cancelled those
  # overrides.
  printf '%s\n' "llama-server ${common[*]} $flags --slot-save-path $CACHE_DIR/$name"
}
# =============================================================================
# Helpers
# =============================================================================
# ANSI color escapes, stored as literal backslash sequences; they are expanded
# by the %b conversions in the helpers below.
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'

# Logging helpers. The message is printed through %s, so backslashes in it
# (paths, JSON snippets) are emitted verbatim instead of being interpreted.

# info: progress message on stdout.
info() { printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$*"; }

# warn: non-fatal diagnostic on stderr.
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2; }

# error: fatal diagnostic on stderr, then exit 1. Writing to stderr matters
# because error() can be reached from inside a command substitution: on stdout
# the message would be captured into the caller's data (e.g. a generated config
# file) instead of being shown to the user.
error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2; exit 1; }
#######################################
# Download a single model file unless it is already on disk.
# Arguments: $1   - full path where the file is expected
#            $2   - repository handed to `hf download`
#            $3.. - remaining arguments forwarded to `hf download`
#                   (the file name(s) to fetch)
# Outputs:   progress messages through info()
# Returns:   status of `hf download`, or 0 when the file already exists
#######################################
_dl() {
  local dest_file="$1"
  local hf_repo="$2"
  shift 2

  if [[ ! -f "$dest_file" ]]; then
    info "Téléchargement $(basename "$dest_file")..."
    # Files land in the directory that is expected to contain the target.
    HF_XET_HIGH_PERFORMANCE=1 hf download "$hf_repo" "$@" --local-dir "$(dirname "$dest_file")"
    return
  fi

  info "$(basename "$dest_file") déjà présent, skip."
}
#######################################
# Download a sharded model (several files in one sub-directory) unless its
# entry shard is already on disk.
# Arguments: $1 - full path of the entry shard, <model dir>/<shard dir>/<file>
#            $2 - repository handed to `hf download`
#            $3 - --include pattern selecting the shard files
# Outputs:   progress messages through info()
# Returns:   status of `hf download`, or 0 when the entry shard already exists
#######################################
_dl_shard() {
  local entry_file="$1"
  local hf_repo="$2"
  local include_glob="$3"

  # The download goes to the directory two levels above the entry shard,
  # i.e. the parent of the directory that holds the shards.
  local quant_dir model_dir
  quant_dir="$(dirname "$entry_file")"
  model_dir="$(dirname "$quant_dir")"

  if [[ ! -f "$entry_file" ]]; then
    info "Téléchargement shards $(basename "$quant_dir")..."
    HF_XET_HIGH_PERFORMANCE=1 hf download "$hf_repo" --include "$include_glob" --local-dir "$model_dir"
    return
  fi

  info "$(basename "$entry_file") déjà présent, skip."
}
# =============================================================================
# setup
# =============================================================================
#######################################
# --setup: install missing packages, create the model and slot-cache
# directories, download every model, and generate the llama-swap config.
# Globals:   MODELS_BASE, CACHE_DIR, CONFIG_DIR, *_PATH, MODEL_*_REPO,
#            MODEL_*_FILE (read); PACMAN_PKGS, MISSING, pkg (written — they are
#            not declared local, so they remain set after the function returns)
# Arguments: none
# Outputs:   progress through info(); writes $CONFIG_DIR/llama-swap-config.yaml
#######################################
cmd_setup() {
info "Vérification des dépendances..."
# Each package is queried with `paru -Qi`; only those that fail the query are
# collected and installed in a single `paru -S` call.
PACMAN_PKGS=(curl llama.cpp-vulkan llama-swap-bin python-huggingface-hub python-hf-xet)
MISSING=()
for pkg in "${PACMAN_PKGS[@]}"; do
paru -Qi "$pkg" &>/dev/null || MISSING+=("$pkg")
done
[[ ${#MISSING[@]} -gt 0 ]] && paru -S --noconfirm "${MISSING[@]}"
# The download helpers below call the `hf` CLI, so stop early if it is absent.
command -v hf >/dev/null || error "hf introuvable"
info "Création des dossiers..."
# One directory per model file under MODELS_BASE, and one slot-save directory
# per llama-swap model key under CACHE_DIR (the two lists differ: several keys
# share one model file).
mkdir -p "$MODELS_BASE"/{mini,flash,balanced,pro,thinker,gemma,gpt-oss,thinker-mtp,balanced-mtp}
mkdir -p "$CACHE_DIR"/{mini,flash,balanced,balanced-nothink,pro,thinker,thinker-nothink,gemma4,gpt-oss,thinker-mtp-nothink,balanced-mtp-nothink}
# Downloads are skipped by the helpers when the target file already exists.
_dl "$MINI_PATH" "$MODEL_MINI_REPO" "$MODEL_MINI_FILE"
_dl "$FLASH_PATH" "$MODEL_FLASH_REPO" "$MODEL_FLASH_FILE"
_dl "$BALANCED_PATH" "$MODEL_BALANCED_REPO" "$MODEL_BALANCED_FILE"
_dl "$PRO_PATH" "$MODEL_PRO_REPO" "$MODEL_PRO_FILE"
_dl "$THINKER_PATH" "$MODEL_THINKER_REPO" "$MODEL_THINKER_FILE"
_dl "$GEMMA_PATH" "$MODEL_GEMMA_REPO" "$MODEL_GEMMA_FILE"
# gpt-oss is sharded, hence the glob-based helper.
_dl_shard "$GPTOSS_PATH" "$MODEL_GPTOSS_REPO" "$MODEL_GPTOSS_FILE_GLOB"
_dl "$THINKER_MTP_PATH" "$MODEL_THINKER_MTP_REPO" "$MODEL_THINKER_MTP_FILE"
_dl "$BALANCED_MTP_PATH" "$MODEL_BALANCED_MTP_REPO" "$MODEL_BALANCED_MTP_FILE"
info "Génération de llama-swap-config.yaml..."
# The heredoc delimiter is unquoted, so every $(build_llama_server_cmd <key>)
# below is expanded while the file is written; the rest of the body is emitted
# verbatim, which makes its whitespace part of the generated YAML.
# Each proxy port must match the --port in MODEL_FLAGS for the same key.
# NOTE(review): the exit status of the heredoc is that of `cat`, so a failing
# substitution does not abort the script — check the generated file.
cat > "$CONFIG_DIR/llama-swap-config.yaml" << YAML
healthCheckTimeout: 120
models:
"mini":
proxy: http://127.0.0.1:8081
cmd: >
$(build_llama_server_cmd mini)
"flash":
proxy: http://127.0.0.1:8089
cmd: >
$(build_llama_server_cmd flash)
"balanced":
proxy: http://127.0.0.1:8082
ttl: 300
cmd: >
$(build_llama_server_cmd balanced)
"balanced-nothink":
proxy: http://127.0.0.1:8084
cmd: >
$(build_llama_server_cmd balanced-nothink)
"pro":
proxy: http://127.0.0.1:8083
ttl: 1800
cmd: >
$(build_llama_server_cmd pro)
"thinker":
proxy: http://127.0.0.1:8090
ttl: 600
cmd: >
$(build_llama_server_cmd thinker)
"thinker-nothink":
proxy: http://127.0.0.1:8091
ttl: 600
cmd: >
$(build_llama_server_cmd thinker-nothink)
"gemma4":
proxy: http://127.0.0.1:8086
ttl: 600
cmd: >
$(build_llama_server_cmd gemma4)
"gpt-oss":
proxy: http://127.0.0.1:8087
ttl: 600
cmd: >
$(build_llama_server_cmd gpt-oss)
"thinker-mtp-nothink":
proxy: http://127.0.0.1:8092
ttl: 600
cmd: >
$(build_llama_server_cmd thinker-mtp-nothink)
"balanced-mtp-nothink":
proxy: http://127.0.0.1:8093
ttl: 600
cmd: >
$(build_llama_server_cmd balanced-mtp-nothink)
groups:
"always-on":
exclusive: false
persistent: true
members:
- flash
- balanced-nothink
"heavy":
exclusive: true
members:
- balanced
- pro
- balanced-mtp-nothink
"heavy-thinker":
exclusive: true
members:
- thinker
- thinker-nothink
- thinker-mtp-nothink
"heavy-gemma":
exclusive: true
members:
- gemma4
"heavy-gpt-oss":
exclusive: true
members:
- gpt-oss
YAML
info "✅ Config générée : $CONFIG_DIR/llama-swap-config.yaml"
info "Setup terminé → ./setup-llm.sh --start"
}
# =============================================================================
# start
# =============================================================================
#######################################
# --start: run llama-swap in the foreground with the generated config.
# Globals:   CONFIG_DIR (read), PATH (extended with $HOME/.local/bin)
# Arguments: none
# Outputs:   progress through info(); llama-swap's own output
# Returns:   status of llama-swap; exits through error() when a binary or the
#            config file is missing
#######################################
cmd_start() {
  export PATH="$HOME/.local/bin:$PATH"

  # Both the proxy and the server it spawns must be resolvable.
  local required_bin
  for required_bin in llama-swap llama-server; do
    command -v "$required_bin" >/dev/null || error "$required_bin introuvable"
  done

  local swap_config="$CONFIG_DIR/llama-swap-config.yaml"
  if [[ ! -f "$swap_config" ]]; then
    error "Config introuvable"
  fi

  info "Lancement de llama-swap sur :8009..."
  # Listens on all interfaces, port 8009.
  llama-swap --config "$swap_config" --listen 0.0.0.0:8009
}
# =============================================================================
# Service systemd système
# =============================================================================
# Name of the systemd unit managed by --install-service / --uninstall-service.
SERVICE_NAME="llama-swap"
# System-wide unit file path; writing or removing it is done through sudo.
SERVICE_FILE="/etc/systemd/system/${SERVICE_NAME}.service"
#######################################
# --install-service: write a system-wide systemd unit that runs this script
# with --start, then reload systemd and enable the unit.
# Globals:   SERVICE_NAME, SERVICE_FILE, USER, HOME (read)
# Arguments: none
# Outputs:   progress and usage hints through info(); writes SERVICE_FILE
#######################################
cmd_install_service() {
  info "Génération du service systemd système..."

  # Absolute path of the running script, used as the unit's ExecStart.
  local launcher
  launcher="$(realpath "$0")"

  # Unquoted delimiter: user, group, paths and HOME are resolved now, for the
  # account that runs the installation. tee runs under sudo because the target
  # lives in /etc.
  sudo tee "$SERVICE_FILE" > /dev/null << UNIT
[Unit]
Description=llama-swap — LLM model proxy router
After=network.target
[Service]
Type=simple
User=$USER
Group=$(id -gn)
ExecStart=${launcher} --start
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
Environment="PATH=$HOME/.local/bin:/usr/local/bin:/usr/bin:/bin"
Environment="HOME=$HOME"
[Install]
WantedBy=multi-user.target
UNIT

  sudo systemctl daemon-reload
  sudo systemctl enable "$SERVICE_NAME"

  info "✅ Service installé : $SERVICE_FILE"
  info " Démarrage automatique au boot activé (multi-user.target)"
  info "Commandes utiles :"
  local verb
  for verb in start stop restart status; do
    info " sudo systemctl $verb $SERVICE_NAME"
  done
  info " journalctl -u $SERVICE_NAME -f"
}
#######################################
# --uninstall-service: stop and disable the systemd unit and delete its file.
# Globals:   SERVICE_NAME, SERVICE_FILE (read)
# Arguments: none
# Outputs:   result through info(), or a warning when nothing is installed
# Returns:   0 when nothing is installed or the removal succeeded
#######################################
cmd_uninstall_service() {
  # The service counts as installed when its unit file exists OR systemd
  # reports it enabled. Checking `is-enabled` alone misses a unit that was
  # disabled by hand: it would be reported as "not installed" and its file
  # would be left behind.
  if [[ ! -e "$SERVICE_FILE" ]] && ! systemctl is-enabled "$SERVICE_NAME" &>/dev/null; then
    warn "Service '$SERVICE_NAME' non installé, rien à faire."
    return
  fi

  # Best effort on purpose: the service may simply not be running.
  sudo systemctl stop "$SERVICE_NAME" 2>/dev/null || true
  sudo systemctl disable "$SERVICE_NAME"
  sudo rm -f "$SERVICE_FILE"
  sudo systemctl daemon-reload
  info "✅ Service '$SERVICE_NAME' désinstallé."
}
# =============================================================================
# Entrypoint
# =============================================================================
# Dispatch on the first command-line argument. Running the script without any
# argument behaves like --start; anything unrecognized prints the usage line
# through error(), which exits.
if [[ "${1:-}" == "--setup" ]]; then
  cmd_setup
elif [[ -z "${1:-}" || "${1:-}" == "--start" ]]; then
  cmd_start
elif [[ "${1:-}" == "--install-service" ]]; then
  cmd_install_service
elif [[ "${1:-}" == "--uninstall-service" ]]; then
  cmd_uninstall_service
else
  error "Usage : $0 [--setup|--start|--install-service|--uninstall-service]"
fi
ANTHROPIC_BASE_URL=http://<IP>:8009/ ANTHROPIC_AUTH_TOKEN=unused ANTHROPIC_DEFAULT_OPUS_MODEL=pro ANTHROPIC_DEFAULT_SONNET_MODEL=balanced ANTHROPIC_DEFAULT_HAIKU_MODEL=flash API_TIMEOUT_MS=3000000 CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 claude
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment