Last active
June 11, 2026 19:32
-
-
Save c4software/e527955657085f401d92c6f1678c0256 to your computer and use it in GitHub Desktop.
Setup-LLM - Simple script to set up an LLM server on a Ryzen Strix Halo (128 GB)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "$schema": "https://opencode.ai/config.json", | |
| "model": "llamaswap/balanced", | |
| "small_model": "llamaswap/flash", | |
| "provider": { | |
| "llamaswap": { | |
| "name": "llama-swap (Strix Halo)", | |
| "npm": "@ai-sdk/openai-compatible", | |
| "options": { | |
| "baseURL": "http://100.64.0.4:8009/v1" | |
| }, | |
| "models": { | |
| "balanced": { | |
| "name": "balanced" | |
| }, | |
| "balanced-nothink": { | |
| "name": "balanced-nothink" | |
| }, | |
| "flash": { | |
| "name": "flash" | |
| }, | |
| "gemma4": { | |
| "name": "gemma4" | |
| }, | |
| "pro": { | |
| "name": "pro" | |
| }, | |
| "pro-nothink": { | |
| "name": "pro-nothink" | |
| } | |
| } | |
| } | |
| }, | |
| "agent": { | |
| "build": { | |
| "model": "llamaswap/balanced", | |
| "permission": { | |
| "edit": "allow", | |
| "bash": { | |
| "*": "ask" | |
| } | |
| } | |
| }, | |
| "plan": { | |
| "model": "llamaswap/balanced-nothink", | |
| "description": "Analyse et planning rapide — design, UI, refactoring simple", | |
| "permission": { | |
| "edit": "deny", | |
| "bash": { | |
| "*": "deny" | |
| } | |
| } | |
| }, | |
| "plan-pro": { | |
| "model": "llamaswap/pro-nothink", | |
| "description": "Analyse approfondie — architecture, sécurité, refactoring complexe", | |
| "mode": "primary", | |
| "permission": { | |
| "edit": "deny", | |
| "bash": { | |
| "*": "deny" | |
| } | |
| } | |
| }, | |
| "explore": { | |
| "model": "llamaswap/balanced-nothink", | |
| "permission": { | |
| "edit": "deny", | |
| "bash": { | |
| "*": "deny" | |
| } | |
| } | |
| }, | |
| "general": { | |
| "model": "llamaswap/balanced-nothink" | |
| }, | |
| "title": { | |
| "model": "llamaswap/flash" | |
| }, | |
| "summary": { | |
| "model": "llamaswap/flash" | |
| }, | |
| "compaction": { | |
| "model": "llamaswap/balanced-nothink" | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env bash | |
| set -euo pipefail | |
| # ============================================================================= | |
| # setup-llm.sh — llama-swap | |
| # Usage: setup-llm.sh [--setup|--start|--install-service|--uninstall-service] | |
| # ============================================================================= | |
| # ============================================================================= | |
| # MODELS (repo + file) | |
| # Each model is a pair: the repo handed to `hf download` and the GGUF file name | |
| # expected inside it. | |
| # ============================================================================= | |
| MODEL_MINI_REPO="unsloth/Qwen3.5-2B-GGUF" | |
| MODEL_MINI_FILE="Qwen3.5-2B-UD-Q4_K_XL.gguf" | |
| MODEL_FLASH_REPO="unsloth/Qwen3.5-9B-GGUF" | |
| MODEL_FLASH_FILE="Qwen3.5-9B-UD-Q6_K_XL.gguf" | |
| MODEL_BALANCED_REPO="unsloth/Qwen3.6-35B-A3B-GGUF" | |
| MODEL_BALANCED_FILE="Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf" | |
| # Qwen3-Coder-Next: 80B MoE, natively non-reasoning, optimized for agentic coding | |
| MODEL_PRO_REPO="unsloth/Qwen3-Coder-Next-GGUF" | |
| MODEL_PRO_FILE="Qwen3-Coder-Next-UD-Q4_K_XL.gguf" | |
| # Qwen3.6-27B: dense 27B hybrid-thinking model, general reasoning + coding | |
| # UD-Q6_K_XL: 25.6 GB — single file | |
| MODEL_THINKER_REPO="unsloth/Qwen3.6-27B-GGUF" | |
| MODEL_THINKER_FILE="Qwen3.6-27B-UD-Q6_K_XL.gguf" | |
| MODEL_GEMMA_REPO="unsloth/gemma-4-31B-it-GGUF" | |
| MODEL_GEMMA_FILE="gemma-4-31B-it-Q4_K_M.gguf" | |
| # gpt-oss is downloaded as a set of shards: FILE_GLOB is the --include pattern | |
| # given to `hf download`, FILE_ENTRY is the shard path used as the --model value. | |
| MODEL_GPTOSS_REPO="unsloth/gpt-oss-120b-GGUF" | |
| MODEL_GPTOSS_FILE_GLOB="UD-Q4_K_XL/*" | |
| MODEL_GPTOSS_FILE_ENTRY="UD-Q4_K_XL/gpt-oss-120b-UD-Q4_K_XL-00001-of-00002.gguf" | |
| # Qwen3.6-27B-MTP: MTP variant of the thinker (same weights, built-in draft) | |
| # UD-Q4_K_XL: 17.9 GB — single file | |
| MODEL_THINKER_MTP_REPO="unsloth/Qwen3.6-27B-MTP-GGUF" | |
| MODEL_THINKER_MTP_FILE="Qwen3.6-27B-UD-Q4_K_XL.gguf" | |
| # Qwen3.6-35B-A3B-MTP: MTP variant of balanced (same weights, built-in draft) | |
| MODEL_BALANCED_MTP_REPO="unsloth/Qwen3.6-35B-A3B-MTP-GGUF" | |
| MODEL_BALANCED_MTP_FILE="Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf" | |
| # ============================================================================= | |
| # PATHS | |
| # MODELS_BASE holds one sub-directory per model; CACHE_DIR holds one | |
| # sub-directory per llama-swap entry (used as --slot-save-path); CONFIG_DIR is | |
| # where llama-swap-config.yaml is written and currently equals MODELS_BASE. | |
| # ============================================================================= | |
| MODELS_BASE="$HOME/models" | |
| CACHE_DIR="$HOME/.cache/llama-slots" | |
| CONFIG_DIR="$HOME/models" | |
| MINI_PATH="$MODELS_BASE/mini/$MODEL_MINI_FILE" | |
| FLASH_PATH="$MODELS_BASE/flash/$MODEL_FLASH_FILE" | |
| BALANCED_PATH="$MODELS_BASE/balanced/$MODEL_BALANCED_FILE" | |
| PRO_PATH="$MODELS_BASE/pro/$MODEL_PRO_FILE" | |
| THINKER_PATH="$MODELS_BASE/thinker/$MODEL_THINKER_FILE" | |
| GEMMA_PATH="$MODELS_BASE/gemma/$MODEL_GEMMA_FILE" | |
| # Points at the first shard inside the UD-Q4_K_XL/ sub-directory. | |
| GPTOSS_PATH="$MODELS_BASE/gpt-oss/$MODEL_GPTOSS_FILE_ENTRY" | |
| THINKER_MTP_PATH="$MODELS_BASE/thinker-mtp/$MODEL_THINKER_MTP_FILE" | |
| BALANCED_MTP_PATH="$MODELS_BASE/balanced-mtp/$MODEL_BALANCED_MTP_FILE" | |
| # ============================================================================= | |
| # PER-MODEL FLAGS | |
| # MODEL_FLAGS maps a llama-swap model name to the model-specific part of its | |
| # llama-server command line, stored as a single string. The --port of each | |
| # entry has to match the proxy URL written for the same name in cmd_setup. | |
| # ============================================================================= | |
| declare -A MODEL_FLAGS | |
| MODEL_FLAGS[mini]="--model $MINI_PATH \ | |
| --port 8081 \ | |
| --ctx-size 32768 \ | |
| --cache-ram 2048 \ | |
| --temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \ | |
| -np 4" | |
| MODEL_FLAGS[flash]="--model $FLASH_PATH \ | |
| --port 8089 \ | |
| --ctx-size 32768 \ | |
| --cache-ram 2048 \ | |
| --temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \ | |
| -np 4" | |
| MODEL_FLAGS[balanced]="--model $BALANCED_PATH \ | |
| --port 8082 \ | |
| --ctx-size 393216 \ | |
| --cache-ram 6144 \ | |
| --temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \ | |
| -np 3" | |
| # Same weights as "balanced", served on its own port with reasoning switched off. | |
| MODEL_FLAGS[balanced-nothink]="--model $BALANCED_PATH \ | |
| --port 8084 \ | |
| --ctx-size 524288 \ | |
| --cache-ram 12288 \ | |
| --reasoning off \ | |
| --temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \ | |
| -np 2" | |
| # Qwen3-Coder-Next — hybrid-attention MoE, cache-reuse disabled (incompatible) | |
| MODEL_FLAGS[pro]="--model $PRO_PATH \ | |
| --port 8083 \ | |
| --ctx-size 131072 \ | |
| --cache-ram 4096 \ | |
| --temp 1.0 --top-k 40 --top-p 0.95 --min-p 0.01 \ | |
| --cache-reuse 0 \ | |
| -np 1" | |
| # Qwen3.6-27B thinking — reasoning enabled, jinja tool-calling, presence_penalty recommended | |
| MODEL_FLAGS[thinker]="--model $THINKER_PATH \ | |
| --port 8090 \ | |
| --ctx-size 131072 \ | |
| --cache-ram 4096 \ | |
| --temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \ | |
| --presence-penalty 1.5 \ | |
| --jinja \ | |
| -np 1" | |
| # Qwen3.6-27B nothink — direct answers without chain-of-thought | |
| MODEL_FLAGS[thinker-nothink]="--model $THINKER_PATH \ | |
| --port 8091 \ | |
| --ctx-size 131072 \ | |
| --cache-ram 4096 \ | |
| --reasoning off \ | |
| --temp 0.6 --top-k 20 --top-p 0.95 --min-p 0.0 \ | |
| --presence-penalty 1.5 \ | |
| -np 1" | |
| MODEL_FLAGS[gemma4]="--model $GEMMA_PATH \ | |
| --port 8086 \ | |
| --ctx-size 512000 \ | |
| --cache-ram 4096 \ | |
| --reasoning off \ | |
| --temp 1.0 --top-k 64 --top-p 0.95 --min-p 0.01 \ | |
| -np 2" | |
| MODEL_FLAGS[gpt-oss]="--model $GPTOSS_PATH \ | |
| --port 8087 \ | |
| --ctx-size 131072 \ | |
| --cache-ram 8192 \ | |
| --temp 1.0 --top-k 0 --top-p 1.0 --min-p 0.0 \ | |
| -np 1" | |
| # Qwen3.6-27B-MTP nothink — MTP speculation, cache-reuse disabled (incompatible with MTP) | |
| # Uses --chat-template-kwargs (not --reasoning off) to disable thinking on Qwen3.6 | |
| MODEL_FLAGS[thinker-mtp-nothink]="--model $THINKER_MTP_PATH \ | |
| --port 8092 \ | |
| --ctx-size 131072 \ | |
| --cache-ram 4096 \ | |
| --temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \ | |
| --presence-penalty 1.5 \ | |
| --chat-template-kwargs '{\"enable_thinking\":false}' \ | |
| --cache-reuse 0 \ | |
| --spec-type draft-mtp --spec-draft-n-max 2 \ | |
| --jinja \ | |
| -np 1" | |
| # Qwen3.6-35B-A3B-MTP nothink — MTP speculation, cache-reuse disabled (incompatible with MTP) | |
| MODEL_FLAGS[balanced-mtp-nothink]="--model $BALANCED_MTP_PATH \ | |
| --port 8093 \ | |
| --ctx-size 131072 \ | |
| --cache-ram 6144 \ | |
| --temp 0.7 --top-k 20 --top-p 0.8 --min-p 0.0 \ | |
| --presence-penalty 1.5 \ | |
| --chat-template-kwargs '{\"enable_thinking\":false}' \ | |
| --cache-reuse 0 \ | |
| --spec-type draft-mtp --spec-draft-n-max 2 \ | |
| --jinja \ | |
| -np 1" | |
| # ============================================================================= | |
| # Command construction | |
| # ============================================================================= | |
#######################################
# Build the complete llama-server command line for one model.
#
# The shared defaults are emitted FIRST and the model-specific flags from
# MODEL_FLAGS AFTER them: llama-server keeps the last occurrence of a repeated
# flag, so this is what lets an entry such as "--cache-reuse 0" or
# "--presence-penalty 1.5" override the shared default.
#
# Globals:   MODEL_FLAGS (read), CACHE_DIR (read)
# Arguments: $1 - model name, a key of MODEL_FLAGS
# Outputs:   the command line on stdout, as a single line
# Returns:   exits non-zero (via error) when no flags exist for the model;
#            the diagnostic goes to stderr so that callers capturing stdout
#            with $(...) never receive it as a command.
#######################################
build_llama_server_cmd() {
  local name="${1:-}"
  local flags=""

  # An empty subscript is an error for associative arrays, so guard first.
  if [[ -n "$name" ]]; then
    flags="${MODEL_FLAGS[$name]:-}"
  fi
  if [[ -z "$flags" ]]; then
    error "Aucun flag défini pour le modèle '$name'" >&2
    return 1 # only reached if error() does not exit
  fi

  # Defaults shared by every model; any of them may be overridden per model.
  local -a common=(
    --n-gpu-layers 99
    --cache-reuse 4096
    --cache-type-k q4_0
    --cache-type-v q4_0
    --flash-attn on
    --prio 2
    --metrics
    --slot-prompt-similarity 0.5
    --presence-penalty 0.0
  )

  local IFS=' ' # "${common[*]}" joins on the first character of IFS
  printf '%s\n' \
    "llama-server ${common[*]} $flags --slot-save-path $CACHE_DIR/$name"
}
| # ============================================================================= | |
| # Helpers | |
| # ============================================================================= | |
# ANSI colour escapes for the log helpers; expanded by printf's %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# info MESSAGE... — progress message on stdout.
# The message itself is printed with %s, so backslashes in it stay literal.
info() { printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$*"; }

# warn MESSAGE... — non-fatal diagnostic on stderr.
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2; }

# error MESSAGE... — fatal diagnostic on stderr, then exit 1.
# stderr matters here: error is also called from functions whose stdout is
# captured with $(...), where a message on stdout would be swallowed.
error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2
  exit 1
}
#######################################
# Download a single-file model unless it is already present.
#
# Arguments: $1 - full path where the file must end up
#            $2 - repo to download from
#            $3.. - extra arguments for `hf download` (the file name)
# Outputs:   progress messages via info
# Returns:   0 when the file is present afterwards; exits via error when the
#            download fails or does not produce the expected file.
#######################################
_dl() {
  local target="$1" repo="$2"
  shift 2

  local dest_dir file_name
  dest_dir="$(dirname -- "$target")"
  file_name="$(basename -- "$target")"

  if [[ -f "$target" ]]; then
    info "$file_name déjà présent, skip."
    return 0
  fi

  info "Téléchargement $file_name..."
  mkdir -p -- "$dest_dir"
  HF_XET_HIGH_PERFORMANCE=1 hf download "$repo" "$@" --local-dir "$dest_dir" \
    || error "Échec du téléchargement de $file_name depuis $repo"

  # A wrong file name can leave the download "successful" without the file
  # that llama-server will later be pointed at; fail here rather than at start.
  [[ -f "$target" ]] || error "Téléchargement terminé mais $target est introuvable"
}
#######################################
# Download a sharded model (several files under one sub-directory) unless its
# entry shard is already present.
#
# Arguments: $1 - full path of the entry shard, <dest>/<shard-dir>/<file>
#            $2 - repo to download from
#            $3 - --include pattern selecting the shard files
# Outputs:   progress messages via info
# Returns:   0 when the entry shard is present afterwards; exits via error
#            when the download fails or the entry shard is still missing.
#######################################
_dl_shard() {
  local target="$1" repo="$2" glob="$3"
  local shard_dir dest_dir shard_name

  # The pattern already contains the shard directory, so the download root is
  # the parent of that directory.
  shard_dir="$(dirname -- "$target")"
  dest_dir="$(dirname -- "$shard_dir")"
  shard_name="$(basename -- "$shard_dir")"

  if [[ -f "$target" ]]; then
    info "$(basename -- "$target") déjà présent, skip."
    return 0
  fi

  info "Téléchargement shards $shard_name..."
  mkdir -p -- "$dest_dir"
  HF_XET_HIGH_PERFORMANCE=1 hf download "$repo" --include "$glob" --local-dir "$dest_dir" \
    || error "Échec du téléchargement des shards $shard_name depuis $repo"

  [[ -f "$target" ]] || error "Téléchargement terminé mais $target est introuvable"
}
| # ============================================================================= | |
| # setup | |
| # ============================================================================= | |
# Emit the YAML entry of one llama-swap model on stdout.
# Arguments: $1 - model name (key of MODEL_FLAGS), $2 - port llama-server
#            listens on, $3 - optional ttl in seconds (omitted when empty)
# Returns:   1 when the command line cannot be built.
_render_model_entry() {
  local name="$1" port="$2" ttl="${3:-}" cmd

  if ! cmd="$(build_llama_server_cmd "$name")"; then
    # Forward whatever diagnostic was captured instead of dropping it.
    if [[ -n "$cmd" ]]; then
      printf '%s\n' "$cmd" >&2
    fi
    return 1
  fi

  printf '  "%s":\n' "$name"
  printf '    proxy: http://127.0.0.1:%s\n' "$port"
  if [[ -n "$ttl" ]]; then
    printf '    ttl: %s\n' "$ttl"
  fi
  printf '    cmd: >\n'
  printf '      %s\n' "$cmd"
}

# Emit the whole llama-swap configuration on stdout.
# Returns: 1 as soon as one model entry cannot be rendered.
_render_llama_swap_config() {
  # name:port:ttl — the port must equal the --port in MODEL_FLAGS[name];
  # an empty ttl means no ttl line is written for that model.
  local -a models=(
    "mini:8081:"
    "flash:8089:"
    "balanced:8082:300"
    "balanced-nothink:8084:"
    "pro:8083:1800"
    "thinker:8090:600"
    "thinker-nothink:8091:600"
    "gemma4:8086:600"
    "gpt-oss:8087:600"
    "thinker-mtp-nothink:8092:600"
    "balanced-mtp-nothink:8093:600"
  )
  local spec name port ttl

  printf 'healthCheckTimeout: 120\n'
  printf 'models:\n'
  for spec in "${models[@]}"; do
    IFS=: read -r name port ttl <<< "$spec"
    _render_model_entry "$name" "$port" "$ttl" || return 1
  done

  cat << 'YAML'
groups:
  "always-on":
    exclusive: false
    persistent: true
    members:
      - flash
      - balanced-nothink
  "heavy":
    exclusive: true
    members:
      - balanced
      - pro
      - balanced-mtp-nothink
  "heavy-thinker":
    exclusive: true
    members:
      - thinker
      - thinker-nothink
      - thinker-mtp-nothink
  "heavy-gemma":
    exclusive: true
    members:
      - gemma4
  "heavy-gpt-oss":
    exclusive: true
    members:
      - gpt-oss
YAML
}

#######################################
# One-time setup: install missing packages, create the model and slot-cache
# directories, download every model and generate llama-swap-config.yaml.
#
# The configuration is rendered completely before the file is touched, so a
# model whose command line cannot be built aborts the setup instead of leaving
# a config with an empty or bogus cmd.
#
# Globals:   MODELS_BASE, CACHE_DIR, CONFIG_DIR, *_PATH, MODEL_*_REPO/FILE (read)
# Arguments: none
# Outputs:   progress messages via info; writes $CONFIG_DIR/llama-swap-config.yaml
# Returns:   0 on success; exits via error on a missing tool or a failed step.
#######################################
cmd_setup() {
  info "Vérification des dépendances..."
  command -v paru > /dev/null || error "paru introuvable"

  local -a pacman_pkgs=(curl llama.cpp-vulkan llama-swap-bin python-huggingface-hub python-hf-xet)
  local -a missing=()
  local pkg
  for pkg in "${pacman_pkgs[@]}"; do
    paru -Qi "$pkg" &> /dev/null || missing+=("$pkg")
  done
  if ((${#missing[@]} > 0)); then
    paru -S --noconfirm "${missing[@]}"
  fi
  command -v hf > /dev/null || error "hf introuvable"

  info "Création des dossiers..."
  mkdir -p "$MODELS_BASE"/{mini,flash,balanced,pro,thinker,gemma,gpt-oss,thinker-mtp,balanced-mtp}
  mkdir -p "$CACHE_DIR"/{mini,flash,balanced,balanced-nothink,pro,thinker,thinker-nothink,gemma4,gpt-oss,thinker-mtp-nothink,balanced-mtp-nothink}

  _dl "$MINI_PATH" "$MODEL_MINI_REPO" "$MODEL_MINI_FILE"
  _dl "$FLASH_PATH" "$MODEL_FLASH_REPO" "$MODEL_FLASH_FILE"
  _dl "$BALANCED_PATH" "$MODEL_BALANCED_REPO" "$MODEL_BALANCED_FILE"
  _dl "$PRO_PATH" "$MODEL_PRO_REPO" "$MODEL_PRO_FILE"
  _dl "$THINKER_PATH" "$MODEL_THINKER_REPO" "$MODEL_THINKER_FILE"
  _dl "$GEMMA_PATH" "$MODEL_GEMMA_REPO" "$MODEL_GEMMA_FILE"
  _dl_shard "$GPTOSS_PATH" "$MODEL_GPTOSS_REPO" "$MODEL_GPTOSS_FILE_GLOB"
  _dl "$THINKER_MTP_PATH" "$MODEL_THINKER_MTP_REPO" "$MODEL_THINKER_MTP_FILE"
  _dl "$BALANCED_MTP_PATH" "$MODEL_BALANCED_MTP_REPO" "$MODEL_BALANCED_MTP_FILE"

  info "Génération de llama-swap-config.yaml..."
  local config_file="$CONFIG_DIR/llama-swap-config.yaml"
  local yaml
  yaml="$(_render_llama_swap_config)" \
    || error "Génération de llama-swap-config.yaml échouée"
  printf '%s\n' "$yaml" > "$config_file"

  info "✅ Config générée : $config_file"
  info "Setup terminé → ./setup-llm.sh --start"
}
| # ============================================================================= | |
| # start | |
| # ============================================================================= | |
#######################################
# Start llama-swap in the foreground with the generated configuration.
#
# The final step uses exec, so llama-swap replaces this shell: signals sent to
# the process (for instance by a service manager) reach llama-swap directly.
#
# Globals:   HOME, PATH (PATH is extended and exported), CONFIG_DIR (read),
#            LLAMA_SWAP_LISTEN (optional, read) - listen address, default
#            0.0.0.0:8009, i.e. every interface; set it to restrict the bind.
# Arguments: none
# Returns:   does not return on success; exits via error when llama-swap,
#            llama-server or the configuration file is missing.
#######################################
cmd_start() {
  export PATH="$HOME/.local/bin:$PATH"
  command -v llama-swap > /dev/null || error "llama-swap introuvable"
  command -v llama-server > /dev/null || error "llama-server introuvable"

  local config_file="$CONFIG_DIR/llama-swap-config.yaml"
  [[ -f "$config_file" ]] || error "Config introuvable"

  local listen="${LLAMA_SWAP_LISTEN:-0.0.0.0:8009}"
  # Only the port is shown, which keeps the default message unchanged.
  info "Lancement de llama-swap sur :${listen##*:}..."
  exec llama-swap --config "$config_file" --listen "$listen"
}
| # ============================================================================= | |
| # System-wide systemd service | |
| # ============================================================================= | |
# Name and unit-file path of the system-wide service handled by
# cmd_install_service and cmd_uninstall_service.
SERVICE_NAME="llama-swap"
SERVICE_FILE="/etc/systemd/system/${SERVICE_NAME}.service"

#######################################
# Write the systemd unit that runs "<this script> --start" as the invoking
# user, reload systemd and enable the unit (it is enabled, not started).
#
# The account and group are taken from id(1) rather than from $USER: $USER is
# not guaranteed to be set, and an unset variable aborts the script under
# "set -u" before the unit is written.
#
# Globals:   SERVICE_NAME, SERVICE_FILE, HOME (read)
# Arguments: none
# Outputs:   progress and usage hints via info; writes SERVICE_FILE via sudo
# Returns:   0 on success; non-zero if a sudo/systemctl step fails.
#######################################
cmd_install_service() {
  info "Génération du service systemd système..."

  local script_path run_user run_group
  script_path="$(realpath "$0")"
  run_user="$(id -un)"
  run_group="$(id -gn)"

  # Unquoted delimiter: the variables below are expanded now, so the unit
  # file contains the concrete user, paths and home directory.
  sudo tee "$SERVICE_FILE" > /dev/null << SERVICE
[Unit]
Description=llama-swap — LLM model proxy router
After=network.target

[Service]
Type=simple
User=${run_user}
Group=${run_group}
ExecStart=${script_path} --start
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
Environment="PATH=$HOME/.local/bin:/usr/local/bin:/usr/bin:/bin"
Environment="HOME=$HOME"

[Install]
WantedBy=multi-user.target
SERVICE

  sudo systemctl daemon-reload
  sudo systemctl enable "$SERVICE_NAME"

  info "✅ Service installé : $SERVICE_FILE"
  info " Démarrage automatique au boot activé (multi-user.target)"
  info "Commandes utiles :"
  info " sudo systemctl start $SERVICE_NAME"
  info " sudo systemctl stop $SERVICE_NAME"
  info " sudo systemctl restart $SERVICE_NAME"
  info " sudo systemctl status $SERVICE_NAME"
  info " journalctl -u $SERVICE_NAME -f"
}
#######################################
# Stop, disable and remove the systemd unit.
#
# The service counts as installed when its unit file exists OR systemd
# reports it as enabled. Checking only "is-enabled" would treat an installed
# but disabled unit as absent and leave its file behind.
#
# Globals:   SERVICE_NAME, SERVICE_FILE (read)
# Arguments: none
# Outputs:   result message via info, or a warning when nothing is installed
# Returns:   0 on success or when there is nothing to do.
#######################################
cmd_uninstall_service() {
  if [[ ! -e "$SERVICE_FILE" ]] \
    && ! systemctl is-enabled "$SERVICE_NAME" &> /dev/null; then
    warn "Service '$SERVICE_NAME' non installé, rien à faire."
    return 0
  fi

  # Best effort: the service may simply not be running.
  sudo systemctl stop "$SERVICE_NAME" 2> /dev/null || true
  sudo systemctl disable "$SERVICE_NAME"
  sudo rm -f -- "$SERVICE_FILE"
  sudo systemctl daemon-reload
  info "✅ Service '$SERVICE_NAME' désinstallé."
}
| # ============================================================================= | |
| # Entrypoint | |
| # ============================================================================= | |
# Command-line dispatch: run the action named by the first argument.
# No argument behaves like --start; anything unknown prints the usage line
# through error.
main() {
  local action="${1:-}"

  if [[ -z "$action" || "$action" == "--start" ]]; then
    cmd_start
  elif [[ "$action" == "--setup" ]]; then
    cmd_setup
  elif [[ "$action" == "--install-service" ]]; then
    cmd_install_service
  elif [[ "$action" == "--uninstall-service" ]]; then
    cmd_uninstall_service
  else
    error "Usage : $0 [--setup|--start|--install-service|--uninstall-service]"
  fi
}

main "$@"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ANTHROPIC_BASE_URL=http://<IP>:8009/ ANTHROPIC_AUTH_TOKEN=unused ANTHROPIC_DEFAULT_OPUS_MODEL=pro ANTHROPIC_DEFAULT_SONNET_MODEL=balanced ANTHROPIC_DEFAULT_HAIKU_MODEL=flash API_TIMEOUT_MS=3000000 CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1 claude |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment