Created
June 17, 2026 20:19
-
-
Save c4software/9c2707dfa1b5f3f650a5c75498aae741 to your computer and use it in GitHub Desktop.
llama server multi models setup (Simple script to setup LLM Server for Ryzen Strix Halo 128Gb)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -euo pipefail

# =============================================================================
# setup-llm.sh — llama-server in native router mode
# =============================================================================

# =============================================================================
# MODELS (Hugging Face repo + file name)
# All values are literal strings, hence the single quotes.
# =============================================================================
MODEL_QWEN35_2B_REPO='unsloth/Qwen3.5-2B-GGUF'
MODEL_QWEN35_2B_FILE='Qwen3.5-2B-UD-Q4_K_XL.gguf'

MODEL_QWEN35_9B_REPO='unsloth/Qwen3.5-9B-GGUF'
MODEL_QWEN35_9B_FILE='Qwen3.5-9B-UD-Q6_K_XL.gguf'

MODEL_QWEN36_35B_A3B_REPO='unsloth/Qwen3.6-35B-A3B-GGUF'
MODEL_QWEN36_35B_A3B_FILE='Qwen3.6-35B-A3B-UD-Q6_K_XL.gguf'

MODEL_QWEN3_CODER_NEXT_REPO='unsloth/Qwen3-Coder-Next-GGUF'
MODEL_QWEN3_CODER_NEXT_FILE='Qwen3-Coder-Next-UD-Q4_K_XL.gguf'

MODEL_QWEN36_27B_REPO='unsloth/Qwen3.6-27B-GGUF'
MODEL_QWEN36_27B_FILE='Qwen3.6-27B-UD-Q6_K_XL.gguf'

# Gemma 4 31B — the MTP drafter ships in the main repo
MODEL_GEMMA_31B_REPO='unsloth/gemma-4-31B-it-GGUF'
MODEL_GEMMA_31B_FILE='gemma-4-31B-it-Q4_K_M.gguf'
MODEL_GEMMA_31B_MTP_FILE='mtp-gemma-4-31B-it.gguf'

# Gemma 4 12B — unified text+image+audio+video model, MTP drafter in the repo
MODEL_GEMMA_12B_REPO='unsloth/gemma-4-12b-it-GGUF'
MODEL_GEMMA_12B_FILE='gemma-4-12b-it-UD-Q4_K_XL.gguf'
MODEL_GEMMA_12B_MTP_FILE='mtp-gemma-4-12b-it.gguf'

# GPT-OSS 120B — UD-Q4_K_XL shards (GLOB selects all shards, ENTRY is shard 1)
MODEL_GPTOSS_REPO='unsloth/gpt-oss-120b-GGUF'
MODEL_GPTOSS_FILE_GLOB='UD-Q4_K_XL/*'
MODEL_GPTOSS_FILE_ENTRY='UD-Q4_K_XL/gpt-oss-120b-UD-Q4_K_XL-00001-of-00002.gguf'

# Qwen3.6-27B-MTP — MTP variant, built-in draft (Q4_K_XL)
MODEL_QWEN36_27B_MTP_REPO='unsloth/Qwen3.6-27B-MTP-GGUF'
MODEL_QWEN36_27B_MTP_FILE='Qwen3.6-27B-UD-Q4_K_XL.gguf'

# Qwen3.6-35B-A3B-MTP — MTP variant, built-in draft (Q4_K_XL)
MODEL_QWEN36_35B_A3B_MTP_REPO='unsloth/Qwen3.6-35B-A3B-MTP-GGUF'
MODEL_QWEN36_35B_A3B_MTP_FILE='Qwen3.6-35B-A3B-UD-Q4_K_XL.gguf'

# MiniMax-M2.7 — 230B MoE (10B active), 200K ctx, UD-IQ4_XS shards
MODEL_MINIMAX_REPO='unsloth/MiniMax-M2.7-GGUF'
MODEL_MINIMAX_FILE_GLOB='UD-IQ4_XS/*'
MODEL_MINIMAX_FILE_ENTRY='UD-IQ4_XS/MiniMax-M2.7-UD-IQ4_XS-00001-of-00004.gguf'

# Qwopus3.6-27B-Coder-MTP — SFT coder fine-tune, SWE-bench Verified 67.0%
MODEL_QWOPUS_CODER_MTP_REPO='Jackrong/Qwopus3.6-27B-Coder-MTP-GGUF'
MODEL_QWOPUS_CODER_MTP_FILE='Qwopus3.6-27B-Coder-MTP-Q5_K_M.gguf'
# =============================================================================
# PATHS
# One directory per model under MODELS_BASE; models.ini lives in CONFIG_DIR.
# =============================================================================
MODELS_BASE="${HOME}/models"
CONFIG_DIR="${HOME}/models"

QWEN35_2B_PATH="${MODELS_BASE}/qwen3.5-2b/${MODEL_QWEN35_2B_FILE}"
QWEN35_9B_PATH="${MODELS_BASE}/qwen3.5-9b/${MODEL_QWEN35_9B_FILE}"
QWEN36_35B_A3B_PATH="${MODELS_BASE}/qwen3.6-35b-a3b/${MODEL_QWEN36_35B_A3B_FILE}"
QWEN3_CODER_NEXT_PATH="${MODELS_BASE}/qwen3-coder-next/${MODEL_QWEN3_CODER_NEXT_FILE}"
QWEN36_27B_PATH="${MODELS_BASE}/qwen3.6-27b/${MODEL_QWEN36_27B_FILE}"

# The Gemma main model and its MTP drafter share one directory.
GEMMA_31B_PATH="${MODELS_BASE}/gemma-31b/${MODEL_GEMMA_31B_FILE}"
GEMMA_31B_MTP_PATH="${MODELS_BASE}/gemma-31b/${MODEL_GEMMA_31B_MTP_FILE}"
GEMMA_12B_PATH="${MODELS_BASE}/gemma-12b/${MODEL_GEMMA_12B_FILE}"
GEMMA_12B_MTP_PATH="${MODELS_BASE}/gemma-12b/${MODEL_GEMMA_12B_MTP_FILE}"

# Sharded models point at their first shard (the *_FILE_ENTRY value).
GPTOSS_PATH="${MODELS_BASE}/gpt-oss/${MODEL_GPTOSS_FILE_ENTRY}"
QWEN36_27B_MTP_PATH="${MODELS_BASE}/qwen3.6-27b-mtp/${MODEL_QWEN36_27B_MTP_FILE}"
QWEN36_35B_A3B_MTP_PATH="${MODELS_BASE}/qwen3.6-35b-a3b-mtp/${MODEL_QWEN36_35B_A3B_MTP_FILE}"
MINIMAX_PATH="${MODELS_BASE}/minimax/${MODEL_MINIMAX_FILE_ENTRY}"
QWOPUS_CODER_MTP_PATH="${MODELS_BASE}/qwopus3.6-27b-coder-mtp/${MODEL_QWOPUS_CODER_MTP_FILE}"
# =============================================================================
# INI PRESETS
#
# Each entry of the associative array holds the body of one preset [section];
# generate_models_ini emits it under "[<key>]" after dropping empty lines.
# Conventions:
# - One ini key per line, in "key = value" form
# - Global flags ([*]) are not repeated here
# - Section comment: line(s) starting with ";"
# - load-on-startup and stop-timeout drive the lifecycle (always-on vs LRU)
# - cache-reuse = 0 overrides the global 4096 for incompatible MoE/MTP models
# - cache-type-v = q8_0 is a local override for coding agents
# - swa-full + ctx-checkpoints: Qwen3.5/3.6 hybrids only
# - spec-draft-model: external drafter (Gemma MTP only)
# - \" inside these double-quoted strings yields a literal " in the ini value
# =============================================================================
declare -A MODEL_INI

# Qwen3.5-2B: dense 2B hybrid-thinking, ultra-light
# swa-full: uses the full context (hybrid SWA, GatedDeltaNet)
MODEL_INI[qwen3.5-2b]="
model = $QWEN35_2B_PATH
ctx-size = 32768
cache-ram = 2048
temp = 0.7
top-k = 20
top-p = 0.8
min-p = 0.0
parallel = 4
swa-full = true
ctx-checkpoints = 128"

# Qwen3.5-9B: dense 9B hybrid-thinking, loaded at startup
MODEL_INI[qwen3.5-9b]="
model = $QWEN35_9B_PATH
ctx-size = 32768
cache-ram = 2048
temp = 0.7
top-k = 20
top-p = 0.8
min-p = 0.0
parallel = 4
swa-full = true
ctx-checkpoints = 128
load-on-startup = true"

# Qwen3.6-35B-A3B nothink — hybrid MoE, always-on
# cache-reuse 0: hybrid MoE is incompatible with the global cache-reuse
MODEL_INI[qwen3.6-35b-a3b-nothink]="
model = $QWEN36_35B_A3B_PATH
ctx-size = 524288
cache-ram = 12288
reasoning = off
temp = 0.6
top-k = 20
top-p = 0.95
min-p = 0.0
parallel = 2
cache-reuse = 0
swa-full = true
ctx-checkpoints = 128
load-on-startup = true"

# Qwen3.6-35B-A3B thinking — reasoning enabled
# cache-reuse 0: hybrid MoE is incompatible
MODEL_INI[qwen3.6-35b-a3b]="
model = $QWEN36_35B_A3B_PATH
ctx-size = 393216
cache-ram = 6144
temp = 0.6
top-k = 20
top-p = 0.95
min-p = 0.0
parallel = 3
cache-reuse = 0
swa-full = true
ctx-checkpoints = 128
stop-timeout = 300"

# Qwen3.6-35B-A3B-MTP nothink — MTP speculation, draft=2
# cache-reuse 0: incompatible with MTP — nothink is set through
# chat-template-kwargs (not --reasoning off)
MODEL_INI[qwen3.6-35b-a3b-mtp-nothink]="
model = $QWEN36_35B_A3B_MTP_PATH
ctx-size = 131072
cache-ram = 6144
temp = 0.7
top-k = 20
top-p = 0.8
min-p = 0.0
presence-penalty = 1.5
chat-template-kwargs = {\"enable_thinking\":false}
cache-reuse = 0
spec-type = draft-mtp
spec-draft-n-max = 2
jinja = true
parallel = 1
swa-full = true
ctx-checkpoints = 128
stop-timeout = 600"

# Qwen3-Coder-Next — 80B hybrid-attention MoE, agentic coding
# cache-type-v q8_0: V-cache precision is critical for code diffs
# cache-reuse 0: hybrid-attention MoE is incompatible
MODEL_INI[qwen3-coder-next]="
model = $QWEN3_CODER_NEXT_PATH
ctx-size = 131072
cache-ram = 4096
temp = 1.0
top-k = 40
top-p = 0.95
min-p = 0.01
cache-type-v = q8_0
cache-reuse = 0
parallel = 1
stop-timeout = 1800"

# Qwen3.6-27B thinking — reasoning enabled, jinja tool-calling, presence_penalty
MODEL_INI[qwen3.6-27b]="
model = $QWEN36_27B_PATH
ctx-size = 131072
cache-ram = 4096
temp = 0.6
top-k = 20
top-p = 0.95
min-p = 0.0
presence-penalty = 1.5
jinja = true
parallel = 1
swa-full = true
ctx-checkpoints = 128
stop-timeout = 600"

# Qwen3.6-27B nothink — direct answers without chain-of-thought
MODEL_INI[qwen3.6-27b-nothink]="
model = $QWEN36_27B_PATH
ctx-size = 131072
cache-ram = 4096
reasoning = off
temp = 0.6
top-k = 20
top-p = 0.95
min-p = 0.0
presence-penalty = 1.5
parallel = 1
swa-full = true
ctx-checkpoints = 128
stop-timeout = 600"

# Qwen3.6-27B-MTP nothink — MTP speculation, draft=2
# cache-reuse 0: incompatible with MTP — nothink via chat-template-kwargs
MODEL_INI[qwen3.6-27b-mtp-nothink]="
model = $QWEN36_27B_MTP_PATH
ctx-size = 131072
cache-ram = 4096
temp = 0.7
top-k = 20
top-p = 0.8
min-p = 0.0
presence-penalty = 1.5
chat-template-kwargs = {\"enable_thinking\":false}
cache-reuse = 0
spec-type = draft-mtp
spec-draft-n-max = 2
jinja = true
parallel = 1
swa-full = true
ctx-checkpoints = 128
stop-timeout = 600"

# Qwopus3.6-27B-Coder-MTP nothink — SFT coder fine-tune, SWE-bench 67.0%
# cache-type-v q8_0: V-cache precision is critical for code diffs
# cache-reuse 0: incompatible with MTP
MODEL_INI[qwopus3.6-27b-coder-mtp-nothink]="
model = $QWOPUS_CODER_MTP_PATH
ctx-size = 131072
cache-ram = 4096
temp = 0.7
top-k = 20
top-p = 0.8
min-p = 0.0
presence-penalty = 1.5
chat-template-kwargs = {\"enable_thinking\":false}
cache-type-v = q8_0
cache-reuse = 0
spec-type = draft-mtp
spec-draft-n-max = 2
jinja = true
parallel = 1
swa-full = true
ctx-checkpoints = 128
stop-timeout = 600"

# Gemma 4 31B — MTP drafter from the same repo, text only (no-mmproj),
# spec-draft-n-max 4
# No swa-full: different ISWA architecture (issue #21468)
# chat-template-kwargs + jinja: disables thinking (--reasoning off unsupported)
MODEL_INI[gemma4-31b-mtp]="
model = $GEMMA_31B_PATH
spec-draft-model = $GEMMA_31B_MTP_PATH
ctx-size = 262144
cache-ram = 4096
no-mmproj = true
chat-template-kwargs = {\"enable_thinking\":false}
cache-reuse = 0
spec-type = draft-mtp
spec-draft-n-max = 4
temp = 1.0
top-k = 64
top-p = 0.95
min-p = 0.01
jinja = true
parallel = 2
stop-timeout = 600"

# Gemma 4 12B — MTP drafter from the same repo, unified
# text+image+audio+video model, used text-only here
# No swa-full: different ISWA architecture (issue #21468)
MODEL_INI[gemma4-12b-mtp]="
model = $GEMMA_12B_PATH
spec-draft-model = $GEMMA_12B_MTP_PATH
ctx-size = 262144
cache-ram = 2048
no-mmproj = true
chat-template-kwargs = {\"enable_thinking\":false}
cache-reuse = 0
spec-type = draft-mtp
spec-draft-n-max = 2
temp = 1.0
top-k = 64
top-p = 0.95
min-p = 0.01
jinja = true
parallel = 3
stop-timeout = 600"

# GPT-OSS 120B — UD-Q4_K_XL shards (model points at the first shard)
MODEL_INI[gpt-oss]="
model = $GPTOSS_PATH
ctx-size = 131072
cache-ram = 8192
temp = 1.0
top-k = 0
top-p = 1.0
min-p = 0.0
parallel = 1
stop-timeout = 600"

# MiniMax-M2.7 — 230B hybrid Mamba MoE (10B active), 200K ctx, UD-IQ4_XS shards
# cache-reuse 0: MoE is incompatible — no swa-full (Mamba architecture, not SWA)
MODEL_INI[minimax]="
model = $MINIMAX_PATH
ctx-size = 131072
cache-ram = 8192
temp = 1.0
top-k = 40
top-p = 0.95
min-p = 0.01
cache-reuse = 0
parallel = 1
stop-timeout = 600"
# =============================================================================
# Order in which presets are written to the ini
# (declare -A does not preserve insertion order, so it is spelled out here)
# =============================================================================
PRESET_ORDER=()

# always-on
PRESET_ORDER+=(qwen3.5-2b qwen3.5-9b qwen3.6-35b-a3b-nothink)

# 35B A3B family
PRESET_ORDER+=(qwen3.6-35b-a3b qwen3.6-35b-a3b-mtp-nothink qwen3-coder-next)

# 27B family
PRESET_ORDER+=(
  qwen3.6-27b
  qwen3.6-27b-nothink
  qwen3.6-27b-mtp-nothink
  qwopus3.6-27b-coder-mtp-nothink
)

# Gemma 4
PRESET_ORDER+=(gemma4-31b-mtp gemma4-12b-mtp)

# giants
PRESET_ORDER+=(gpt-oss minimax)
# =============================================================================
# models.ini generation
# =============================================================================
#######################################
# Print the complete models.ini on stdout: a fixed header with the global [*]
# section, then one [preset] section per entry of PRESET_ORDER.
# Globals:   PRESET_ORDER (read), MODEL_INI (read)
# Arguments: none
# Outputs:   ini text on stdout; a diagnostic on stderr if a preset is missing
# Returns:   0 on success, 1 when PRESET_ORDER names a key absent from MODEL_INI
#######################################
generate_models_ini() {
  local name
  local -r rule='; ============================================================================='

  # Quoted delimiter: the header is literal text, nothing in it is expanded.
  cat <<'HEADER'
version = 1
; =============================================================================
; Flags globaux — appliqués à tous les presets sauf surcharge locale
; =============================================================================
[*]
n-gpu-layers = 99
cache-type-k = q8_0
cache-type-v = q4_0
flash-attn = on
prio = 2
metrics = true
slot-prompt-similarity = 0.5
cache-reuse = 4096
presence-penalty = 0.0
; =============================================================================
; always-on — load-on-startup, pas de stop-timeout
; =============================================================================
HEADER

  for name in "${PRESET_ORDER[@]}"; do
    # Fail loudly (on stderr, so nothing leaks into the redirected ini) instead
    # of emitting an empty section for a key that has no preset body.
    if [[ -z "${MODEL_INI[$name]+set}" ]]; then
      printf 'generate_models_ini: preset "%s" is listed in PRESET_ORDER but not defined in MODEL_INI\n' \
        "$name" >&2
      return 1
    fi

    # Group separators are keyed on the first preset name of each group.
    case "$name" in
      qwen3.6-35b-a3b)
        printf '%s\n' \
          "$rule" \
          '; Lourds — chargés à la demande, déchargés par LRU après stop-timeout' \
          "$rule" \
          '' \
          '; --- Famille 35B A3B ---' \
          ''
        ;;
      qwen3.6-27b)
        printf '%s\n' '; --- Famille 27B ---' ''
        ;;
      gemma4-31b-mtp)
        printf '%s\n' \
          '; --- Famille Gemma 4 — pas de swa-full (architecture ISWA, issue #21468) ---' \
          ''
        ;;
      gpt-oss)
        printf '%s\n' '; --- Géants ---' ''
        ;;
    esac

    printf '[%s]\n' "$name"
    # Preset bodies start with a newline; drop every empty line before emitting.
    printf '%s\n' "${MODEL_INI[$name]}" | sed '/^$/d'
    printf '\n'
  done
}
# =============================================================================
# Helpers
# =============================================================================
# ANSI colour sequences; kept as backslash escapes and expanded by printf %b.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging helpers. Only the colour codes are escape-expanded (%b); the message
# is printed verbatim (%s), so backslashes in paths or usage text survive.
# info  -> stdout
# warn  -> stderr
# error -> stderr, then terminates the script with status 1
info() { printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[WARN]%b %s\n' "$YELLOW" "$NC" "$*" >&2; }
error() {
  printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$*" >&2
  exit 1
}
#######################################
# Download file(s) from a Hugging Face repo unless the target already exists.
# Arguments: $1  - expected local path of the (first) downloaded file
#            $2  - Hugging Face repo id
#            $3+ - extra arguments for `hf download` (file names)
# Outputs:   progress messages through info
# Returns:   0 when the target is present; the script is aborted through
#            error when the download did not produce the target
#######################################
_dl() {
  local target="$1" repo="$2"
  shift 2

  local target_name target_dir
  target_name="$(basename "$target")"
  target_dir="$(dirname "$target")"

  if [[ -f "$target" ]]; then
    info "$target_name déjà présent, skip."
    return 0
  fi

  info "Téléchargement $target_name..."
  HF_XET_HIGH_PERFORMANCE=1 hf download "$repo" "$@" --local-dir "$target_dir"

  # The target path and the requested file names are passed separately, so
  # they can diverge; the skip check above and the generated ini both rely on
  # the file being exactly at $target.
  [[ -f "$target" ]] || error "Téléchargement terminé mais fichier introuvable : $target"
}
#######################################
# Tell whether a sharded model is fully present on disk.
# Arguments: $1 - path of the first shard; when its name ends in
#                 "-<i>-of-<n>.<ext>" all n shards must exist, otherwise only
#                 the file itself is checked
# Returns:   0 when complete, 1 otherwise
#######################################
_dl_shards_complete() {
  local first="$1"
  local prefix index_str count_str suffix shard
  local -i total width i

  [[ -f "$first" ]] || return 1

  if [[ "$first" =~ ^(.*-)([0-9]+)-of-([0-9]+)(\.[^./]+)$ ]]; then
    prefix="${BASH_REMATCH[1]}"
    index_str="${BASH_REMATCH[2]}"
    count_str="${BASH_REMATCH[3]}"
    suffix="${BASH_REMATCH[4]}"
    width=${#index_str}
    total=$((10#$count_str)) # force base 10: "00008" must not be read as octal
    for ((i = 1; i <= total; i++)); do
      printf -v shard '%s%0*d-of-%s%s' "$prefix" "$width" "$i" "$count_str" "$suffix"
      [[ -f "$shard" ]] || return 1
    done
  fi
  return 0
}

#######################################
# Download every shard of a split model unless the whole set is already there.
# Arguments: $1 - expected local path of the first shard
#                 (<dest>/<shard_dir>/<file>)
#            $2 - Hugging Face repo id
#            $3 - --include glob selecting the shards inside the repo
# Outputs:   progress messages through info
# Returns:   0 when all shards are present; the script is aborted through
#            error when the set is still incomplete after the download
#######################################
_dl_shard() {
  local target="$1" repo="$2" glob="$3"
  local shard_dir dest_dir
  shard_dir="$(dirname "$target")"
  # The glob carries the shard sub-directory, so download one level above it.
  dest_dir="$(dirname "$shard_dir")"

  # Checking only the first shard would skip a partially downloaded set.
  if _dl_shards_complete "$target"; then
    info "$(basename "$target") déjà présent, skip."
    return 0
  fi

  info "Téléchargement shards $(basename "$shard_dir")..."
  HF_XET_HIGH_PERFORMANCE=1 hf download "$repo" --include "$glob" --local-dir "$dest_dir"

  _dl_shards_complete "$target" \
    || error "Shards incomplets après téléchargement : $shard_dir"
}
# =============================================================================
# setup
# =============================================================================
#######################################
# Install missing packages, download every model and write models.ini.
# Globals:   MODELS_BASE, CONFIG_DIR, *_PATH, MODEL_*_REPO/FILE/GLOB (read)
# Arguments: none
# Outputs:   progress messages through info; models.ini in CONFIG_DIR
# Returns:   0 on success; the script is aborted on any failing step
#######################################
cmd_setup() {
  local -a required_pkgs=(curl llama.cpp-vulkan python-huggingface-hub python-hf-xet)
  local -a missing_pkgs=()
  local pkg ini_path ini_tmp

  info "Vérification des dépendances..."
  for pkg in "${required_pkgs[@]}"; do
    paru -Qi "$pkg" &>/dev/null || missing_pkgs+=("$pkg")
  done
  if ((${#missing_pkgs[@]} > 0)); then
    paru -S --noconfirm "${missing_pkgs[@]}"
  fi
  command -v hf >/dev/null || error "hf introuvable"

  info "Création des dossiers..."
  mkdir -p "$MODELS_BASE"/{qwen3.5-2b,qwen3.5-9b,qwen3.6-35b-a3b,qwen3-coder-next,qwen3.6-27b,gemma-31b,gemma-12b,gpt-oss,qwen3.6-27b-mtp,qwen3.6-35b-a3b-mtp,minimax,qwopus3.6-27b-coder-mtp}
  # CONFIG_DIR currently equals MODELS_BASE; create it anyway so the two can
  # be changed independently.
  mkdir -p "$CONFIG_DIR"

  _dl "$QWEN35_2B_PATH" "$MODEL_QWEN35_2B_REPO" "$MODEL_QWEN35_2B_FILE"
  _dl "$QWEN35_9B_PATH" "$MODEL_QWEN35_9B_REPO" "$MODEL_QWEN35_9B_FILE"
  _dl "$QWEN36_35B_A3B_PATH" "$MODEL_QWEN36_35B_A3B_REPO" "$MODEL_QWEN36_35B_A3B_FILE"
  _dl "$QWEN3_CODER_NEXT_PATH" "$MODEL_QWEN3_CODER_NEXT_REPO" "$MODEL_QWEN3_CODER_NEXT_FILE"
  _dl "$QWEN36_27B_PATH" "$MODEL_QWEN36_27B_REPO" "$MODEL_QWEN36_27B_FILE"
  _dl "$GEMMA_31B_PATH" "$MODEL_GEMMA_31B_REPO" "$MODEL_GEMMA_31B_FILE"
  _dl "$GEMMA_31B_MTP_PATH" "$MODEL_GEMMA_31B_REPO" "$MODEL_GEMMA_31B_MTP_FILE"
  _dl "$GEMMA_12B_PATH" "$MODEL_GEMMA_12B_REPO" "$MODEL_GEMMA_12B_FILE"
  _dl "$GEMMA_12B_MTP_PATH" "$MODEL_GEMMA_12B_REPO" "$MODEL_GEMMA_12B_MTP_FILE"
  _dl_shard "$GPTOSS_PATH" "$MODEL_GPTOSS_REPO" "$MODEL_GPTOSS_FILE_GLOB"
  _dl "$QWEN36_27B_MTP_PATH" "$MODEL_QWEN36_27B_MTP_REPO" "$MODEL_QWEN36_27B_MTP_FILE"
  _dl "$QWEN36_35B_A3B_MTP_PATH" "$MODEL_QWEN36_35B_A3B_MTP_REPO" "$MODEL_QWEN36_35B_A3B_MTP_FILE"
  _dl_shard "$MINIMAX_PATH" "$MODEL_MINIMAX_REPO" "$MODEL_MINIMAX_FILE_GLOB"
  _dl "$QWOPUS_CODER_MTP_PATH" "$MODEL_QWOPUS_CODER_MTP_REPO" "$MODEL_QWOPUS_CODER_MTP_FILE"

  info "Génération de models.ini..."
  # Generate into a temp file and rename: a failed generation must not leave a
  # truncated models.ini behind, because --start only checks that it exists.
  # Note: mktemp creates the file with owner-only permissions.
  ini_path="$CONFIG_DIR/models.ini"
  ini_tmp="$(mktemp "$ini_path.XXXXXX")"
  if ! generate_models_ini >"$ini_tmp"; then
    rm -f -- "$ini_tmp"
    error "Échec de la génération de models.ini"
  fi
  mv -f -- "$ini_tmp" "$ini_path"

  info "✅ Config générée : $CONFIG_DIR/models.ini"
  info "Setup terminé → ./setup-llm.sh --start"
}
# =============================================================================
# start
# =============================================================================
#######################################
# Run llama-server in router mode in the foreground, using the generated ini.
# Globals:   CONFIG_DIR (read), PATH (extended with ~/.local/bin and exported)
# Arguments: none
# Returns:   exit status of llama-server; aborts through error when the binary
#            or the config file is missing
#######################################
cmd_start() {
  local preset_file="$CONFIG_DIR/models.ini"

  export PATH="$HOME/.local/bin:$PATH"

  if ! command -v llama-server >/dev/null; then
    error "llama-server introuvable"
  fi
  if [[ ! -f "$preset_file" ]]; then
    error "Config introuvable — lance d'abord --setup"
  fi

  info "Lancement de llama-server (router mode) sur :8009..."

  # --models-max 3: 2 always-on + 1 heavy model active under LRU
  # WebUI available at http://0.0.0.0:8009
  local -a server_args=(
    --host 0.0.0.0
    --port 8009
    --models-preset "$preset_file"
    --models-max 3
    --models-autoload
    --jinja
  )
  llama-server "${server_args[@]}"
}
# =============================================================================
# System-wide systemd service
# =============================================================================
SERVICE_NAME="llama-server"
SERVICE_FILE="/etc/systemd/system/${SERVICE_NAME}.service"

#######################################
# Write a systemd unit that runs this script with --start as the current user,
# reload systemd and enable the unit.
# Globals:   SERVICE_NAME, SERVICE_FILE, HOME (read); USER (read if set)
# Arguments: none
# Outputs:   unit file at SERVICE_FILE (through sudo tee), messages through info
# Returns:   0 on success; aborts on any failing step
#######################################
cmd_install_service() {
  local script_path run_user run_group

  info "Génération du service systemd système..."
  script_path="$(realpath "$0")"
  # USER is not guaranteed to be set; under `set -u` an unset USER would abort
  # the script, so fall back to the effective user name.
  run_user="${USER:-$(id -un)}"
  # Resolved outside the heredoc so that a failing `id` stops the script.
  run_group="$(id -gn)"

  sudo tee "$SERVICE_FILE" >/dev/null <<SERVICE
[Unit]
Description=llama-server — LLM router mode natif
After=network.target
[Service]
Type=simple
User=${run_user}
Group=${run_group}
ExecStart=${script_path} --start
Restart=on-failure
RestartSec=5
StandardOutput=journal
StandardError=journal
Environment="PATH=$HOME/.local/bin:/usr/local/bin:/usr/bin:/bin"
Environment="HOME=$HOME"
[Install]
WantedBy=multi-user.target
SERVICE

  sudo systemctl daemon-reload
  sudo systemctl enable "$SERVICE_NAME"

  info "✅ Service installé : $SERVICE_FILE"
  info " Démarrage automatique au boot activé (multi-user.target)"
  info "Commandes utiles :"
  info " sudo systemctl start $SERVICE_NAME"
  info " sudo systemctl stop $SERVICE_NAME"
  info " sudo systemctl restart $SERVICE_NAME"
  info " sudo systemctl status $SERVICE_NAME"
  info " journalctl -u $SERVICE_NAME -f"
}
#######################################
# Stop, disable and remove the systemd unit, then reload systemd.
# Globals:   SERVICE_NAME, SERVICE_FILE (read)
# Arguments: none
# Outputs:   messages through info / warn
# Returns:   0 when the unit was removed or was not installed
#######################################
cmd_uninstall_service() {
  # `systemctl is-enabled` also fails for an installed-but-disabled unit, so
  # the unit file is checked too; otherwise such a unit would never be removed.
  if [[ ! -e "$SERVICE_FILE" ]] && ! systemctl is-enabled "$SERVICE_NAME" &>/dev/null; then
    warn "Service '$SERVICE_NAME' non installé, rien à faire."
    return 0
  fi

  # Best effort: the service may simply not be running.
  sudo systemctl stop "$SERVICE_NAME" 2>/dev/null || true
  sudo systemctl disable "$SERVICE_NAME"
  sudo rm -f "$SERVICE_FILE"
  sudo systemctl daemon-reload
  info "✅ Service '$SERVICE_NAME' désinstallé."
}
# =============================================================================
# Entrypoint
# =============================================================================
#######################################
# Dispatch on the first command-line argument; no argument means --start.
# Arguments: $1 - one of --setup, --start, --install-service,
#                 --uninstall-service (optional)
#######################################
main() {
  local action="${1:-}"

  case "$action" in
    --setup)
      cmd_setup
      ;;
    "" | --start)
      cmd_start
      ;;
    --install-service)
      cmd_install_service
      ;;
    --uninstall-service)
      cmd_uninstall_service
      ;;
    *)
      error "Usage : $0 [--setup|--start|--install-service|--uninstall-service]"
      ;;
  esac
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment