Benchmarks output throughput (tokens/sec) for local AI inference: Ollama, LM Studio, Foundry Local, llama.cpp, and vLLM (HuggingFace via Docker).
#Requires -Version 7
<#
.SYNOPSIS
Benchmarks output throughput (tokens/sec) for local AI inference providers.
.DESCRIPTION
Sends a standardised prompt to Ollama, LM Studio, Foundry Local, llama.cpp,
and/or vLLM (HuggingFace), measures tokens generated per second using the
OpenAI-compatible /v1/chat/completions API, and outputs a Markdown summary table.
Each provider is tested $Runs times; results are averaged.
Providers that are not reachable are skipped automatically.
.PARAMETER Runs
Number of timed test runs per provider/model. Default: 3.
.PARAMETER MaxTokens
Maximum output tokens per run. Keep this fixed so runs are comparable.
Default: 400.
.PARAMETER TimeoutSec
HTTP timeout per request in seconds. Default: 120.
.EXAMPLE
# Test all running providers with default settings
.\Measure-LocalAIThroughput.ps1
.EXAMPLE
# More runs for a tighter average
.\Measure-LocalAIThroughput.ps1 -Runs 5
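.EXAMPLE
# Also benchmark the llama.cpp Vulkan variants (requires the Vulkan build)
.\Measure-LocalAIThroughput.ps1 -VulkanSplit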
.NOTES
What this measures
------------------
Output throughput = completion_tokens / elapsed_seconds
This is the metric that determines how fast you see text appear in the CLI.
It is measured from "request sent" to "full response received", so it
includes model decode time; network latency is negligible because all endpoints are local.
What this does NOT measure
--------------------------
- Time-to-first-token (TTFT) — not available from non-streaming responses
- Prompt-processing (prefill) speed
- Quality / correctness of the output
Test prompt
-----------
The prompt asks for a ~200-350 token PowerShell function so that each run
generates a meaningful, consistent amount of output across all providers.
A warm-up run is discarded before the timed runs start.
#>
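# Worked example of the metric above: a run that returns 400 completion tokens
# in 8.2 s scores 400 / 8.2 ≈ 48.8 tok/s (illustrative figures, not measured).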
param(
    [int] $Runs = 3,
    [int] $MaxTokens = 400,
    [int] $TimeoutSec = 120,
    # Add Vulkan tensor-split providers (llama.cpp Vulkan build, NVIDIA+Intel Arc).
    # Tests whether offloading layers to the Intel Arc 140T iGPU improves throughput.
    # Devices: Vulkan0=Intel Arc 140T (18 GB shared), Vulkan1=NVIDIA RTX PRO 500 (6 GB)
    [switch] $VulkanSplit
)
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'
# ---------------------------------------------------------------------------
# Logging — one timestamped file per invocation, written to .\logs\
# ---------------------------------------------------------------------------
$LogDir = Join-Path $PSScriptRoot 'logs'
$null = New-Item -ItemType Directory -Path $LogDir -Force
$LogFile = Join-Path $LogDir "benchmark-$(Get-Date -Format 'yyyy-MM-dd_HH-mm-ss')_runs${Runs}_tok${MaxTokens}.txt"
Start-Transcript -Path $LogFile -NoClobber | Out-Null
Write-Host " 📄 Logging to: $LogFile" -ForegroundColor DarkGray
# ---------------------------------------------------------------------------
# Test prompt — produces ~200-350 tokens of code output consistently
# ---------------------------------------------------------------------------
$TestMessages = @(
    @{
        role    = 'system'
        content = 'You are a helpful assistant. Answer concisely and completely.'
    }
    @{
        role    = 'user'
        content = @'
Write a PowerShell function called Get-DiskUsage that:
- Accepts a -Path parameter (default: current directory)
- Recursively enumerates all files, skipping inaccessible directories
- Groups files by extension
- Returns a [PSCustomObject[]] sorted by total size descending, with columns:
  Extension, FileCount, TotalSizeMB (rounded to 2 decimal places)
Include parameter validation and a short comment block.
'@
    }
)
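# For reference, these messages land verbatim in the OpenAI-style payload that
# Invoke-TimedCompletion builds further down, roughly:
#   { "model": "<id>", "messages": [ {"role":"system",...}, {"role":"user",...} ],
#     "max_tokens": 400, "temperature": 0.2, "stream": false }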
# ---------------------------------------------------------------------------
# Provider definitions
# ---------------------------------------------------------------------------
$Providers = [System.Collections.Generic.List[PSCustomObject]]::new()
# ---------------------------------------------------------------------------
# llama.cpp server — port must be defined before the provider list
# ---------------------------------------------------------------------------
$LlamaServerExe = "$env:LOCALAPPDATA\llama.cpp\llama-server.exe"
$LlamaVulkanExe = "$env:LOCALAPPDATA\llama.cpp-vulkan\llama-server.exe"
$LlamaServerPort = 8088
$script:LlamaServerProcess = $null
# Ollama — 7B instruct (32k context via custom Modelfile)
$Providers.Add([PSCustomObject]@{
    Name       = 'Ollama'
    BaseUrl    = 'http://localhost:11434/v1'
    Model      = 'qwen2.5:7b-instruct-32k'
    Headers    = @{}
    TimeoutSec = 120
})
# Ollama — 1.5B instruct (apples-to-apples with Foundry Local)
# Prerequisite: ollama pull qwen2.5:1.5b-instruct
# ollama create qwen2.5:1.5b-instruct-32k -f Modelfile (FROM qwen2.5:1.5b-instruct / PARAMETER num_ctx 32768)
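# A minimal Modelfile for the 32k-context variants (sketch based on the line above):
#   FROM qwen2.5:1.5b-instruct
#   PARAMETER num_ctx 32768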
$Providers.Add([PSCustomObject]@{
    Name       = 'Ollama (1.5B)'
    BaseUrl    = 'http://localhost:11434/v1'
    Model      = 'qwen2.5:1.5b-instruct-32k'
    Headers    = @{}
    TimeoutSec = 60
})
# LM Studio 7B (IPv4 address — avoids IPv6 ::1 resolution issue on Windows)
$Providers.Add([PSCustomObject]@{
    Name       = 'LM Studio'
    BaseUrl    = 'http://127.0.0.1:1234/v1'
    Model      = 'local-model'
    LoadModel  = 'qwen2.5-7b-instruct@q4_k_s'
    Headers    = @{}
    TimeoutSec = 200 # first inference after load can take ~100s on NVIDIA+CPU split
})
# LM Studio — 1.5B Coder instruct (apples-to-apples with Foundry Local)
# Prerequisite: download qwen2.5-coder-1.5b-instruct (Q8_0) in LM Studio Discover tab
$Providers.Add([PSCustomObject]@{
    Name       = 'LM Studio (1.5B)'
    BaseUrl    = 'http://127.0.0.1:1234/v1'
    Model      = 'local-model'
    LoadModel  = 'qwen2.5-coder-1.5b-instruct'
    Headers    = @{}
    TimeoutSec = 120
})
# LM Studio — Qwen3-8B Q4_K_M (fits fully in 6 GB VRAM — no CPU split needed)
# --gpu max: 4.68 GB fits entirely in 6 GB GDDR; no layer split required
# --context-length 8192: limits KV cache so model+cache fits within 6 GB
# Prerequisite: download lmstudio-community/Qwen3-8B-GGUF (Q4_K_M) in LM Studio Discover tab
$Providers.Add([PSCustomObject]@{
    Name          = 'LM Studio (Qwen3-8B)'
    BaseUrl       = 'http://127.0.0.1:1234/v1'
    Model         = 'local-model'
    LoadModel     = 'qwen3-8b'
    GpuFraction   = 'max'
    ContextLength = 8192
    Headers       = @{}
    TimeoutSec    = 300 # thinking mode: warm-up + each run can take 100-150s with 400 max_tokens
})
# Ollama — Qwen3-8B (fits fully in 6 GB VRAM with 32k context)
# Prerequisite: ollama pull qwen3:8b
$Providers.Add([PSCustomObject]@{
    Name       = 'Ollama (Qwen3-8B)'
    BaseUrl    = 'http://localhost:11434/v1'
    Model      = 'qwen3:8b'
    Headers    = @{}
    TimeoutSec = 120
})
# llama.cpp — Qwen3-8B Q4_K_M (4.68 GB, all layers on NVIDIA 6 GB GDDR, Flash Attention)
# Same GGUF as LM Studio Qwen3-8B — direct comparison without LM Studio overhead.
# ctx=8192 for benchmark comparability with LM Studio (Qwen3-8B) entry.
# For Copilot CLI use (32k context) Qwen3-8B is tight on 6GB — use Qwen2.5-7B instead.
# Prerequisite: %LOCALAPPDATA%\llama.cpp\llama-server.exe (CUDA build, see Start-LlamaServer.ps1)
$Providers.Add([PSCustomObject]@{
    Name       = 'llama.cpp (Qwen3-8B)'
    BaseUrl    = "http://127.0.0.1:${LlamaServerPort}/v1"
    Model      = 'Qwen3-8B-Q4_K_M'
    LoadModel  = "$env:USERPROFILE\.lmstudio\models\lmstudio-community\Qwen3-8B-GGUF\Qwen3-8B-Q4_K_M.gguf"
    NGL        = 999
    CtxSize    = 8192
    CacheTypeK = 'q8_0'
    CacheTypeV = 'q8_0'
    Headers    = @{}
    TimeoutSec = 300 # thinking mode: each run can take 100-150s with 400 max_tokens
    IsLlama    = $true
})
# llama.cpp — Qwen2.5-7B Q4_K_S (4.15 GB, same quant as LM Studio 7B entry)
# 32k context with q8_0 KV cache: 5.53 GB total, 450 MB headroom. No tok/s penalty vs 8k.
# Best choice for Copilot CLI (system prompt ~22k tokens requires >= 24k context).
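# Rough VRAM arithmetic behind those numbers: 4.15 GB weights + ~1.38 GB q8_0 KV
# cache at 32k context ≈ 5.53 GB, leaving ~0.45 GB of the 6 GB card as headroom.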
$Providers.Add([PSCustomObject]@{
    Name       = 'llama.cpp (Qwen2.5-7B)'
    BaseUrl    = "http://127.0.0.1:${LlamaServerPort}/v1"
    Model      = 'Qwen2.5-7B-Instruct-Q4_K_S'
    LoadModel  = "$env:USERPROFILE\.lmstudio\models\bartowski\Qwen2.5-7B-Instruct-GGUF\Qwen2.5-7B-Instruct-Q4_K_S.gguf"
    NGL        = 999
    CtxSize    = 32768
    CacheTypeK = 'q8_0'
    CacheTypeV = 'q8_0'
    Headers    = @{}
    TimeoutSec = 180
    IsLlama    = $true
})
# ---------------------------------------------------------------------------
# Vulkan split providers — only added when -VulkanSplit switch is passed.
# Vulkan device order: Vulkan0=Intel Arc 140T (18 GB shared), Vulkan1=NVIDIA (6 GB).
# Adds two variants: NVIDIA-only via Vulkan (overhead vs CUDA baseline) and a
# 1:4 tensor split (Arc:NVIDIA) to measure whether iGPU offload helps throughput.
# ---------------------------------------------------------------------------
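# The two entries below boil down to manual invocations like (sketch, paths elided):
#   llama-server.exe --model <gguf> --ctx-size 32768 --cache-type-k q8_0 --cache-type-v q8_0 --main-gpu 1         # NVIDIA-only
#   llama-server.exe --model <gguf> --ctx-size 32768 --cache-type-k q8_0 --cache-type-v q8_0 --tensor-split 1,4  # ~20% Arc / ~80% NVIDIA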
if ($VulkanSplit) {
    # NVIDIA-only via Vulkan — same model/context as CUDA entry, but Vulkan backend.
    # --main-gpu 1 pins all layers to Vulkan1 (NVIDIA); Arc is not involved.
    # Establishes the Vulkan overhead relative to the CUDA result above.
    $Providers.Add([PSCustomObject]@{
        Name          = 'llama.cpp (Vulkan/NVIDIA)'
        BaseUrl       = "http://127.0.0.1:${LlamaServerPort}/v1"
        Model         = 'Qwen2.5-7B-Instruct-Q4_K_S'
        LoadModel     = "$env:USERPROFILE\.lmstudio\models\bartowski\Qwen2.5-7B-Instruct-GGUF\Qwen2.5-7B-Instruct-Q4_K_S.gguf"
        NGL           = 999
        CtxSize       = 32768
        CacheTypeK    = 'q8_0'
        CacheTypeV    = 'q8_0'
        MainGpu       = 1 # Vulkan1 = NVIDIA RTX PRO 500
        TensorSplit   = ''
        Headers       = @{}
        TimeoutSec    = 180
        IsLlamaVulkan = $true
    })
    # Tensor split: ~20% layers on Arc (Vulkan0), ~80% on NVIDIA (Vulkan1).
    # Arc 140T compute is slower, but this tests whether dual-GPU scheduling
    # or a larger effective memory pool improves total throughput.
    $Providers.Add([PSCustomObject]@{
        Name          = 'llama.cpp (Vulkan/split 1:4)'
        BaseUrl       = "http://127.0.0.1:${LlamaServerPort}/v1"
        Model         = 'Qwen2.5-7B-Instruct-Q4_K_S'
        LoadModel     = "$env:USERPROFILE\.lmstudio\models\bartowski\Qwen2.5-7B-Instruct-GGUF\Qwen2.5-7B-Instruct-Q4_K_S.gguf"
        NGL           = 999
        CtxSize       = 32768
        CacheTypeK    = 'q8_0'
        CacheTypeV    = 'q8_0'
        MainGpu       = -1
        TensorSplit   = '1,4' # Vulkan0/Arc:Vulkan1/NVIDIA = 1:4 → ~20% Arc / ~80% NVIDIA
        Headers       = @{}
        TimeoutSec    = 240 # inter-GPU sync may add latency on first run
        IsLlamaVulkan = $true
    })
}
# Foundry Local — port is dynamic, detected at runtime
function Get-FoundryUrl {
    try {
        $out = foundry service status 2>&1
        $match = ($out | Select-String -Pattern 'http://[\d.:]+').Matches
        if ($match.Count -gt 0) {
            return $match[0].Value.TrimEnd('/')
        }
    } catch { }
    return $null
}
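# 'foundry service status' prints the endpoint URL in its output; the regex above
# grabs the first http://host:port match. The port is assigned dynamically, so it
# differs between machines and restarts.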
$foundryUrl = Get-FoundryUrl
if ($foundryUrl) {
    $Providers.Add([PSCustomObject]@{
        Name       = 'Foundry Local'
        BaseUrl    = "$foundryUrl/v1"
        Model      = 'qwen2.5-1.5b-instruct-cuda-gpu:4'
        Headers    = @{}
        TimeoutSec = 60
    })
} else {
    Write-Host "⚠ Foundry Local service not detected — skipping." -ForegroundColor DarkYellow
}
# vLLM (HuggingFace) — Docker container, OpenAI-compatible API on port 8000
# Prerequisite: docker pull vllm/vllm-openai:latest
# .\Start-HuggingFaceVllm.ps1 (starts the container, downloads model on first run)
# Model: Qwen/Qwen2.5-7B-Instruct-AWQ — same family as Ollama/LM Studio, AWQ 4-bit quantisation
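# Roughly what Start-HuggingFaceVllm.ps1 runs (a sketch — that script is authoritative):
#   docker run -d --name vllm-server --gpus all -p 8000:8000 `
#       vllm/vllm-openai:latest --model Qwen/Qwen2.5-7B-Instruct-AWQ --quantization awq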
$Providers.Add([PSCustomObject]@{
    Name       = 'vLLM (HuggingFace)'
    BaseUrl    = 'http://127.0.0.1:8000/v1'
    Model      = 'Qwen/Qwen2.5-7B-Instruct-AWQ'
    Headers    = @{}
    TimeoutSec = 600
    IsVllm     = $true
})
# ---------------------------------------------------------------------------
# LM Studio CLI (lms.exe) — used to load/unload the model between provider runs
# so Ollama and LM Studio don't compete for the same VRAM.
# ---------------------------------------------------------------------------
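# Example of the underlying CLI calls (same flags Start-LMStudioServer passes below):
#   & $LmsExe unload --all
#   & $LmsExe load qwen2.5-7b-instruct@q4_k_s --gpu 0.78 --context-length 32768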
$LmsExe = "$env:LOCALAPPDATA\Programs\LM Studio\resources\app\.webpack\lms.exe"
$LmsAvailable = Test-Path $LmsExe
# ---------------------------------------------------------------------------
# vLLM (Docker) management helpers
# ---------------------------------------------------------------------------
$VllmContainerName = 'vllm-server'
function Test-VllmRunning {
    $state = docker inspect $VllmContainerName --format '{{.State.Status}}' 2>&1
    return ($state -eq 'running')
}
function Stop-VllmContainer {
    if (Test-VllmRunning) {
        Write-Host " ⏹ Stopping vLLM container to free VRAM..." -ForegroundColor DarkYellow
        docker stop $VllmContainerName 2>&1 | Out-Null
        docker rm $VllmContainerName 2>&1 | Out-Null
        Write-Host " ✔ vLLM container stopped." -ForegroundColor DarkGray
    }
}
function Start-VllmContainer {
    param([int]$WaitSec = 180)
    $scriptPath = Join-Path $PSScriptRoot 'Start-HuggingFaceVllm.ps1'
    if (-not (Test-Path $scriptPath)) {
        Write-Host " ⚠ Start-HuggingFaceVllm.ps1 not found at '$scriptPath' — cannot auto-start vLLM." -ForegroundColor DarkYellow
        return $false
    }
    & $scriptPath -WaitSec $WaitSec
    return ($LASTEXITCODE -eq 0)
}
function Stop-LMStudioServer {
    # Kill LM Studio entirely rather than using lms commands — lms server stop leaves
    # the app running and holding VRAM, and lms unload hangs when the app isn't running.
    $procs = Get-Process -Name 'LM Studio' -ErrorAction SilentlyContinue
    if (-not $procs) {
        Write-Host " ⏹ LM Studio not running — nothing to stop." -ForegroundColor DarkGray
        return
    }
    Write-Host " ⏹ Killing LM Studio ($($procs.Count) processes) to free VRAM..." -ForegroundColor DarkYellow
    $procs | ForEach-Object { Stop-Process -Id $_.Id -Force -ErrorAction SilentlyContinue }
    Start-Sleep -Seconds 3
    Write-Host " ✔ LM Studio stopped." -ForegroundColor DarkGray
}
function Stop-LlamaServer {
    # Kill any running llama-server.exe process to free VRAM before another provider loads.
    $procs = @(Get-Process -Name 'llama-server' -ErrorAction SilentlyContinue)
    if ($procs.Count -eq 0) {
        Write-Host " ⏹ llama-server not running — nothing to stop." -ForegroundColor DarkGray
        return
    }
    Write-Host " ⏹ Stopping llama-server ($($procs.Count) process(es)) to free VRAM..." -ForegroundColor DarkYellow
    $procs | ForEach-Object { Stop-Process -Id $_.Id -Force -ErrorAction SilentlyContinue }
    Start-Sleep -Seconds 2
    Write-Host " ✔ llama-server stopped." -ForegroundColor DarkGray
}
function Start-LlamaServerForProvider {
    param(
        [string] $ModelPath,
        [string] $ModelAlias,
        [int] $NGL = 999,
        [int] $CtxSize = 32768,
        [string] $CacheTypeK = 'q8_0',
        [string] $CacheTypeV = 'q8_0',
        [int] $WaitSec = 120
    )
    if (-not (Test-Path $LlamaServerExe)) {
        Write-Host " ⚠ llama-server.exe not found at '$LlamaServerExe' — cannot start llama.cpp." -ForegroundColor DarkYellow
        return $false
    }
    if (-not (Test-Path $ModelPath)) {
        Write-Host " ⚠ Model file not found: '$ModelPath' — skipping." -ForegroundColor DarkYellow
        return $false
    }
    $serverArgs = @(
        '--model', $ModelPath,
        '--alias', $ModelAlias,
        '--n-gpu-layers', $NGL.ToString(),
        '--ctx-size', $CtxSize.ToString(),
        '--cache-type-k', $CacheTypeK,
        '--cache-type-v', $CacheTypeV,
        '--host', '127.0.0.1',
        '--port', $LlamaServerPort.ToString(),
        '--flash-attn', 'on',
        '--log-disable'
    )
    Write-Host " ▶ Starting llama-server (model: $ModelAlias, NGL=$NGL, ctx=$CtxSize, kv=$CacheTypeK)..." -ForegroundColor DarkYellow
    $script:LlamaServerProcess = Start-Process -FilePath $LlamaServerExe -ArgumentList $serverArgs -PassThru -WindowStyle Hidden
    Write-Host " PID: $($script:LlamaServerProcess.Id)" -ForegroundColor DarkGray
    $deadline = [System.DateTime]::Now.AddSeconds($WaitSec)
    while ([System.DateTime]::Now -lt $deadline) {
        if ($script:LlamaServerProcess.HasExited) {
            Write-Host " ✗ llama-server exited unexpectedly." -ForegroundColor Red
            return $false
        }
        try {
            $resp = Invoke-RestMethod "http://127.0.0.1:${LlamaServerPort}/v1/models" -TimeoutSec 3 -ErrorAction Stop
            if ($resp.data.Count -gt 0) {
                Write-Host " ✔ llama-server ready — model: $($resp.data[0].id)" -ForegroundColor Green
                return $true
            }
        } catch { }
        Start-Sleep -Seconds 3
    }
    Write-Host " ✗ llama-server did not become ready within ${WaitSec}s." -ForegroundColor Red
    return $false
}
function Start-LlamaVulkanServerForProvider {
    param(
        [string] $ModelPath,
        [string] $ModelAlias,
        [int] $NGL = 999,
        [int] $CtxSize = 32768,
        [string] $CacheTypeK = 'q8_0',
        [string] $CacheTypeV = 'q8_0',
        [int] $MainGpu = -1,        # -1 = not set; >=0 pins all layers to that Vulkan device
        [string] $TensorSplit = '', # '' = not set; 'a,b' splits layers proportionally
        [int] $WaitSec = 120
    )
    if (-not (Test-Path $LlamaVulkanExe)) {
        Write-Host " ⚠ Vulkan llama-server.exe not found at '$LlamaVulkanExe'" -ForegroundColor DarkYellow
        return $false
    }
    if (-not (Test-Path $ModelPath)) {
        Write-Host " ⚠ Model file not found: '$ModelPath' — skipping." -ForegroundColor DarkYellow
        return $false
    }
    $serverArgs = [System.Collections.Generic.List[string]]@(
        '--model', $ModelPath,
        '--alias', $ModelAlias,
        '--n-gpu-layers', $NGL.ToString(),
        '--ctx-size', $CtxSize.ToString(),
        '--cache-type-k', $CacheTypeK,
        '--cache-type-v', $CacheTypeV,
        '--host', '127.0.0.1',
        '--port', $LlamaServerPort.ToString(),
        '--log-disable'
        # Note: --flash-attn is CUDA-only; not passed for the Vulkan build
    )
    if ($MainGpu -ge 0) { $serverArgs.AddRange([string[]]@('--main-gpu', $MainGpu.ToString())) }
    if ($TensorSplit -ne '') { $serverArgs.AddRange([string[]]@('--tensor-split', $TensorSplit)) }
    $label = if ($TensorSplit -ne '') { "split=$TensorSplit" } elseif ($MainGpu -ge 0) { "main-gpu=$MainGpu" } else { 'default' }
    Write-Host " ▶ Starting Vulkan llama-server ($label, ctx=$CtxSize, kv=$CacheTypeK)..." -ForegroundColor DarkYellow
    $script:LlamaServerProcess = Start-Process -FilePath $LlamaVulkanExe -ArgumentList $serverArgs -PassThru -WindowStyle Hidden
    Write-Host " PID: $($script:LlamaServerProcess.Id)" -ForegroundColor DarkGray
    $deadline = [System.DateTime]::Now.AddSeconds($WaitSec)
    while ([System.DateTime]::Now -lt $deadline) {
        if ($script:LlamaServerProcess.HasExited) {
            Write-Host " ✗ Vulkan llama-server exited unexpectedly." -ForegroundColor Red
            return $false
        }
        try {
            $resp = Invoke-RestMethod "http://127.0.0.1:${LlamaServerPort}/v1/models" -TimeoutSec 3 -ErrorAction Stop
            if ($resp.data.Count -gt 0) {
                Write-Host " ✔ Vulkan llama-server ready — model: $($resp.data[0].id)" -ForegroundColor Green
                return $true
            }
        } catch { }
        Start-Sleep -Seconds 3
    }
    Write-Host " ✗ Vulkan llama-server did not become ready within ${WaitSec}s." -ForegroundColor Red
    return $false
}
function Start-LMStudioServer {
    param(
        [string]$ModelSearchString,
        [int]$WaitSec = 180,
        [string]$GpuFraction = '0.78',
        [int]$ContextLength = 32768
    )
    if (-not $LmsAvailable) {
        Write-Host " ⚠ lms.exe not found at expected path — cannot auto-start LM Studio." -ForegroundColor DarkYellow
        return $null # callers test for $null, not $false
    }
    try {
        # Launch the LM Studio app if it isn't already running — lms commands require it
        if (-not (Get-Process -Name 'LM Studio' -ErrorAction SilentlyContinue)) {
            Write-Host " ▶ Launching LM Studio app..." -ForegroundColor DarkYellow
            Start-Process "$env:LOCALAPPDATA\Programs\LM Studio\LM Studio.exe"
            Start-Sleep -Seconds 5 # give the app time to initialise before lms commands
        }
        Write-Host " ▶ Starting LM Studio server..." -ForegroundColor DarkYellow
        Start-Process -FilePath $LmsExe -ArgumentList 'server', 'start' -NoNewWindow -Wait
        Write-Host " ▶ Unloading any existing models (avoids :2 duplicate instance)..." -ForegroundColor DarkGray
        Start-Process -FilePath $LmsExe -ArgumentList @('unload', '--all') -NoNewWindow -Wait
        Write-Host " ▶ Loading model '$ModelSearchString' (polling until ready)..." -ForegroundColor DarkYellow
        # Use -Wait so the script blocks here until lms load exits cleanly.
        # The exact model identifier avoids the interactive "multiple matches" prompt
        # that caused lms load to hang indefinitely in previous runs.
        Start-Process -FilePath $LmsExe -ArgumentList @('load', $ModelSearchString, '--gpu', $GpuFraction, '--context-length', $ContextLength) -NoNewWindow -Wait
        # Quick sanity-check poll: verify the server is serving the model before we proceed
        $deadline = [System.DateTime]::Now.AddSeconds($WaitSec)
        while ([System.DateTime]::Now -lt $deadline) {
            try {
                $resp = Invoke-RestMethod 'http://127.0.0.1:1234/v1/models' -TimeoutSec 3 -ErrorAction Stop
                if ($resp.data.Count -gt 0) {
                    $loadedId = $resp.data[0].id
                    Write-Host " ✔ LM Studio model '$loadedId' loaded and ready." -ForegroundColor Green
                    # Return the real model ID so the caller can use it in API requests
                    return $loadedId
                }
            } catch { }
            Write-Host " ⏳ Waiting for model to become available..." -ForegroundColor DarkGray
            Start-Sleep -Seconds 3
        }
        Write-Host " ✗ LM Studio model did not load within ${WaitSec}s." -ForegroundColor Red
        return $null
    } catch {
        Write-Host " ✗ Failed to start LM Studio: $_" -ForegroundColor Red
        return $null
    }
}
# ---------------------------------------------------------------------------
# Helper: test connectivity to a provider
# ---------------------------------------------------------------------------
function Test-ProviderReachable([string]$BaseUrl) {
    try {
        $null = Invoke-WebRequest "$BaseUrl/models" -TimeoutSec 3 -ErrorAction Stop
        return $true
    } catch {
        return $false
    }
}
# ---------------------------------------------------------------------------
# Helper: run a single timed completion request
# Returns [PSCustomObject] with ElapsedSec, CompletionTokens, TokPerSec
# Returns $null if the call failed
# ---------------------------------------------------------------------------
function Invoke-TimedCompletion {
    param(
        [string] $BaseUrl,
        [string] $Model,
        [hashtable] $Headers,
        [object[]] $Messages,
        [int] $MaxTokens,
        [int] $TimeoutSec
    )
    $body = @{
        model       = $Model
        messages    = $Messages
        max_tokens  = $MaxTokens
        temperature = 0.2 # low temperature → consistent output length across runs
        stream      = $false
    } | ConvertTo-Json -Depth 5
    $sw = [System.Diagnostics.Stopwatch]::StartNew()
    try {
        $response = Invoke-RestMethod `
            -Uri "$BaseUrl/chat/completions" `
            -Method Post `
            -Headers ($Headers + @{ 'Content-Type' = 'application/json' }) `
            -Body $body `
            -TimeoutSec $TimeoutSec `
            -ErrorAction Stop
    } catch {
        Write-Host " ERROR: $_" -ForegroundColor Red
        return $null
    }
    $sw.Stop()
    $elapsed = $sw.Elapsed.TotalSeconds
    # Some providers (e.g. Foundry Local) omit usage; fall back to a character-count estimate.
    # ~4 chars/token is a rough but consistent approximation for English + code output.
    $completionTokens = try { [int]$response.usage.completion_tokens } catch { 0 }
    if ($completionTokens -le 0) {
        $content = try { $response.choices[0].message.content } catch { '' }
        $completionTokens = [math]::Max(1, [int][math]::Round($content.Length / 4))
        Write-Host " (usage field absent — token count estimated from content length)" `
            -ForegroundColor DarkGray -NoNewline
    }
    $tokPerSec = if ($elapsed -gt 0) { [math]::Round($completionTokens / $elapsed, 1) } else { 0 }
    return [PSCustomObject]@{
        ElapsedSec       = [math]::Round($elapsed, 2)
        CompletionTokens = $completionTokens
        TokPerSec        = $tokPerSec
    }
}
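# Example (sketch): one timed call against a local Ollama endpoint, reusing the
# benchmark prompt defined above:
#   Invoke-TimedCompletion -BaseUrl 'http://localhost:11434/v1' -Model 'qwen2.5:7b-instruct-32k' `
#       -Headers @{} -Messages $TestMessages -MaxTokens 400 -TimeoutSec 120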
# ---------------------------------------------------------------------------
# Main benchmark loop
# ---------------------------------------------------------------------------
$AllResults = [System.Collections.Generic.List[PSCustomObject]]::new()
foreach ($provider in $Providers) {
    Write-Host ""
    Write-Host "━━━ $($provider.Name) [$($provider.BaseUrl)]" -ForegroundColor Cyan
    # --- VRAM management between providers ---
    # Ollama, LM Studio, vLLM, and llama.cpp all compete for the NVIDIA 6 GB VRAM.
    # Only one can hold the model at a time.
    if ($provider.Name -like 'Ollama*') {
        # Ensure LM Studio, vLLM, and llama-server aren't holding VRAM before Ollama loads
        Stop-LMStudioServer
        Stop-VllmContainer
        Stop-LlamaServer
    }
    elseif ($provider.Name -like 'LM Studio*') {
        Stop-VllmContainer
        Stop-LlamaServer
        $gpuFrac = if ($provider.PSObject.Properties['GpuFraction']) { $provider.GpuFraction } else { '0.78' }
        $ctxLen = if ($provider.PSObject.Properties['ContextLength']) { [int]$provider.ContextLength } else { 32768 }
        $loadedModelId = Start-LMStudioServer -ModelSearchString $provider.LoadModel -GpuFraction $gpuFrac -ContextLength $ctxLen
        if ($null -eq $loadedModelId) {
            Write-Host " Skipping LM Studio — server could not be started." -ForegroundColor DarkYellow
            continue
        }
        # Use the actual model ID returned by the server (e.g. "qwen2.5-7b-instruct@q5_k_m")
        # rather than the placeholder "local-model" — LM Studio now enforces a valid identifier
        $provider.Model = $loadedModelId
    }
    elseif ($provider.PSObject.Properties['IsVllm'] -and $provider.IsVllm) {
        # vLLM: ensure LM Studio, llama-server, and Ollama are not holding VRAM.
        Stop-LMStudioServer
        Stop-LlamaServer
        # Unload any loaded Ollama models from VRAM (ollama stop <model> releases GPU memory)
        try {
            ollama ps 2>$null | Select-Object -Skip 1 |
                ForEach-Object { ollama stop (($_ -split '\s+')[0]) 2>&1 | Out-Null }
        } catch { }
        if (-not (Test-VllmRunning)) {
            $started = Start-VllmContainer -WaitSec 300
            if (-not $started) {
                Write-Host " Skipping vLLM — container could not be started." -ForegroundColor DarkYellow
                continue
            }
        }
    }
    elseif ($provider.PSObject.Properties['IsLlama'] -and $provider.IsLlama) {
        # llama.cpp: stop LM Studio, vLLM, Ollama, and any existing llama-server instance
        Stop-LMStudioServer
        Stop-VllmContainer
        try {
            ollama ps 2>$null | Select-Object -Skip 1 |
                ForEach-Object { ollama stop (($_ -split '\s+')[0]) 2>&1 | Out-Null }
        } catch { }
        Stop-LlamaServer # kill previous model if a different one was loaded
        $ngl = if ($provider.PSObject.Properties['NGL']) { [int]$provider.NGL } else { 999 }
        $ctxSize = if ($provider.PSObject.Properties['CtxSize']) { [int]$provider.CtxSize } else { 32768 }
        $cacheTypeK = if ($provider.PSObject.Properties['CacheTypeK']) { $provider.CacheTypeK } else { 'q8_0' }
        $cacheTypeV = if ($provider.PSObject.Properties['CacheTypeV']) { $provider.CacheTypeV } else { 'q8_0' }
        $started = Start-LlamaServerForProvider `
            -ModelPath $provider.LoadModel `
            -ModelAlias $provider.Model `
            -NGL $ngl `
            -CtxSize $ctxSize `
            -CacheTypeK $cacheTypeK `
            -CacheTypeV $cacheTypeV
        if (-not $started) {
            Write-Host " Skipping llama.cpp — server could not be started." -ForegroundColor DarkYellow
            continue
        }
    }
    elseif ($provider.PSObject.Properties['IsLlamaVulkan'] -and $provider.IsLlamaVulkan) {
        # Vulkan build: stop all other backends, then start the Vulkan llama-server
        Stop-LMStudioServer
        Stop-VllmContainer
        try {
            ollama ps 2>$null | Select-Object -Skip 1 |
                ForEach-Object { ollama stop (($_ -split '\s+')[0]) 2>&1 | Out-Null }
        } catch { }
        Stop-LlamaServer
        $ngl = if ($provider.PSObject.Properties['NGL']) { [int]$provider.NGL } else { 999 }
        $ctxSize = if ($provider.PSObject.Properties['CtxSize']) { [int]$provider.CtxSize } else { 32768 }
        $cacheTypeK = if ($provider.PSObject.Properties['CacheTypeK']) { $provider.CacheTypeK } else { 'q8_0' }
        $cacheTypeV = if ($provider.PSObject.Properties['CacheTypeV']) { $provider.CacheTypeV } else { 'q8_0' }
        $mainGpu = if ($provider.PSObject.Properties['MainGpu']) { [int]$provider.MainGpu } else { -1 }
        $tensorSplit = if ($provider.PSObject.Properties['TensorSplit']) { $provider.TensorSplit } else { '' }
        $started = Start-LlamaVulkanServerForProvider `
            -ModelPath $provider.LoadModel `
            -ModelAlias $provider.Model `
            -NGL $ngl `
            -CtxSize $ctxSize `
            -CacheTypeK $cacheTypeK `
            -CacheTypeV $cacheTypeV `
            -MainGpu $mainGpu `
            -TensorSplit $tensorSplit
        if (-not $started) {
            Write-Host " Skipping Vulkan llama.cpp — server could not be started." -ForegroundColor DarkYellow
            continue
        }
    }
    if (-not (Test-ProviderReachable $provider.BaseUrl)) {
        Write-Host " ✗ Not reachable — skipping." -ForegroundColor DarkYellow
        continue
    }
    Write-Host " Model : $($provider.Model) [timeout: $($provider.TimeoutSec)s]"
    # Qwen3 models think by default. Providers that set DisableThinking = $true get
    # /no_think injected into the system prompt so their runs are comparable with
    # non-thinking providers. (None of the entries above opt in — the Qwen3 results
    # were measured in thinking mode.)
    $messages = if ($provider.PSObject.Properties['DisableThinking'] -and $provider.DisableThinking) {
        Write-Host " (thinking disabled via /no_think)" -ForegroundColor DarkGray
        @(
            @{ role = 'system'; content = ($TestMessages[0].content + "`n/no_think") }
            $TestMessages[1]
        )
    } else {
        $TestMessages
    }
    Write-Host " Warm-up run (discarded)..." -NoNewline
    # Warm-up — loads the model into VRAM / warms the KV cache; result discarded
    $warmup = Invoke-TimedCompletion `
        -BaseUrl $provider.BaseUrl `
        -Model $provider.Model `
        -Headers $provider.Headers `
        -Messages $messages `
        -MaxTokens $MaxTokens `
        -TimeoutSec $provider.TimeoutSec
    if ($null -eq $warmup) {
        Write-Host " FAILED — skipping provider." -ForegroundColor Red
        continue
    }
    Write-Host " done ($($warmup.ElapsedSec)s, $($warmup.CompletionTokens) tokens)" -ForegroundColor DarkGray
    $runResults = @()
    for ($i = 1; $i -le $Runs; $i++) {
        Write-Host " Run $i/$Runs..." -NoNewline
        $r = Invoke-TimedCompletion `
            -BaseUrl $provider.BaseUrl `
            -Model $provider.Model `
            -Headers $provider.Headers `
            -Messages $messages `
            -MaxTokens $MaxTokens `
            -TimeoutSec $provider.TimeoutSec
        if ($null -eq $r) {
            Write-Host " FAILED" -ForegroundColor Red
        } else {
            Write-Host " $($r.ElapsedSec)s — $($r.CompletionTokens) tokens — $($r.TokPerSec) tok/s" -ForegroundColor Green
            $runResults += $r
        }
    }
    if ($runResults.Count -eq 0) {
        Write-Host " All runs failed — no result recorded." -ForegroundColor Red
        continue
    }
    $avgTokPerSec = [math]::Round(($runResults | Measure-Object -Property TokPerSec -Average).Average, 1)
    $avgElapsed = [math]::Round(($runResults | Measure-Object -Property ElapsedSec -Average).Average, 2)
    $avgCompletionToks = [math]::Round(($runResults | Measure-Object -Property CompletionTokens -Average).Average)
    Write-Host " ─ Average: $avgTokPerSec tok/s (${avgElapsed}s per run, ~${avgCompletionToks} tokens)" -ForegroundColor White
    $AllResults.Add([PSCustomObject]@{
        Provider         = $provider.Name
        Model            = $provider.Model
        AvgTokPerSec     = $avgTokPerSec
        AvgElapsedSec    = $avgElapsed
        AvgCompletionTok = $avgCompletionToks
        RunCount         = $runResults.Count
    })
}
# ---------------------------------------------------------------------------
# Summary table
# ---------------------------------------------------------------------------
if ($AllResults.Count -eq 0) {
    Write-Host "`n⚠ No providers produced results. Make sure at least one is running." -ForegroundColor Yellow
    exit 1
}
Write-Host ""
Write-Host "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -ForegroundColor Cyan
Write-Host " RESULTS" -ForegroundColor Cyan
Write-Host "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -ForegroundColor Cyan
$AllResults | Sort-Object AvgTokPerSec -Descending | Format-Table `
    @{Label='Provider';    Expression={$_.Provider}},
    @{Label='Model';       Expression={$_.Model}},
    @{Label='tok/s (avg)'; Expression={$_.AvgTokPerSec};     Align='Right'},
    @{Label='s/run (avg)'; Expression={$_.AvgElapsedSec};    Align='Right'},
    @{Label='tokens/run';  Expression={$_.AvgCompletionTok}; Align='Right'},
    @{Label='runs';        Expression={$_.RunCount};         Align='Right'} `
    -AutoSize
# ---------------------------------------------------------------------------
# Markdown table (copy-paste ready for the blog post)
# ---------------------------------------------------------------------------
Write-Host "━━━ Markdown table (for blog post) ━━━" -ForegroundColor Cyan
Write-Host ""
$header = "| Provider | Model | Avg tok/s | Avg s/run | Notes |"
$sep = "|---|---|---:|---:|---|"
Write-Host $header
Write-Host $sep
foreach ($r in ($AllResults | Sort-Object AvgTokPerSec -Descending)) {
    $notes = switch ($r.Provider) {
        'Ollama'                       { "NVIDIA RTX PRO 500 (CUDA), 32k context" }
        'Ollama (1.5B)'                { "NVIDIA RTX PRO 500 (CUDA), 32k context" }
        'LM Studio'                    { "--gpu 0.78 (~25/32 layers NVIDIA GDDR), 32k context" }
        'LM Studio (1.5B)'             { "NVIDIA RTX PRO 500 (CUDA full offload), 32k context" }
        'LM Studio (Qwen3-8B)'         { "NVIDIA RTX PRO 500 (CUDA full offload), 8k context, thinking mode" }
        'Foundry Local'                { "NVIDIA RTX PRO 500 (CUDA)" }
        'vLLM (HuggingFace)'           { "NVIDIA RTX PRO 500 (CUDA), AWQ-Marlin kernel, Docker, 32k context" }
        'llama.cpp (Qwen3-8B)'         { "NVIDIA RTX PRO 500 (CUDA), Flash Attention, 8k context, q8_0 KV, NGL=999, thinking mode" }
        'llama.cpp (Qwen2.5-7B)'       { "NVIDIA RTX PRO 500 (CUDA), Flash Attention, 32k context, q8_0 KV, NGL=999" }
        'llama.cpp (Vulkan/NVIDIA)'    { "NVIDIA RTX PRO 500 (Vulkan only, main-gpu=1), 32k context, q8_0 KV" }
        'llama.cpp (Vulkan/split 1:4)' { "NVIDIA+Intel Arc 140T (Vulkan tensor split 1:4 → ~20% Arc/~80% NVIDIA), 32k ctx, q8_0 KV" }
        default                        { "" }
    }
    Write-Host "| $($r.Provider) | $($r.Model) | $($r.AvgTokPerSec) | $($r.AvgElapsedSec) | $notes |"
}
Write-Host ""
Write-Host "Test: $Runs timed runs per provider (plus 1 warm-up), max_tokens=$MaxTokens, temperature=0.2"
Write-Host "Hardware: Dell Pro Max 14 MC14250, Intel Core Ultra 7 265H, NVIDIA RTX PRO 500 (6 GB), Intel Arc 140T (16 GB)"
Write-Host ""
Write-Host " 📄 Log saved: $LogFile" -ForegroundColor DarkGray
# ---------------------------------------------------------------------------
# Finalise transcript — strip VT/ANSI escape sequences for a clean plain-text log
# ---------------------------------------------------------------------------
Stop-Transcript | Out-Null
$raw = Get-Content $LogFile -Raw -ErrorAction SilentlyContinue
if ($raw) {
    $clean = $raw -replace '\x1b\[[0-9;]*[A-Za-z]', ''
    [System.IO.File]::WriteAllText($LogFile, $clean)
}