SamHatoum · May 24, 2026 07:04
diff --git a/litellm-report.html b/litellm-report.html
 <!doctype html>
 <html lang="en">
  <head>
    <meta charset="utf-8" />
    <title>LiteLLM Gateway Investigation — May 2026</title>
    <style>
      /* Theme tokens. Every color/font below is consumed through var(--…) by the
         rules in this stylesheet. */
      :root {
        /* Surfaces: page background, raised panels, zebra-striped table rows. */
        --bg: #0f1419;
        --bg-card: #1c2128;
        --bg-row: #161b22;
        /* Text: primary and de-emphasized. */
        --fg: #e6edf3;
        --fg-dim: #8b949e;
        /* Accent (links, h3, default bar fill) and status colors. */
        --accent: #58a6ff;
        --green: #3fb950;
        --red: #f85149;
        --yellow: #d29922;
        --orange: #db6d28;
        /* Hairline borders on cards, tables, and pre blocks. */
        --border: #30363d;
        /* Monospace stack used for code, pre, and numeric cells. */
        --mono: ui-monospace, 'SF Mono', 'Cascadia Mono', Menlo, Monaco, 'Roboto Mono', monospace;
      }
      /* Size every element by its border box so padding and borders are included
         in declared widths. */
      * {
        box-sizing: border-box;
      }
      /* Page shell: dark theme colors, system UI font stack, and a single
         centered column capped at 1100px. */
      body {
        /* Zero vertical margin; auto horizontal margins center the column.
           This is the only margin declaration needed: an earlier `margin: 0`
           in this rule was fully overridden by this shorthand. */
        margin: 0 auto;
        padding: 2rem;
        background: var(--bg);
        color: var(--fg);
        font:
          14px/1.5 -apple-system,
          BlinkMacSystemFont,
          'Segoe UI',
          Roboto,
          sans-serif;
        max-width: 1100px;
      }
      /* Report title; tight bottom margin so the subtitle sits directly under it. */
      h1 {
        font-size: 28px;
        margin: 0 0 4px;
      }
      /* Section heading, separated from the previous section by a top rule. */
      h2 {
        font-size: 20px;
        margin: 32px 0 12px;
        padding-top: 16px;
        border-top: 1px solid var(--border);
      }
      /* Sub-section heading, tinted with the accent color. */
      h3 {
        font-size: 16px;
        margin: 20px 0 8px;
        color: var(--accent);
      }
      /* Dimmed one-line summary shown under the h1. */
      .subtitle {
        color: var(--fg-dim);
        margin-bottom: 24px;
      }
      /* Generic bordered panel. Not referenced by the markup in this file. */
      .card {
        background: var(--bg-card);
        border: 1px solid var(--border);
        border-radius: 8px;
        padding: 16px 20px;
        margin: 12px 0;
      }
      /* Grid container; combine with .grid-3 or .grid-4 to set the column count. */
      .grid {
        display: grid;
        gap: 12px;
      }
      /* Three equal columns. Not referenced by the markup in this file. */
      .grid-3 {
        grid-template-columns: repeat(3, 1fr);
      }
      /* Four equal columns; used for the headline stat tiles. */
      .grid-4 {
        grid-template-columns: repeat(4, 1fr);
      }
      /* Headline stat tile: a small label above a large value. */
      .stat {
        background: var(--bg-card);
        border: 1px solid var(--border);
        border-radius: 6px;
        padding: 12px 16px;
      }
      .stat-label {
        color: var(--fg-dim);
        font-size: 11px;
        text-transform: uppercase;
        letter-spacing: 0.5px;
      }
      .stat-value {
        font-size: 22px;
        font-weight: 600;
        margin-top: 4px;
        /* Fixed-width digits so numbers of equal length occupy equal width. */
        font-variant-numeric: tabular-nums;
      }
      /* Status tints for a stat value. */
      .stat-value.good {
        color: var(--green);
      }
      .stat-value.bad {
        color: var(--red);
      }
      .stat-value.warn {
        color: var(--yellow);
      }
      /* Data tables: full width, collapsed borders, compact type. */
      table {
        width: 100%;
        border-collapse: collapse;
        margin: 8px 0;
        font-size: 13px;
      }
      /* Header cells are left-aligned by default; numeric columns opt out via
         .num below. */
      th {
        text-align: left;
        padding: 8px 10px;
        background: var(--bg-card);
        border-bottom: 2px solid var(--border);
        font-weight: 600;
      }
      td {
        padding: 8px 10px;
        border-bottom: 1px solid var(--border);
        /* Fixed-width digits so figures line up down a column. */
        font-variant-numeric: tabular-nums;
      }
      /* Zebra striping. The selector counts rows within their parent, so the
         single thead row (odd) is unaffected and tbody striping starts at row 2. */
      tr:nth-child(even) td {
        background: var(--bg-row);
      }
      /* Numeric columns: right-align header and data cells together. The markup
         puts class="num" on <th> as well as <td>; without th.num the headers
         would stay left-aligned above right-aligned figures. */
      th.num,
      td.num {
        text-align: right;
      }
      /* Only data cells switch to the monospace stack; headers keep the UI font. */
      td.num {
        font-family: var(--mono);
      }
      /* Status tints for individual cells. */
      td.good {
        color: var(--green);
      }
      td.bad {
        color: var(--red);
      }
      td.warn {
        color: var(--yellow);
      }
      /* Inline code chip. */
      code {
        background: var(--bg-card);
        padding: 1px 6px;
        border-radius: 3px;
        font-family: var(--mono);
        font-size: 12.5px;
        /* Literal color; this one is not defined as a :root custom property. */
        color: #ffa657;
      }
      /* Multi-line config/snippet block; scrolls horizontally instead of wrapping. */
      pre {
        background: var(--bg-card);
        border: 1px solid var(--border);
        border-radius: 6px;
        padding: 12px 16px;
        overflow-x: auto;
        font-family: var(--mono);
        font-size: 12px;
        line-height: 1.45;
      }
      /* CSS-only horizontal bar chart. Each .bar-row is label | track | value. */
      .bar-chart {
        margin: 8px 0;
      }
      .bar-row {
        display: grid;
        /* Fixed 220px label column, flexible track, fixed 80px value column. */
        grid-template-columns: 220px 1fr 80px;
        align-items: center;
        gap: 12px;
        margin: 4px 0;
        font-size: 12.5px;
      }
      /* Full-width rail that clips the fill to rounded corners. */
      .bar-track {
        background: var(--bg-card);
        border-radius: 3px;
        overflow: hidden;
        height: 22px;
        position: relative;
      }
      /* The bar itself. Its width is supplied per bar by an inline
         style="width: N%" in the markup; accent color unless a modifier is added. */
      .bar-fill {
        height: 100%;
        background: var(--accent);
        /* NOTE(review): nothing in this file changes widths after load, so this
           transition only takes effect if widths are updated dynamically. */
        transition: width 0.3s;
      }
      /* Color modifiers for .bar-fill. */
      .bar-fill.green {
        background: var(--green);
      }
      .bar-fill.red {
        background: var(--red);
      }
      .bar-fill.yellow {
        background: var(--yellow);
      }
      .bar-fill.orange {
        background: var(--orange);
      }
      /* Right-aligned numeric label at the end of each row. */
      .bar-value {
        text-align: right;
        font-family: var(--mono);
        color: var(--fg-dim);
      }
      /* Pill label. The base class sets shape only; with no color modifier the
         pill has no background and inherits the surrounding text color. */
      .tag {
        display: inline-block;
        padding: 2px 8px;
        border-radius: 12px;
        font-size: 11px;
        font-weight: 500;
        margin-right: 4px;
      }
      /* Color modifiers: dark tinted background paired with the matching status
         color for the text. */
      .tag-green {
        background: #163a23;
        color: var(--green);
      }
      .tag-red {
        background: #3a1616;
        color: var(--red);
      }
      .tag-yellow {
        background: #3a2f16;
        color: var(--yellow);
      }
      .tag-blue {
        background: #163a4d;
        color: var(--accent);
      }
      /* Highlighted note with a colored left edge (accent by default). */
      .callout {
        background: var(--bg-card);
        border-left: 3px solid var(--accent);
        padding: 12px 16px;
        margin: 12px 0;
        border-radius: 4px;
      }
      /* Modifiers change only the left-edge color. */
      .callout.warn {
        border-left-color: var(--yellow);
      }
      .callout.success {
        border-left-color: var(--green);
      }
      /* Links use the accent color. */
      a {
        color: var(--accent);
      }
      /* Two-column table of contents. No element in this file uses the toc
         class. */
      .toc {
        columns: 2;
        gap: 24px;
        margin: 12px 0;
      }
      .toc a {
        display: block;
        padding: 3px 0;
        text-decoration: none;
      }
      /* Dashed divider; used before the page footer. */
      hr.dim {
        border: 0;
        border-top: 1px dashed var(--border);
        margin: 24px 0;
      }
      /* Native disclosure widget styled like a small card. */
      details {
        background: var(--bg-card);
        border: 1px solid var(--border);
        border-radius: 6px;
        padding: 8px 12px;
        margin: 8px 0;
      }
      /* Pointer cursor on the toggle row. */
      summary {
        cursor: pointer;
        font-weight: 500;
      }
    </style>
  </head>
  <body>
    <h1>LiteLLM Gateway Investigation</h1>
    <p class="subtitle">
      on.auto — May 23–24, 2026. Diagnose intermittent timeouts → reproduce → measure → fix → promote to live.
    </p>

    <div class="grid grid-4">
      <div class="stat">
        <div class="stat-label">Phases tested</div>
        <div class="stat-value">9</div>
      </div>
      <div class="stat">
        <div class="stat-label">Total bench requests</div>
        <div class="stat-value">~16,000</div>
      </div>
      <div class="stat">
        <div class="stat-label">Bench spend</div>
        <div class="stat-value">~$120</div>
      </div>
      <div class="stat">
        <div class="stat-label">Final state</div>
        <div class="stat-value good">Live ✓</div>
      </div>
    </div>

    <h2 id="tldr">TL;DR — Five conclusions</h2>

    <div class="callout success">
      <b>1. The gateway is healthy.</b> Across 9 separate load tests (1→200 burst, 8×10-min sustained 20-concurrent), we
      could not reproduce a single 429 originating from gateway-side bottlenecks. Historical PostHog data showed ~20 xAI
      429s over 7 days from ~84k events (0.024% rate) — almost entirely transient xAI infrastructure capacity, not our
      quota.
    </div>

    <div class="callout success">
      <b>2. xAI is the upstream bottleneck.</b> Same gateway, same hour: <code>openai/gpt-5.4-nano</code> ran at
      <b>5.4 rps with P99=8.5s</b>; <code>xai/grok-4.3</code> swung between
      <b>1.0 and 3.6 rps with P99=26–35s</b> depending on xAI's load. Stage A's fallbacks rescued
      <b>412 requests in one 10-min window</b> when xAI was in distress.
    </div>

    <div class="callout warn">
      <b>3. <code>enable_pre_call_checks: true</code> was a 35% throughput tax.</b> Our first Stage A config dropped
      throughput to 1.9 rps — 35% below pre-A run 1 (2.9 rps) and 47% below slim Stage A (3.6 rps) — and more than
      doubled P99 (74.6s vs 26–34s) because that single flag validates context-window fit per-request (token counting
      + model-metadata lookup on the hot path). Removing it restored full performance.
    </div>

    <div class="callout success">
      <b>4. Slim Stage A is unambiguously valuable.</b> The retry/cooldown/fallback config adds zero measurable overhead
      AND demonstrably rescues hundreds of user-visible failures during xAI distress. Validated in production: a live
      smoke call immediately after deploy fired the <code>xai/grok-4.3 → anthropic/claude-sonnet-4-6</code> fallback and
      returned 200 OK.
    </div>

    <div class="callout">
      <b>5. Stage B (gunicorn 4 workers + DB pool sizing) is forward-looking headroom.</b> Worker scaling isn't the
      current bottleneck — xAI is. But gunicorn workers + DB keepalives are kept on live as future-proofing for traffic
      growth.
    </div>

    <h2 id="loadtests">Load test results — 10-min sustained 20-concurrent runs</h2>

    <p>
      All runs: 20 worker threads cycling through 9 captured fixtures (1.5k–26k input tokens) for 600 seconds.
      Cache-busted via per-request nonce + ISO timestamp at the start of the first user message, so no prefix-based
      provider cache could hit.
    </p>

    <h3>Throughput (requests per second)</h3>
    <div class="bar-chart">
      <div class="bar-row">
        <span>post-A original (full)</span>
        <div class="bar-track"><div class="bar-fill red" style="width: 31.7%"></div></div>
        <span class="bar-value">1.9 rps</span>
      </div>
      <div class="bar-row">
        <span>pre-A run 1 (no config)</span>
        <div class="bar-track"><div class="bar-fill green" style="width: 48.3%"></div></div>
        <span class="bar-value">2.9 rps</span>
      </div>
      <div class="bar-row">
        <span>pre-A run 2 (no config)</span>
        <div class="bar-track"><div class="bar-fill green" style="width: 58.3%"></div></div>
        <span class="bar-value">3.5 rps</span>
      </div>
      <div class="bar-row">
        <span><b>post-A slim</b> (retries+fallbacks)</span>
        <div class="bar-track"><div class="bar-fill green" style="width: 60%"></div></div>
        <span class="bar-value"><b>3.6 rps</b></span>
      </div>
      <div class="bar-row">
        <span>post-B run 1 (slim + gunicorn)</span>
        <div class="bar-track"><div class="bar-fill green" style="width: 48.3%"></div></div>
        <span class="bar-value">2.9 rps</span>
      </div>
      <div class="bar-row">
        <span>post-B run 2 (xAI distress)</span>
        <div class="bar-track"><div class="bar-fill yellow" style="width: 33.3%"></div></div>
        <span class="bar-value">2.0 rps</span>
      </div>
      <div class="bar-row">
        <span>post-B run 3 (heavy xAI distress)</span>
        <div class="bar-track"><div class="bar-fill red" style="width: 16.7%"></div></div>
        <span class="bar-value">1.0 rps</span>
      </div>
      <div class="bar-row">
        <span><b>nano on B</b> (openai/gpt-5.4-nano)</span>
        <div class="bar-track"><div class="bar-fill green" style="width: 90%"></div></div>
        <span class="bar-value"><b>5.4 rps</b></span>
      </div>
    </div>

    <h3>P99 latency (lower is better)</h3>
    <div class="bar-chart">
      <div class="bar-row">
        <span>post-A original</span>
        <div class="bar-track"><div class="bar-fill red" style="width: 100%"></div></div>
        <span class="bar-value">74.6s</span>
      </div>
      <div class="bar-row">
        <span>pre-A run 1</span>
        <div class="bar-track"><div class="bar-fill yellow" style="width: 45.6%"></div></div>
        <span class="bar-value">34.0s</span>
      </div>
      <div class="bar-row">
        <span>pre-A run 2</span>
        <div class="bar-track"><div class="bar-fill green" style="width: 35.9%"></div></div>
        <span class="bar-value">26.8s</span>
      </div>
      <div class="bar-row">
        <span><b>post-A slim</b></span>
        <div class="bar-track"><div class="bar-fill green" style="width: 35%"></div></div>
        <span class="bar-value"><b>26.1s</b></span>
      </div>
      <div class="bar-row">
        <span>post-B run 1</span>
        <div class="bar-track"><div class="bar-fill yellow" style="width: 46.8%"></div></div>
        <span class="bar-value">34.9s</span>
      </div>
      <div class="bar-row">
        <span>post-B run 2</span>
        <div class="bar-track"><div class="bar-fill yellow" style="width: 42.5%"></div></div>
        <span class="bar-value">31.7s</span>
      </div>
      <div class="bar-row">
        <span>post-B run 3</span>
        <div class="bar-track"><div class="bar-fill yellow" style="width: 43.4%"></div></div>
        <span class="bar-value">32.4s</span>
      </div>
      <div class="bar-row">
        <span><b>nano on B</b></span>
        <div class="bar-track"><div class="bar-fill green" style="width: 11.4%"></div></div>
        <span class="bar-value"><b>8.5s</b></span>
      </div>
    </div>

    <h3>Full result matrix</h3>
    <table>
      <thead>
        <tr>
          <th>Phase</th>
          <th>Model</th>
          <th>Stage A</th>
          <th>Workers/m</th>
          <th class="num">Reqs</th>
          <th class="num">rps</th>
          <th class="num">P50</th>
          <th class="num">P95</th>
          <th class="num">P99</th>
          <th class="num">Max</th>
          <th class="num">429s</th>
          <th class="num">Fallbacks fired</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>post-A original</td>
          <td>grok-4.3</td>
          <td><span class="tag tag-red">full + pre_call_checks</span></td>
          <td>1</td>
          <td class="num">1,169</td>
          <td class="num bad">1.9</td>
          <td class="num">6.1s</td>
          <td class="num">32.5s</td>
          <td class="num bad">74.6s</td>
          <td class="num">133.5s</td>
          <td class="num">0</td>
          <td class="num">0</td>
        </tr>
        <tr>
          <td>pre-A run 1</td>
          <td>grok-4.3</td>
          <td><span class="tag">none</span></td>
          <td>1</td>
          <td class="num">1,710</td>
          <td class="num">2.9</td>
          <td class="num">4.5s</td>
          <td class="num">21.6s</td>
          <td class="num">34.0s</td>
          <td class="num">51.8s</td>
          <td class="num">0</td>
          <td class="num">0</td>
        </tr>
        <tr>
          <td>pre-A run 2</td>
          <td>grok-4.3</td>
          <td><span class="tag">none</span></td>
          <td>1</td>
          <td class="num">2,130</td>
          <td class="num">3.5</td>
          <td class="num">3.9s</td>
          <td class="num">16.7s</td>
          <td class="num">26.8s</td>
          <td class="num">52.4s</td>
          <td class="num">0</td>
          <td class="num">0</td>
        </tr>
        <tr>
          <td><b>post-A slim</b></td>
          <td>grok-4.3</td>
          <td><span class="tag tag-green">slim</span></td>
          <td>1</td>
          <td class="num good">2,144</td>
          <td class="num good">3.6</td>
          <td class="num good">3.7s</td>
          <td class="num good">17.1s</td>
          <td class="num good">26.1s</td>
          <td class="num">72.5s</td>
          <td class="num">0</td>
          <td class="num">0</td>
        </tr>
        <tr>
          <td>post-B run 1</td>
          <td>grok-4.3</td>
          <td><span class="tag tag-green">slim</span></td>
          <td>4</td>
          <td class="num">1,750</td>
          <td class="num">2.9</td>
          <td class="num">4.7s</td>
          <td class="num">20.1s</td>
          <td class="num">34.9s</td>
          <td class="num">52.2s</td>
          <td class="num">0</td>
          <td class="num">0</td>
        </tr>
        <tr>
          <td>post-B run 2</td>
          <td>grok-4.3</td>
          <td><span class="tag tag-green">slim</span></td>
          <td>4</td>
          <td class="num">1,228</td>
          <td class="num warn">2.0</td>
          <td class="num">7.7s</td>
          <td class="num">26.2s</td>
          <td class="num">31.7s</td>
          <td class="num">50.7s</td>
          <td class="num">0</td>
          <td class="num good">135</td>
        </tr>
        <tr>
          <td>post-B run 3</td>
          <td>grok-4.3</td>
          <td><span class="tag tag-green">slim</span></td>
          <td>4</td>
          <td class="num">572</td>
          <td class="num bad">1.0</td>
          <td class="num bad">20.0s</td>
          <td class="num">30.8s</td>
          <td class="num">32.4s</td>
          <td class="num bad">148.1s</td>
          <td class="num">0</td>
          <td class="num good">412</td>
        </tr>
        <tr>
          <td><b>nano on B</b></td>
          <td>gpt-5.4-nano</td>
          <td><span class="tag tag-green">slim</span></td>
          <td>4</td>
          <td class="num good">3,213</td>
          <td class="num good">5.4</td>
          <td class="num good">3.3s</td>
          <td class="num good">7.2s</td>
          <td class="num good">8.5s</td>
          <td class="num good">27.4s</td>
          <td class="num">0</td>
          <td class="num">16</td>
        </tr>
      </tbody>
    </table>

    <p style="color: var(--fg-dim); font-size: 12.5px">
      Read the bottom 4 rows: same gateway, same Stage A+B config, same 4 gunicorn workers — only the upstream model
      changes. xAI throughput varied 1.0→2.9 rps depending on xAI capacity; OpenAI nano held 5.4 rps steady. Stage A's
      fallback chains rescued 135 (run 2) and 412 (run 3) requests during xAI distress — 100% caller-visible success
      rate.
    </p>

    <h2 id="config">Final deployed configuration (live + dev)</h2>

    <h3><code>litellm_config.yaml</code> — shared dev + live</h3>
    <pre>
 litellm_settings:
  drop_params: true
  set_verbose: false
  success_callback: ["posthog"]       # PostHog on 2xx
  failure_callback: ["posthog"]       # PostHog on errors
  callbacks: ["prometheus"]           # Prometheus on every event (Fly scrapes /metrics/)
  require_auth_for_metrics_endpoint: false
  cache: true
  cache_params: { type: redis, ttl: 300 }
  stream_timeout: 60
  default_fallbacks: ["anthropic/claude-haiku-4-5"]   # Stage A — defends #26015

 general_settings:
  master_key: os.environ/LITELLM_MASTER_KEY
  database_url: os.environ/DATABASE_URL
  store_model_in_db: true
  use_redis_transaction_buffer: true
  database_connection_pool_limit: 10  # Stage B — explicit
  request_timeout: 90
  disable_prisma_schema_update: true

 router_settings:
  redis_host: os.environ/REDIS_HOST
  redis_port: os.environ/REDIS_PORT
  redis_password: os.environ/REDIS_PASSWORD
  routing_strategy: simple-shuffle
  num_retries: 3                      # Stage A
  allowed_fails: 3                    # Stage A
  cooldown_time: 30                   # Stage A
  fallbacks:                          # Stage A — explicit chains for worst tail-latency models
    - {"openai/gpt-5.5": ["openai/gpt-5.4", "anthropic/claude-sonnet-4-6"]}
    - {"openai/gpt-5.4": ["openai/gpt-5.4-mini"]}
    - {"vertex_ai/gemini-3.1-pro-preview": ["vertex_ai/gemini-3.5-flash"]}
    - {"xai/grok-4.3": ["anthropic/claude-sonnet-4-6"]}     # <- this one fired on the live smoke
    - {"anthropic/claude-opus-4-7": ["anthropic/claude-sonnet-4-6"]}
    - {"anthropic/claude-opus-4-6": ["anthropic/claude-sonnet-4-6"]}</pre
    >

    <h3><code>fly.dev.toml</code> / <code>fly.live.toml</code> — matched</h3>
    <pre>
 [processes]
 app = "--config /app/litellm_config.yaml --host 0.0.0.0 --port 4000 \
       --run_gunicorn --num_workers 4 --max_requests_before_restart 10000"

 [http_service.concurrency]
 type = 'requests'
 soft_limit = 100
 hard_limit = 150

 [metrics]                              # Stage 0b — Fly Prometheus scrape
 port = 4000
 path = '/metrics/'

 [[vm]]
 size = 'shared-cpu-4x'                 # 4 vCPUs → 4 gunicorn workers
 memory = '4096mb'</pre
    >

    <h3><code>DATABASE_URL</code> secret (both envs)</h3>
    <pre>
 postgres://...@on-auto-ai-gateway-db.flycast:5432/litellm_{dev,live}
  ?sslmode=disable
  &amp;max_idle_connection_lifetime=60         # Stage B — defends Fly PG idle drops (#22289, #26619)
  &amp;socket_timeout=10
  &amp;keepalives=1
  &amp;keepalives_idle=60
  &amp;keepalives_interval=10
  &amp;keepalives_count=5</pre
    >

    <h2 id="fallbacks">How fallbacks work — 4-place priority hierarchy</h2>

    <table>
      <thead>
        <tr>
          <th>Priority</th>
          <th>Source</th>
          <th>Use case</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>1 (highest)</td>
          <td>Per-request <code>fallbacks: [...]</code> in API body</td>
          <td>Per-user / per-feature routing (paid tier → opus; free tier → haiku)</td>
        </tr>
        <tr>
          <td>2</td>
          <td><code>router_settings.fallbacks</code> in YAML</td>
          <td>Source of truth — version controlled via git, deployed via CI</td>
        </tr>
        <tr>
          <td>3</td>
          <td><code>litellm_settings.default_fallbacks</code> in YAML</td>
          <td>Catch-all when no per-model chain matches</td>
        </tr>
        <tr>
          <td>4 (lowest, runtime override)</td>
          <td>LiteLLM admin DB via <code>POST /model/update</code></td>
          <td>Emergency ops override (redirect grok→haiku temporarily, no deploy)</td>
        </tr>
      </tbody>
    </table>

    <p>
      Fallbacks fire once <code>num_retries</code> (3) are exhausted for the requested model. Independently, a
      deployment that exceeds <code>allowed_fails</code> (3) failures within a minute is cooled down — skipped by the
      router — for <code>cooldown_time</code> (30s). Once fallbacks fire, the router tries the fallback targets in
      order; first success wins.
    </p>

    <h2 id="prod-evidence">Production validation — fallback fired on the live smoke call</h2>

    <div class="callout success">
      Immediately after the live promote completed (v84 deployed), I called <code>/v1/chat/completions</code> with
      <code>model=xai/grok-4.3</code> as a smoke test. The response came back from
      <code>anthropic/claude-sonnet-4-6</code> — Stage A's fallback chain caught a real xAI failure on production
      traffic AND returned 200 OK to the caller.
    </div>

    <p>PostHog captured both halves of the chain:</p>

    <table>
      <thead>
        <tr>
          <th>Event #</th>
          <th>Time</th>
          <th>Model</th>
          <th>Status</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>1</td>
          <td>06:57:38Z</td>
          <td><code>xai/grok-4.3</code></td>
          <td class="bad"><code>$ai_is_error=true</code> — xAI failed</td>
        </tr>
        <tr>
          <td>2</td>
          <td>06:57:49Z (+11s)</td>
          <td><code>anthropic/claude-sonnet-4-6</code></td>
          <td class="good">success — fallback rescued</td>
        </tr>
      </tbody>
    </table>

    <p>
      Same prompt marker <code>live-verify-1779605852</code> visible in both events.
      <code>failure_callback: ["posthog"]</code> captured the xAI error;
      <code>success_callback: ["posthog"]</code> captured the fallback success. Both events visible in PostHog within
      ~30s.
    </p>

    <h2 id="observability">Observability stack — dual backend</h2>

    <table>
      <thead>
        <tr>
          <th>&nbsp;</th>
          <th>Fly Prometheus + Grafana</th>
          <th>PostHog</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Pattern</td>
          <td>Pull (15s scrape)</td>
          <td>Push (per-event)</td>
        </tr>
        <tr>
          <td>Best for</td>
          <td>"Is the gateway OK NOW?"</td>
          <td>"Why did user X see slow at 3am?"</td>
        </tr>
        <tr>
          <td>Granularity</td>
          <td>aggregated counters + histograms</td>
          <td>per-request properties</td>
        </tr>
        <tr>
          <td>Retention</td>
          <td>30 days (Fly default)</td>
          <td>1 year</td>
        </tr>
        <tr>
          <td>Sample queries</td>
          <td><code>litellm_deployment_successful_fallbacks</code> rate</td>
          <td>"events where <code>$ai_is_error=true</code> in last 24h"</td>
        </tr>
        <tr>
          <td>Access</td>
          <td>
            <a href="https://fly-metrics.net/d/fly-app/fly-app?var-app=on-auto-ai-gateway-live"
              >fly-metrics.net dashboard</a
            >
          </td>
          <td><a href="https://us.posthog.com/project/224778">us.posthog.com</a></td>
        </tr>
      </tbody>
    </table>

    <details>
      <summary>Key Prometheus metrics now exposed (28 families on live, 74 on dev)</summary>
      <ul>
        <li><code>litellm_proxy_total_requests_metric_total</code> — counter, all incoming requests</li>
        <li><code>litellm_proxy_failed_requests_metric_total</code> — counter, errors</li>
        <li><code>litellm_request_total_latency_metric</code> — histogram</li>
        <li><code>litellm_llm_api_time_to_first_token_metric</code> — TTFT histogram</li>
        <li><code>litellm_deployment_cooled_down</code> — gauge per deployment</li>
        <li><code>litellm_deployment_successful_fallbacks</code> — counter (THIS proved Stage A is working)</li>
        <li><code>litellm_deployment_failed_fallbacks</code> — counter</li>
        <li>
          <code>litellm_spend_metric_total</code>, <code>litellm_input_tokens_metric_total</code>,
          <code>litellm_output_tokens_metric_total</code>
        </li>
      </ul>
    </details>

    <h2 id="prs">Deploy chain</h2>

    <table>
      <thead>
        <tr>
          <th>PR</th>
          <th>Change</th>
          <th>Result</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td><a href="https://github.com/BeOnAuto/on.auto/pull/65">#65</a></td>
          <td>Stage 0a: dev capacity parity (max=4, soft=100, hard=150)</td>
          <td>✓ merged</td>
        </tr>
        <tr>
          <td><a href="https://github.com/BeOnAuto/on.auto/pull/66">#66</a></td>
          <td>Stage 0b: enable Prometheus on dev</td>
          <td>✓ merged (later patched by #67)</td>
        </tr>
        <tr>
          <td><a href="https://github.com/BeOnAuto/on.auto/pull/67">#67</a></td>
          <td>Fix: unauth <code>/metrics/</code> + trailing slash so Fly scraper works</td>
          <td>✓ merged</td>
        </tr>
        <tr>
          <td><a href="https://github.com/BeOnAuto/on.auto/pull/68">#68</a></td>
          <td>Stage A full (with <code>enable_pre_call_checks: true</code>)</td>
          <td>✗ <b>reverted</b> after bench showed 35% throughput drop + 2× P99</td>
        </tr>
        <tr>
          <td>direct push</td>
          <td>Revert PR #68 (rollback)</td>
          <td>✓ deployed</td>
        </tr>
        <tr>
          <td><a href="https://github.com/BeOnAuto/on.auto/pull/69">#69</a></td>
          <td>Stage A slim — kept retries/cooldown/fallbacks, dropped <code>pre_call_checks</code></td>
          <td>✓ merged</td>
        </tr>
        <tr>
          <td><a href="https://github.com/BeOnAuto/on.auto/pull/70">#70</a></td>
          <td>Stage B 1+2: gunicorn 4 workers + <code>database_connection_pool_limit: 10</code></td>
          <td>✓ merged</td>
        </tr>
        <tr>
          <td><a href="https://github.com/BeOnAuto/on.auto/pull/71">#71</a></td>
          <td>Promote to live + INVESTIGATION.md + scripts moved</td>
          <td>✓ merged + promoted to live</td>
        </tr>
      </tbody>
    </table>

    <h2 id="todo">Open follow-ups (not addressed)</h2>

    <table>
      <thead>
        <tr>
          <th>Item</th>
          <th>Risk</th>
          <th>Action</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td><code>xai/grok-code-fast-1</code> deprecated 2026-05-15, retires 2026-08-15</td>
          <td>Coding routes break in ~3 months</td>
          <td>Migrate to <code>xai/grok-4.3</code></td>
        </tr>
        <tr>
          <td><code>vertex_ai/gemini-3.1-flash-lite-preview</code> retires 2026-05-25</td>
          <td>404s starting 2026-05-25</td>
          <td>Use GA <code>vertex_ai/gemini-3.1-flash-lite</code></td>
        </tr>
        <tr>
          <td>Vertex auth mismatch dev vs live</td>
          <td>Different auth paths</td>
          <td>Standardize on <code>VERTEX_CREDENTIALS_JSON</code></td>
        </tr>
        <tr>
          <td>6 shared upstream API keys between dev and live</td>
          <td>Dev load affects live quota</td>
          <td>Split per-env when capacity becomes a concern</td>
        </tr>
        <tr>
          <td>Per-model <code>rpm</code>/<code>tpm</code> never set</td>
          <td><code>simple-shuffle</code> has no headroom data</td>
          <td>Set from each provider's published quota</td>
        </tr>
      </tbody>
    </table>

    <h2 id="repo">Where the artifacts live</h2>

    <ul>
      <li>
        <b><code>applications/ai-gateway/INVESTIGATION.md</code></b> — full technical doc with the operations playbook
      </li>
      <li>
        <b><code>applications/ai-gateway/scripts/</code></b> — load test tooling (<code>litellm-gateway-bench.ts</code>,
        <code>litellm-loadtest.ts</code>, <code>litellm-loadtest-mixed.ts</code>) + their tests
      </li>
      <li>
        <b><code>applications/ai-gateway/litellm_config.yaml</code></b> — final deployed gateway config
      </li>
      <li>
        <b><code>applications/ai-gateway/fly.dev.toml</code></b> and <b><code>fly.live.toml</code></b> — matched Fly
        infra
      </li>
      <li>
        Raw load test JSON artifacts at <code>/tmp/litellm-bench-*.json</code> and
        <code>/tmp/litellm-loadtest-mixed-*.json</code> (local-only)
      </li>
    </ul>

    <hr class="dim" />
    <p style="color: var(--fg-dim); font-size: 12px">
      Generated 2026-05-24 from on.auto investigation. Live gateway:
      <a href="https://on-auto-ai-gateway-live.fly.dev/health/readiness"
        >on-auto-ai-gateway-live.fly.dev/health/readiness</a
      >
      · Dev:
      <a href="https://on-auto-ai-gateway-dev.fly.dev/health/readiness"
        >on-auto-ai-gateway-dev.fly.dev/health/readiness</a
      >
    </p>
  </body>
 </html>
No results found