lizthegrey · March 4, 2026 20:23
diff --git a/index.html b/index.html
 <!DOCTYPE html>
 <html lang="en">
 <head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>LLM Performance vs Cost - Interactive Pareto Frontier (March 2026)</title>
    <script src="https://d3js.org/d3.v7.min.js"></script>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            margin: 0;
            padding: 20px;
            background: #1a1a1a;
        }

        .container {
            max-width: 1400px;
            margin: 0 auto;
            background: #2a2a2a;
            padding: 30px;
            border-radius: 8px;
            box-shadow: 0 2px 8px rgba(0,0,0,0.3);
        }

        h1 {
            margin-top: 0;
            color: #e0e0e0;
        }

        .controls {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
            gap: 20px;
            margin: 20px 0;
            padding: 20px;
            background: #333;
            border-radius: 4px;
        }

        .control-group {
            display: flex;
            flex-direction: column;
        }

        label {
            font-weight: 600;
            margin-bottom: 8px;
            color: #ccc;
            font-size: 14px;
        }

        input[type="range"] {
            width: 100%;
        }

        .value-display {
            color: #0066cc;
            font-weight: 600;
            margin-top: 4px;
            font-size: 13px;
        }

        .presets {
            display: flex;
            gap: 10px;
            flex-wrap: wrap;
        }

        button {
            padding: 8px 16px;
            border: 1px solid #555;
            background: #3a3a3a;
            color: #e0e0e0;
            border-radius: 4px;
            cursor: pointer;
            font-size: 13px;
            transition: all 0.2s;
        }

        button:hover {
            background: #4a4a4a;
        }

        button.active {
            background: #0066cc;
            color: white;
            border-color: #0066cc;
        }

        .chart-container {
            margin-top: 30px;
        }

        .tooltip {
            position: absolute;
            padding: 10px;
            background: rgba(0, 0, 0, 0.8);
            color: white;
            border-radius: 4px;
            pointer-events: none;
            font-size: 12px;
            opacity: 0;
            transition: opacity 0.2s;
        }

        .legend {
            display: flex;
            gap: 20px;
            margin-top: 20px;
            flex-wrap: wrap;
        }

        .legend-item {
            display: flex;
            align-items: center;
            gap: 8px;
            font-size: 13px;
            color: #ccc;
        }

        .legend-circle {
            width: 12px;
            height: 12px;
            border-radius: 50%;
        }

        .info {
            margin-top: 20px;
            padding: 15px;
            background: #333;
            border-left: 4px solid #0066cc;
            border-radius: 4px;
            font-size: 14px;
            color: #ccc;
        }
    </style>
 </head>
 <body>
    <div class="container">
        <h1>LLM Performance vs Cost: Interactive Pareto Frontier</h1>

        <div class="info">
            <strong>Why this matters:</strong> Vendor benchmark charts assume fixed token ratios with no caching.
            Real-world usage varies dramatically&mdash;coding agents often run 50:1+ output:input ratios with 80-90% cache hits,
            completely changing the cost picture and Pareto frontier.<br><br>
            <strong>Data:</strong> Arena Elo scores from <a href="https://arena.ai/leaderboard" style="color: #6ba3e0;">arena.ai</a> (March 2026).
            Prices are interactive/real-time; batch discounts excluded (too slow for agentic use).
        </div>

        <div class="controls">
            <div class="control-group">
                <label>Benchmark</label>
                <div style="display: flex; gap: 10px;">
                    <label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
                        <input type="radio" name="benchmark" value="general" checked style="width: auto; margin: 0;">
                        General Chat (Arena)
                    </label>
                    <label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
                        <input type="radio" name="benchmark" value="coding" style="width: auto; margin: 0;">
                        Coding (Arena)
                    </label>
                    <label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
                        <input type="radio" name="benchmark" value="swe" style="width: auto; margin: 0;">
                        SWE-Bench Verified
                    </label>
                </div>
            </div>

            <div class="control-group">
                <label for="inputOutputRatio">Input:Output Token Ratio</label>
                <input type="range" id="inputOutputRatio" min="0" max="100" value="5" step="1">
                <div class="value-display" id="ratioDisplay">1:5</div>
            </div>

            <div class="control-group">
                <label for="cacheHitRate">Cache Hit Rate (%)</label>
                <input type="range" id="cacheHitRate" min="0" max="100" value="0" step="5">
                <div class="value-display" id="cacheDisplay">0%</div>
            </div>

            <div class="control-group">
                <label for="thinkingOverhead">Thinking Token Overhead (%)</label>
                <input type="range" id="thinkingOverhead" min="0" max="300" value="100" step="25">
                <div class="value-display" id="thinkingDisplay">100%</div>
            </div>
        </div>

        <div class="presets">
            <button onclick="applyPreset('casual')">Casual Chat (1:1, no cache)</button>
            <button onclick="applyPreset('coding')">Coding Agent (1:50, 85% cache)</button>
            <button onclick="applyPreset('rag')">RAG System (1:10, 70% cache)</button>
            <button onclick="applyPreset('docs')">Document Analysis (100:1, 95% cache)</button>
            <button onclick="applyPreset('google')">Vendor Default (1:5, no cache)</button>
        </div>

        <div class="chart-container">
            <svg id="chart"></svg>
        </div>

        <div class="legend" id="legend"></div>
    </div>

    <div class="tooltip" id="tooltip"></div>

    <script>
        // Model data: Arena Elo scores and API pricing (per million tokens)
        // Arena scores: https://arena.ai/leaderboard (General Chat + Coding, March 4 2026)
        // Pricing: provider docs (Anthropic, OpenAI, Google, xAI, DeepSeek, Alibaba/Qwen)
        //
        // Field reference (as consumed by the functions in this script):
        //   name         — display label; also the D3 data-join key, so it must be unique.
        //   provider     — key into `providerColors` for marker/legend color.
        //   generalElo / codingElo / sweScore
        //                — one of these is plotted on the Y axis depending on the selected
        //                  benchmark; a model whose selected score is null is not drawn.
        //   inputPrice / outputPrice
        //                — USD per million uncached input / output tokens.
        //   cachePrice   — USD per million cached input tokens; null means the cache-hit
        //                  slider is ignored for this model and all input is billed at inputPrice.
        //   thinkingMode — 'full'    = output tokens scaled by (1 + overhead/100),
        //                  'minimal' = output tokens scaled by (1 + overhead/400), i.e. 25% of the slider,
        //                  null      = no thinking overhead applied.
        // sweScore: SWE-Bench Verified % (https://www.swebench.com/, Feb 2026) — null where unavailable
        const models = [
            // Anthropic — https://docs.anthropic.com/en/docs/about-claude/models
            { name: 'Opus 4.6 (thinking)', provider: 'Anthropic', generalElo: 1501, codingElo: 1553, sweScore: 80.8, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, thinkingMode: 'full' },
            { name: 'Opus 4.6', provider: 'Anthropic', generalElo: 1505, codingElo: 1547, sweScore: 80.8, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, thinkingMode: null },
            { name: 'Sonnet 4.6', provider: 'Anthropic', generalElo: 1460, codingElo: 1528, sweScore: 79.6, inputPrice: 3.0, outputPrice: 15.0, cachePrice: 0.30, thinkingMode: null },
            { name: 'Haiku 4.5', provider: 'Anthropic', generalElo: 1406, codingElo: 1477, sweScore: 73.3, inputPrice: 1.0, outputPrice: 5.0, cachePrice: 0.10, thinkingMode: null },
            // Google — https://ai.google.dev/gemini-api/docs/pricing
            { name: 'Gemini 3.1 Pro', provider: 'Google', generalElo: 1500, codingElo: 1541, sweScore: null, inputPrice: 2.0, outputPrice: 12.0, cachePrice: 0.20, thinkingMode: 'full' },
            { name: 'Gemini 3 Pro', provider: 'Google', generalElo: 1486, codingElo: 1519, sweScore: 76.2, inputPrice: 2.0, outputPrice: 12.0, cachePrice: 0.20, thinkingMode: 'full' },
            { name: 'Gemini 3 Flash', provider: 'Google', generalElo: 1473, codingElo: 1507, sweScore: 76.2, inputPrice: 0.50, outputPrice: 3.0, cachePrice: 0.05, thinkingMode: 'full' },
            { name: 'Gemini 3 Flash (thinking-min)', provider: 'Google', generalElo: 1461, codingElo: 1499, sweScore: null, inputPrice: 0.50, outputPrice: 3.0, cachePrice: 0.05, thinkingMode: 'minimal' },
            { name: 'Gemini 3.1 Flash-Lite', provider: 'Google', generalElo: 1436, codingElo: 1457, sweScore: null, inputPrice: 0.25, outputPrice: 1.50, cachePrice: 0.025, thinkingMode: 'minimal' },
            { name: 'Gemini 2.5 Pro', provider: 'Google', generalElo: 1449, codingElo: 1467, sweScore: 53.6, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: 'full' },
            { name: 'Gemini 2.5 Flash', provider: 'Google', generalElo: 1410, codingElo: null, sweScore: null, inputPrice: 0.30, outputPrice: 2.50, cachePrice: 0.03, thinkingMode: 'full' },
            // xAI
            { name: 'Grok 4.1 (thinking)', provider: 'xAI', generalElo: 1473, codingElo: 1506, sweScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, thinkingMode: 'full' },
            { name: 'Grok 4.1', provider: 'xAI', generalElo: 1463, codingElo: 1494, sweScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, thinkingMode: null },
            { name: 'Grok 4.1 Fast', provider: 'xAI', generalElo: 1430, codingElo: 1457, sweScore: null, inputPrice: 0.20, outputPrice: 0.50, cachePrice: null, thinkingMode: 'full' },
            // OpenAI — https://openai.com/api/pricing/
            { name: 'GPT-5.2', provider: 'OpenAI', generalElo: 1438, codingElo: 1498, sweScore: 80.0, inputPrice: 1.75, outputPrice: 14.0, cachePrice: 0.175, thinkingMode: null },
            { name: 'GPT-5.1 (high)', provider: 'OpenAI', generalElo: 1456, codingElo: 1491, sweScore: 76.3, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: 'full' },
            { name: 'GPT-5', provider: 'OpenAI', generalElo: 1426, codingElo: 1463, sweScore: null, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: null },
            { name: 'o3', provider: 'OpenAI', generalElo: 1432, codingElo: 1457, sweScore: null, inputPrice: 2.0, outputPrice: 8.0, cachePrice: 1.0, thinkingMode: 'full' },
            { name: 'o4-mini', provider: 'OpenAI', generalElo: 1391, codingElo: 1430, sweScore: null, inputPrice: 1.10, outputPrice: 4.40, cachePrice: 0.275, thinkingMode: 'full' },
            { name: 'GPT-4.1', provider: 'OpenAI', generalElo: 1413, codingElo: 1455, sweScore: null, inputPrice: 2.0, outputPrice: 8.0, cachePrice: 0.50, thinkingMode: null },
            { name: 'GPT-4.1 Mini', provider: 'OpenAI', generalElo: 1382, codingElo: 1432, sweScore: null, inputPrice: 0.40, outputPrice: 1.60, cachePrice: 0.10, thinkingMode: null },
            // DeepSeek — https://api-docs.deepseek.com/quick_start/pricing
            { name: 'DeepSeek V3.2', provider: 'DeepSeek', generalElo: 1421, codingElo: 1467, sweScore: 73.0, inputPrice: 0.14, outputPrice: 0.28, cachePrice: null, thinkingMode: null },
            { name: 'DeepSeek V3.2 (thinking)', provider: 'DeepSeek', generalElo: 1420, codingElo: 1470, sweScore: 73.0, inputPrice: 0.14, outputPrice: 0.28, cachePrice: null, thinkingMode: 'full' },
            { name: 'DeepSeek R1', provider: 'DeepSeek', generalElo: 1419, codingElo: 1463, sweScore: null, inputPrice: 0.55, outputPrice: 2.19, cachePrice: null, thinkingMode: 'full' },
            // Qwen/Alibaba — https://help.aliyun.com/zh/model-studio/getting-started/models
            { name: 'Qwen3 Max', provider: 'Qwen', generalElo: 1434, codingElo: 1482, sweScore: null, inputPrice: 1.20, outputPrice: 6.0, cachePrice: null, thinkingMode: null },
            { name: 'Qwen3-235B', provider: 'Qwen', generalElo: 1422, codingElo: 1471, sweScore: null, inputPrice: 0.20, outputPrice: 1.0, cachePrice: null, thinkingMode: null },
        ];

        // Provider colors: fill color for each provider's chart markers and legend swatch.
        // Keys must match the `provider` strings used in `models`; a provider that is
        // missing here looks up as `undefined` and therefore gets no explicit color.
        const providerColors = {
            'Google': '#4285f4',
            'Anthropic': '#d4a373',
            'xAI': '#1da1f2',
            'OpenAI': '#10a37f',
            'DeepSeek': '#7c3aed',
            'Qwen': '#ef4444'
        };

        // Chart dimensions: the outer <svg> is 1200x600; `width`/`height` are the inner
        // plotting area left after subtracting the margins.
        const margin = { top: 20, right: 120, bottom: 60, left: 60 };
        const width = 1200 - margin.left - margin.right;
        const height = 600 - margin.top - margin.bottom;

        // Create SVG: size the existing #chart element and append a <g> shifted by the
        // top/left margins. `svg` refers to that inner group, so everything appended to
        // it below is positioned in plot-area coordinates.
        const svg = d3.select('#chart')
            .attr('width', width + margin.left + margin.right)
            .attr('height', height + margin.top + margin.bottom)
            .append('g')
            .attr('transform', `translate(${margin.left},${margin.top})`);

        // Scales - log scale INVERTED (expensive on left, cheap on right like Google's chart)
        // Only the pixel ranges are fixed here; the domains are set on every updateChart() call.
        const xScale = d3.scaleLog().range([width, 0]);
        const yScale = d3.scaleLinear().range([height, 0]);

        // Axes: empty container groups; updateChart() renders the actual axis into them.
        // The X axis group is moved to the bottom edge of the plot area.
        const xAxis = svg.append('g')
            .attr('transform', `translate(0,${height})`);

        const yAxis = svg.append('g');

        // Axis labels
        // X label: static text centered under the plot.
        svg.append('text')
            .attr('x', width / 2)
            .attr('y', height + 50)
            .attr('text-anchor', 'middle')
            .style('font-size', '14px')
            .style('font-weight', '600')
            .style('fill', '#ccc')
            .text('$ Price per million tokens (log scale)');

        // Y label: rotated 90 degrees; kept in a variable because updateChart() rewrites
        // its text whenever the selected benchmark changes.
        const yAxisLabel = svg.append('text')
            .attr('transform', 'rotate(-90)')
            .attr('x', -height / 2)
            .attr('y', -45)
            .attr('text-anchor', 'middle')
            .style('font-size', '14px')
            .style('font-weight', '600')
            .style('fill', '#ccc')
            .text('Arena Elo Score (General Chat)');

        // Pareto frontier line: a single dashed, unfilled path whose `d` attribute is
        // regenerated by updateChart().
        const paretoLine = svg.append('path')
            .attr('fill', 'none')
            .attr('stroke', '#ff6b6b')
            .attr('stroke-width', 2)
            .attr('stroke-dasharray', '5,5');

        // Tooltip: the #tooltip <div> that lives outside the SVG and is shown on marker hover.
        const tooltip = d3.select('#tooltip');

        /**
         * Price a workload for one model, in dollars.
         *
         * @param {Object} model - Entry with `inputPrice`, `outputPrice`, `cachePrice`
         *   (all USD per million tokens; `cachePrice` may be null) and `thinkingMode`.
         * @param {number} inputTokens - Input tokens in the workload.
         * @param {number} outputTokens - Visible output tokens in the workload.
         * @param {number} cacheHitRate - Percentage (0-100) of input tokens served from cache.
         * @param {number} thinkingOverhead - Extra output tokens spent on thinking, as a
         *   percentage of `outputTokens`.
         * @returns {number} Total cost in dollars.
         */
        function calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead) {
            // Listed prices are quoted per one million tokens.
            const TOKENS_PER_PRICE_UNIT = 1000000;

            // A model without a cache price bills every input token at the regular rate.
            const hasCachePricing = model.cachePrice !== null;
            const cachedTokens = hasCachePricing ? inputTokens * (cacheHitRate / 100) : 0;
            const regularInputTokens = inputTokens - cachedTokens;

            // Thinking tokens are billed as output: 'full' models take the whole slider
            // value, 'minimal' models a quarter of it, anything else takes none.
            let effectiveOutputTokens;
            switch (model.thinkingMode) {
                case 'full':
                    effectiveOutputTokens = outputTokens * (1 + thinkingOverhead / 100);
                    break;
                case 'minimal':
                    effectiveOutputTokens = outputTokens * (1 + thinkingOverhead / 400);
                    break;
                default:
                    effectiveOutputTokens = outputTokens;
            }

            const uncachedCost =
                (regularInputTokens * model.inputPrice + effectiveOutputTokens * model.outputPrice) / TOKENS_PER_PRICE_UNIT;

            if (cachedTokens > 0) {
                return uncachedCost + (cachedTokens * model.cachePrice) / TOKENS_PER_PRICE_UNIT;
            }
            return uncachedCost;
        }

        /**
         * Select the Pareto-optimal points: those for which no other point is both
         * at most as expensive and strictly better scoring.
         *
         * Points are visited from cheapest to most expensive (right to left on the
         * inverted X axis) and a point is kept only if it beats the best score seen
         * so far. When several points share the same cost, the highest-scoring one is
         * visited first so that its equal-cost, lower-scoring siblings are correctly
         * treated as dominated instead of depending on input order.
         *
         * @param {Array<{cost: number, score: number}>} points - Candidate points; not mutated.
         * @returns {Array} Frontier points ordered by ascending cost (and ascending score).
         */
        function calculateParetoFrontier(points) {
            // Copy before sorting so the caller's array keeps its order.
            // Primary key: cost ascending. Tie-break: score descending.
            const sorted = [...points].sort((a, b) => (a.cost - b.cost) || (b.score - a.score));
            const frontier = [];
            let maxScore = -Infinity;

            for (const point of sorted) {
                if (point.score > maxScore) {
                    frontier.push(point);
                    maxScore = point.score;
                }
            }

            return frontier;
        }

        /**
         * Recompute every model's blended cost from the current control values and
         * redraw the scatter plot, axes, labels and Pareto frontier line.
         *
         * Reads the three range sliders and the checked benchmark radio from the DOM,
         * and mutates the shared D3 objects declared above (xScale, yScale, xAxis,
         * yAxis, yAxisLabel, paretoLine, svg). Takes no arguments and returns nothing.
         */
        function updateChart() {
            // Get current settings
            // inputOutputRatio is the N of a "1:N" input:output ratio.
            const inputOutputRatio = parseInt(document.getElementById('inputOutputRatio').value);
            const cacheHitRate = parseInt(document.getElementById('cacheHitRate').value);
            const thinkingOverhead = parseInt(document.getElementById('thinkingOverhead').value);
            const benchmark = document.querySelector('input[name="benchmark"]:checked').value;

            // Calculate costs for each model (per 1M total tokens)
            // The 1M tokens are split 1 part input to N parts output. Thinking overhead
            // is added on top inside calculateCost, so billed output can exceed this share.
            const inputRatio = 1 / (1 + inputOutputRatio);
            const outputRatio = inputOutputRatio / (1 + inputOutputRatio);

            const inputTokens = 1000000 * inputRatio;
            const outputTokens = 1000000 * outputRatio;

            // Filter models based on benchmark and add cost
            // Any benchmark value other than 'general' or 'coding' falls through to sweScore.
            const scoreField = benchmark === 'general' ? 'generalElo' : benchmark === 'coding' ? 'codingElo' : 'sweScore';
            const allDataPoints = models.map(model => ({
                ...model,
                cost: calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead),
                score: model[scoreField]
            }));

            // Filter out models without scores for selected benchmark
            const dataPoints = allDataPoints.filter(d => d.score !== null);

            // Update Y-axis label based on benchmark
            const yLabels = { general: 'Arena Elo Score (General Chat)', coding: 'Arena Elo Score (Coding)', swe: 'SWE-Bench Verified Score (%)' };
            yAxisLabel.text(yLabels[benchmark]);

            // Update scales
            const minCost = d3.min(dataPoints, d => d.cost);
            const maxCost = d3.max(dataPoints, d => d.cost);
            const minScore = d3.min(dataPoints, d => d.score);
            const maxScore = d3.max(dataPoints, d => d.score);

            // Log scale needs positive values, add padding
            // 20% padding on both ends; the lower bound is floored at $0.01.
            xScale.domain([Math.max(minCost * 0.8, 0.01), maxCost * 1.2]);
            // Snap the Y domain outward to round numbers: multiples of 10 for the
            // percentage-based SWE-Bench score, multiples of 50 for Elo.
            if (benchmark === 'swe') {
                yScale.domain([Math.floor(minScore / 10) * 10, Math.ceil(maxScore / 10) * 10]);
            } else {
                yScale.domain([Math.floor(minScore / 50) * 50, Math.ceil(maxScore / 50) * 50]);
            }

            // Update axes
            // Tick labels use fewer decimals as the dollar amount grows. The trailing
            // .call()s re-apply the dark-theme colors to the regenerated axis elements.
            xAxis.transition().duration(500)
                .call(d3.axisBottom(xScale)
                    .ticks(10)
                    .tickFormat(d => {
                        if (d >= 1) return `$${d.toFixed(0)}`;
                        if (d >= 0.1) return `$${d.toFixed(1)}`;
                        return `$${d.toFixed(2)}`;
                    }))
                .call(g => g.selectAll('.tick text').style('fill', '#ccc'))
                .call(g => g.selectAll('.tick line').style('stroke', '#555'))
                .call(g => g.select('.domain').style('stroke', '#555'));

            yAxis.transition().duration(500)
                .call(d3.axisLeft(yScale).ticks(10))
                .call(g => g.selectAll('.tick text').style('fill', '#ccc'))
                .call(g => g.selectAll('.tick line').style('stroke', '#555'))
                .call(g => g.select('.domain').style('stroke', '#555'));

            // Calculate Pareto frontier
            const frontierPoints = calculateParetoFrontier(dataPoints);

            // Update Pareto line
            // Straight segments through the frontier points, in the order returned.
            const lineGenerator = d3.line()
                .x(d => xScale(d.cost))
                .y(d => yScale(d.score));

            paretoLine.transition().duration(500)
                .attr('d', lineGenerator(frontierPoints));

            // Update circles
            // Keyed by model name so an existing marker is moved to its new position
            // rather than being recreated.
            const circles = svg.selectAll('.model-circle')
                .data(dataPoints, d => d.name);

            // Drop markers for models that have no score in the selected benchmark.
            circles.exit().remove();

            // Newly entering markers get their static styling and hover handlers once.
            const circlesEnter = circles.enter()
                .append('circle')
                .attr('class', 'model-circle')
                .attr('r', 6)
                .attr('opacity', 0.8)
                .attr('stroke', '#fff')
                .attr('stroke-width', 2)
                .on('mouseover', function(event, d) {
                    // Enlarge and fully opacify the hovered marker.
                    d3.select(this)
                        .attr('r', 8)
                        .attr('opacity', 1);

                    // Re-read the benchmark at hover time: this handler was attached when
                    // the marker was created, possibly under a different benchmark.
                    const benchmark = document.querySelector('input[name="benchmark"]:checked').value;
                    const scoreLabel = benchmark === 'general' ? 'General Elo' : benchmark === 'coding' ? 'Coding Elo' : 'SWE-Bench';
                    // Fill the tooltip and place it just right of / above the pointer.
                    tooltip
                        .style('opacity', 1)
                        .html(`
                            <strong>${d.name}</strong><br>
                            Provider: ${d.provider}<br>
                            ${scoreLabel}: ${d.score}<br>
                            Cost: $${d.cost.toFixed(2)}/M tokens<br>
                            Input: $${d.inputPrice}/M &middot; Output: $${d.outputPrice}/M<br>
                            ${d.cachePrice !== null ? `Cache: $${d.cachePrice}/M` : 'Cache: N/A'}
                            ${d.thinkingMode ? ` &middot; Thinking: ${d.thinkingMode}` : ''}
                        `)
                        .style('left', (event.pageX + 10) + 'px')
                        .style('top', (event.pageY - 10) + 'px');
                })
                .on('mouseout', function() {
                    // Restore the resting marker style and hide the tooltip.
                    d3.select(this)
                        .attr('r', 6)
                        .attr('opacity', 0.8);

                    tooltip.style('opacity', 0);
                });

            // Animate both new and existing markers to their current position and color.
            circles.merge(circlesEnter)
                .transition()
                .duration(500)
                .attr('cx', d => xScale(d.cost))
                .attr('cy', d => yScale(d.score))
                .attr('fill', d => providerColors[d.provider]);

            // Update labels for all models (make frontier models more prominent)
            const frontierSet = new Set(frontierPoints.map(d => d.name));

            const labels = svg.selectAll('.model-label')
                .data(dataPoints, d => d.name);

            labels.exit().remove();

            // Labels ignore pointer events so they never block hovering a marker.
            const labelsEnter = labels.enter()
                .append('text')
                .attr('class', 'model-label')
                .attr('font-size', '11px')
                .style('pointer-events', 'none');

            // Each label sits 10px right of and 4px below its marker's center; frontier
            // models are drawn bolder, brighter and fully opaque.
            labels.merge(labelsEnter)
                .transition()
                .duration(500)
                .attr('x', d => xScale(d.cost) + 10)
                .attr('y', d => yScale(d.score) + 4)
                .attr('font-weight', d => frontierSet.has(d.name) ? '600' : '400')
                .attr('fill', d => frontierSet.has(d.name) ? '#e0e0e0' : '#888')
                .attr('opacity', d => frontierSet.has(d.name) ? 1 : 0.6)
                .text(d => d.name);
        }

        /**
         * Load a named workload preset into the ratio and cache sliders, then refresh
         * the value read-outs and the chart. The thinking-overhead slider is left as is.
         *
         * @param {string} preset - One of 'casual', 'coding', 'rag', 'docs', 'google'.
         *   An unrecognized name is reported with console.warn and otherwise ignored.
         */
        function applyPreset(preset) {
            // `ratio` is the N of the slider's 1:N input:output ratio; `cache` is a percentage.
            const presets = {
                casual: { ratio: 1, cache: 0 },
                coding: { ratio: 50, cache: 85 },
                rag: { ratio: 10, cache: 70 },
                // Document analysis is input-heavy (about 100:1 input:output). The slider
                // only expresses 1:N with integer N, so N = 0 (all input) is the closest
                // available setting; N = 1 would make this an even 1:1 mix.
                docs: { ratio: 0, cache: 95 },
                google: { ratio: 5, cache: 0 }
            };

            // Own-property check so inherited keys such as 'toString' are not treated as presets.
            if (!Object.prototype.hasOwnProperty.call(presets, preset)) {
                console.warn(`applyPreset: unknown preset "${preset}"`);
                return;
            }

            const config = presets[preset];
            document.getElementById('inputOutputRatio').value = config.ratio;
            document.getElementById('cacheHitRate').value = config.cache;

            updateDisplays();
            updateChart();
        }

        /**
         * Mirror the current slider positions into their text read-outs:
         * the ratio as "1:N" and the cache-hit and thinking-overhead values as "N%".
         */
        function updateDisplays() {
            const sliderValue = id => parseInt(document.getElementById(id).value);

            // Read all three sliders before touching any read-out element.
            const ratio = sliderValue('inputOutputRatio');
            const cache = sliderValue('cacheHitRate');
            const thinking = sliderValue('thinkingOverhead');

            const readouts = [
                ['ratioDisplay', `1:${ratio}`],
                ['cacheDisplay', `${cache}%`],
                ['thinkingDisplay', `${thinking}%`]
            ];
            for (const [elementId, text] of readouts) {
                document.getElementById(elementId).textContent = text;
            }
        }

        /**
         * Build the provider legend: one swatch-plus-name entry per distinct provider,
         * in order of first appearance in `models`, appended to the #legend element.
         */
        function createLegend() {
            const legend = document.getElementById('legend');
            const alreadyListed = new Set();

            for (const { provider } of models) {
                // Each provider gets exactly one entry, however many models it has.
                if (alreadyListed.has(provider)) {
                    continue;
                }
                alreadyListed.add(provider);

                const entry = document.createElement('div');
                entry.className = 'legend-item';

                const swatch = document.createElement('div');
                swatch.className = 'legend-circle';
                swatch.style.backgroundColor = providerColors[provider];

                const caption = document.createElement('span');
                caption.textContent = provider;

                entry.appendChild(swatch);
                entry.appendChild(caption);
                legend.appendChild(entry);
            }
        }

        // Event listeners
        // Every slider does the same thing while being dragged: refresh its text
        // read-out and then redraw the chart.
        const refreshAll = () => {
            updateDisplays();
            updateChart();
        };
        for (const sliderId of ['inputOutputRatio', 'cacheHitRate', 'thinkingOverhead']) {
            document.getElementById(sliderId).addEventListener('input', refreshAll);
        }

        // Switching benchmark changes no read-out, so only the chart is redrawn.
        for (const radio of document.querySelectorAll('input[name="benchmark"]')) {
            radio.addEventListener('change', updateChart);
        }

        // Initialize: build the legend once, then render the initial control state.
        createLegend();
        updateDisplays();
        updateChart();
    </script>
 </body>
 </html>
	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>LLM Performance vs Cost - Interactive Pareto Frontier (March 2026)</title>
	<script src="https://d3js.org/d3.v7.min.js"></script>
	<style>
	body {
	font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
	margin: 0;
	padding: 20px;
	background: #1a1a1a;
	}

	.container {
	max-width: 1400px;
	margin: 0 auto;
	background: #2a2a2a;
	padding: 30px;
	border-radius: 8px;
	box-shadow: 0 2px 8px rgba(0,0,0,0.3);
	}

	h1 {
	margin-top: 0;
	color: #e0e0e0;
	}

	.controls {
	display: grid;
	grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
	gap: 20px;
	margin: 20px 0;
	padding: 20px;
	background: #333;
	border-radius: 4px;
	}

	.control-group {
	display: flex;
	flex-direction: column;
	}

	label {
	font-weight: 600;
	margin-bottom: 8px;
	color: #ccc;
	font-size: 14px;
	}

	input[type="range"] {
	width: 100%;
	}

	.value-display {
	color: #0066cc;
	font-weight: 600;
	margin-top: 4px;
	font-size: 13px;
	}

	.presets {
	display: flex;
	gap: 10px;
	flex-wrap: wrap;
	}

	button {
	padding: 8px 16px;
	border: 1px solid #555;
	background: #3a3a3a;
	color: #e0e0e0;
	border-radius: 4px;
	cursor: pointer;
	font-size: 13px;
	transition: all 0.2s;
	}

	button:hover {
	background: #4a4a4a;
	}

	button.active {
	background: #0066cc;
	color: white;
	border-color: #0066cc;
	}

	.chart-container {
	margin-top: 30px;
	}

	.tooltip {
	position: absolute;
	padding: 10px;
	background: rgba(0, 0, 0, 0.8);
	color: white;
	border-radius: 4px;
	pointer-events: none;
	font-size: 12px;
	opacity: 0;
	transition: opacity 0.2s;
	}

	.legend {
	display: flex;
	gap: 20px;
	margin-top: 20px;
	flex-wrap: wrap;
	}

	.legend-item {
	display: flex;
	align-items: center;
	gap: 8px;
	font-size: 13px;
	color: #ccc;
	}

	.legend-circle {
	width: 12px;
	height: 12px;
	border-radius: 50%;
	}

	.info {
	margin-top: 20px;
	padding: 15px;
	background: #333;
	border-left: 4px solid #0066cc;
	border-radius: 4px;
	font-size: 14px;
	color: #ccc;
	}
	</style>
	</head>
	<body>
	<div class="container">
	<h1>LLM Performance vs Cost: Interactive Pareto Frontier</h1>

	<div class="info">
	<strong>Why this matters:</strong> Vendor benchmark charts assume fixed token ratios with no caching.
	Real-world usage varies dramatically—coding agents often run 50:1+ output:input ratios with 80-90% cache hits,
	completely changing the cost picture and Pareto frontier.<br><br>
	<strong>Data:</strong> Arena Elo scores from <a href="https://arena.ai/leaderboard" style="color: #6ba3e0;">arena.ai</a> (March 2026).
	Prices are interactive/real-time; batch discounts excluded (too slow for agentic use).
	</div>

	<div class="controls">
	<div class="control-group">
	<label>Benchmark</label>
	<div style="display: flex; gap: 10px;">
	<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
	<input type="radio" name="benchmark" value="general" checked style="width: auto; margin: 0;">
	General Chat (Arena)
	</label>
	<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
	<input type="radio" name="benchmark" value="coding" style="width: auto; margin: 0;">
	Coding (Arena)
	</label>
	<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
	<input type="radio" name="benchmark" value="swe" style="width: auto; margin: 0;">
	SWE-Bench Verified
	</label>
	</div>
	</div>

	<div class="control-group">
	<label for="inputOutputRatio">Input:Output Token Ratio</label>
	<input type="range" id="inputOutputRatio" min="0" max="100" value="5" step="1">
	<div class="value-display" id="ratioDisplay">1:5</div>
	</div>

	<div class="control-group">
	<label for="cacheHitRate">Cache Hit Rate (%)</label>
	<input type="range" id="cacheHitRate" min="0" max="100" value="0" step="5">
	<div class="value-display" id="cacheDisplay">0%</div>
	</div>

	<div class="control-group">
	<label for="thinkingOverhead">Thinking Token Overhead (%)</label>
	<input type="range" id="thinkingOverhead" min="0" max="300" value="100" step="25">
	<div class="value-display" id="thinkingDisplay">100%</div>
	</div>
	</div>

	<div class="presets">
	<button onclick="applyPreset('casual')">Casual Chat (1:1, no cache)</button>
	<button onclick="applyPreset('coding')">Coding Agent (1:50, 85% cache)</button>
	<button onclick="applyPreset('rag')">RAG System (1:10, 70% cache)</button>
	<button onclick="applyPreset('docs')">Document Analysis (100:1, 95% cache)</button>
	<button onclick="applyPreset('google')">Vendor Default (1:5, no cache)</button>
	</div>

	<div class="chart-container">
	<svg id="chart"></svg>
	</div>

	<div class="legend" id="legend"></div>
	</div>

	<div class="tooltip" id="tooltip"></div>

	<script>
	// Model data: Arena Elo scores and API pricing (per million tokens)
	// Arena scores: https://arena.ai/leaderboard (General Chat + Coding, March 4 2026)
	// Pricing: provider docs (Anthropic, OpenAI, Google, xAI, DeepSeek, Alibaba/Qwen)
	//
	// Fields on each entry:
	//   name         - display label; also used as the D3 data-join key in updateChart,
	//                  so it has to be unique across the table
	//   provider     - key into providerColors (circle fill and legend swatch)
	//   generalElo   - Arena Elo, General Chat
	//   codingElo    - Arena Elo, Coding; null where unavailable
	//   sweScore     - SWE-Bench Verified % (https://www.swebench.com/, Feb 2026); null where unavailable
	//   inputPrice   - $ per million input tokens
	//   outputPrice  - $ per million output tokens
	//   cachePrice   - $ per million cached input tokens; null makes calculateCost ignore
	//                  the cache hit rate and bill all input at inputPrice
	//   thinkingMode - 'full' = full thinking overhead, 'minimal' = reduced overhead
	//                  (a quarter of the slider value), null = no thinking
	//
	// A model whose score for the selected benchmark is null is left out of the chart.
	const models = [
	// Anthropic — https://docs.anthropic.com/en/docs/about-claude/models
	{ name: 'Opus 4.6 (thinking)', provider: 'Anthropic', generalElo: 1501, codingElo: 1553, sweScore: 80.8, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, thinkingMode: 'full' },
	{ name: 'Opus 4.6', provider: 'Anthropic', generalElo: 1505, codingElo: 1547, sweScore: 80.8, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, thinkingMode: null },
	{ name: 'Sonnet 4.6', provider: 'Anthropic', generalElo: 1460, codingElo: 1528, sweScore: 79.6, inputPrice: 3.0, outputPrice: 15.0, cachePrice: 0.30, thinkingMode: null },
	{ name: 'Haiku 4.5', provider: 'Anthropic', generalElo: 1406, codingElo: 1477, sweScore: 73.3, inputPrice: 1.0, outputPrice: 5.0, cachePrice: 0.10, thinkingMode: null },
	// Google — https://ai.google.dev/gemini-api/docs/pricing
	{ name: 'Gemini 3.1 Pro', provider: 'Google', generalElo: 1500, codingElo: 1541, sweScore: null, inputPrice: 2.0, outputPrice: 12.0, cachePrice: 0.20, thinkingMode: 'full' },
	{ name: 'Gemini 3 Pro', provider: 'Google', generalElo: 1486, codingElo: 1519, sweScore: 76.2, inputPrice: 2.0, outputPrice: 12.0, cachePrice: 0.20, thinkingMode: 'full' },
	{ name: 'Gemini 3 Flash', provider: 'Google', generalElo: 1473, codingElo: 1507, sweScore: 76.2, inputPrice: 0.50, outputPrice: 3.0, cachePrice: 0.05, thinkingMode: 'full' },
	{ name: 'Gemini 3 Flash (thinking-min)', provider: 'Google', generalElo: 1461, codingElo: 1499, sweScore: null, inputPrice: 0.50, outputPrice: 3.0, cachePrice: 0.05, thinkingMode: 'minimal' },
	{ name: 'Gemini 3.1 Flash-Lite', provider: 'Google', generalElo: 1436, codingElo: 1457, sweScore: null, inputPrice: 0.25, outputPrice: 1.50, cachePrice: 0.025, thinkingMode: 'minimal' },
	{ name: 'Gemini 2.5 Pro', provider: 'Google', generalElo: 1449, codingElo: 1467, sweScore: 53.6, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: 'full' },
	{ name: 'Gemini 2.5 Flash', provider: 'Google', generalElo: 1410, codingElo: null, sweScore: null, inputPrice: 0.30, outputPrice: 2.50, cachePrice: 0.03, thinkingMode: 'full' },
	// xAI
	{ name: 'Grok 4.1 (thinking)', provider: 'xAI', generalElo: 1473, codingElo: 1506, sweScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, thinkingMode: 'full' },
	{ name: 'Grok 4.1', provider: 'xAI', generalElo: 1463, codingElo: 1494, sweScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, thinkingMode: null },
	{ name: 'Grok 4.1 Fast', provider: 'xAI', generalElo: 1430, codingElo: 1457, sweScore: null, inputPrice: 0.20, outputPrice: 0.50, cachePrice: null, thinkingMode: 'full' },
	// OpenAI — https://openai.com/api/pricing/
	{ name: 'GPT-5.2', provider: 'OpenAI', generalElo: 1438, codingElo: 1498, sweScore: 80.0, inputPrice: 1.75, outputPrice: 14.0, cachePrice: 0.175, thinkingMode: null },
	{ name: 'GPT-5.1 (high)', provider: 'OpenAI', generalElo: 1456, codingElo: 1491, sweScore: 76.3, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: 'full' },
	{ name: 'GPT-5', provider: 'OpenAI', generalElo: 1426, codingElo: 1463, sweScore: null, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: null },
	{ name: 'o3', provider: 'OpenAI', generalElo: 1432, codingElo: 1457, sweScore: null, inputPrice: 2.0, outputPrice: 8.0, cachePrice: 1.0, thinkingMode: 'full' },
	{ name: 'o4-mini', provider: 'OpenAI', generalElo: 1391, codingElo: 1430, sweScore: null, inputPrice: 1.10, outputPrice: 4.40, cachePrice: 0.275, thinkingMode: 'full' },
	{ name: 'GPT-4.1', provider: 'OpenAI', generalElo: 1413, codingElo: 1455, sweScore: null, inputPrice: 2.0, outputPrice: 8.0, cachePrice: 0.50, thinkingMode: null },
	{ name: 'GPT-4.1 Mini', provider: 'OpenAI', generalElo: 1382, codingElo: 1432, sweScore: null, inputPrice: 0.40, outputPrice: 1.60, cachePrice: 0.10, thinkingMode: null },
	// DeepSeek — https://api-docs.deepseek.com/quick_start/pricing
	{ name: 'DeepSeek V3.2', provider: 'DeepSeek', generalElo: 1421, codingElo: 1467, sweScore: 73.0, inputPrice: 0.14, outputPrice: 0.28, cachePrice: null, thinkingMode: null },
	{ name: 'DeepSeek V3.2 (thinking)', provider: 'DeepSeek', generalElo: 1420, codingElo: 1470, sweScore: 73.0, inputPrice: 0.14, outputPrice: 0.28, cachePrice: null, thinkingMode: 'full' },
	{ name: 'DeepSeek R1', provider: 'DeepSeek', generalElo: 1419, codingElo: 1463, sweScore: null, inputPrice: 0.55, outputPrice: 2.19, cachePrice: null, thinkingMode: 'full' },
	// Qwen/Alibaba — https://help.aliyun.com/zh/model-studio/getting-started/models
	{ name: 'Qwen3 Max', provider: 'Qwen', generalElo: 1434, codingElo: 1482, sweScore: null, inputPrice: 1.20, outputPrice: 6.0, cachePrice: null, thinkingMode: null },
	{ name: 'Qwen3-235B', provider: 'Qwen', generalElo: 1422, codingElo: 1471, sweScore: null, inputPrice: 0.20, outputPrice: 1.0, cachePrice: null, thinkingMode: null },
	];

	// Brand-ish hex color per provider. Looked up by a model's `provider`
	// string for both the chart circles and the legend swatches.
	const providerColors = {
		Google: '#4285f4',
		Anthropic: '#d4a373',
		xAI: '#1da1f2',
		OpenAI: '#10a37f',
		DeepSeek: '#7c3aed',
		Qwen: '#ef4444',
	};

	// Plot geometry: the <svg> is 1200x600 overall; `width` and `height` are the
	// inner plotting area that remains once the margins are taken off.
	const margin = { top: 20, right: 120, bottom: 60, left: 60 };
	const width = 1200 - (margin.left + margin.right);
	const height = 600 - (margin.top + margin.bottom);

	// Size the #chart element, then work inside a <g> shifted by the top/left
	// margins so every later coordinate is relative to the plotting area.
	const svg = d3.select('#chart')
		.attr('width', margin.left + width + margin.right)
		.attr('height', margin.top + height + margin.bottom)
		.append('g')
		.attr('transform', `translate(${margin.left},${margin.top})`);

	// Scales. Domains are filled in by updateChart(); only the pixel ranges are
	// fixed here. The x range is reversed so expensive models sit on the left
	// and cheap ones on the right (log scale).
	const xScale = d3.scaleLog().range([width, 0]);
	const yScale = d3.scaleLinear().range([height, 0]);

	// Axis containers: x along the bottom edge, y along the left edge
	const xAxis = svg.append('g').attr('transform', `translate(0,${height})`);
	const yAxis = svg.append('g');

	// Typography shared by the two axis titles
	const styleAxisLabel = label => label
		.attr('text-anchor', 'middle')
		.style('font-size', '14px')
		.style('font-weight', '600')
		.style('fill', '#ccc');

	// X-axis title, centered under the plot
	svg.append('text')
		.call(styleAxisLabel)
		.attr('x', width / 2)
		.attr('y', height + 50)
		.text('$ Price per million tokens (log scale)');

	// Y-axis title, rotated to read bottom-to-top; its text is replaced by
	// updateChart() whenever the benchmark changes
	const yAxisLabel = svg.append('text')
		.call(styleAxisLabel)
		.attr('transform', 'rotate(-90)')
		.attr('x', -height / 2)
		.attr('y', -45)
		.text('Arena Elo Score (General Chat)');

	// Dashed red path for the Pareto frontier; its `d` attribute is set in updateChart()
	const paretoLine = svg.append('path')
		.attr('stroke', '#ff6b6b')
		.attr('stroke-width', 2)
		.attr('stroke-dasharray', '5,5')
		.attr('fill', 'none');

	// Hover tooltip element (plain HTML, outside the SVG)
	const tooltip = d3.select('#tooltip');

	/**
	 * Dollar cost of a given token mix on one model.
	 *
	 * @param {Object} model - Entry from the models table; reads `inputPrice`,
	 *     `outputPrice`, `cachePrice` (all $ per million tokens) and `thinkingMode`.
	 * @param {number} inputTokens - Input tokens in the mix.
	 * @param {number} outputTokens - Output tokens before any thinking overhead.
	 * @param {number} cacheHitRate - Percentage (0-100) of input tokens billed at
	 *     `cachePrice`; ignored when the model's `cachePrice` is null.
	 * @param {number} thinkingOverhead - Extra output tokens, as a percentage of
	 *     `outputTokens`. Applied in full to 'full' thinking models, at one
	 *     quarter to 'minimal' ones, and not at all otherwise.
	 * @returns {number} Cost in dollars.
	 */
	function calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead) {
		// Prices are quoted per million tokens
		const TOKENS_PER_PRICE_UNIT = 1000000;

		// Without a cache price every input token is billed at the regular rate
		const cacheHits = model.cachePrice === null ? 0 : inputTokens * (cacheHitRate / 100);
		const uncachedInput = inputTokens - cacheHits;

		// Thinking tokens are billed as additional output tokens
		let billedOutput;
		switch (model.thinkingMode) {
			case 'full':
				billedOutput = outputTokens * (1 + thinkingOverhead / 100);
				break;
			case 'minimal':
				// 25% of the slider overhead
				billedOutput = outputTokens * (1 + thinkingOverhead / 400);
				break;
			default:
				billedOutput = outputTokens;
		}

		const baseCost = (uncachedInput * model.inputPrice + billedOutput * model.outputPrice) / TOKENS_PER_PRICE_UNIT;
		if (cacheHits > 0) {
			return baseCost + (cacheHits * model.cachePrice) / TOKENS_PER_PRICE_UNIT;
		}
		return baseCost;
	}

	/**
	 * Select the points that are not dominated on (cost, score): walking from
	 * cheapest to most expensive, a point is kept only if it scores strictly
	 * higher than every cheaper-or-equal point already kept.
	 *
	 * On the chart's inverted x-axis this walk runs right (cheap) to left
	 * (expensive).
	 *
	 * @param {Array<{cost: number, score: number}>} points - Candidate points;
	 *     the array is not modified.
	 * @returns {Array} The frontier, as the original point objects, ordered by
	 *     ascending cost.
	 */
	function calculateParetoFrontier(points) {
		// Cheapest first. Equal-cost points are ordered best score first, so the
		// strict `>` below rejects their lower-scoring siblings; sorting on cost
		// alone let a dominated point through whenever it happened to precede a
		// same-price, higher-scoring one.
		const sorted = [...points].sort((a, b) => (a.cost - b.cost) || (b.score - a.score));

		const frontier = [];
		let maxScore = -Infinity;

		for (const point of sorted) {
			if (point.score > maxScore) {
				frontier.push(point);
				maxScore = point.score;
			}
		}

		return frontier;
	}

	/**
	 * Recompute every model's cost from the current control values and redraw
	 * the chart: scale domains, both axes, the Pareto line, the model circles
	 * and their text labels. Axis, line, circle and label updates run as
	 * 500 ms transitions.
	 *
	 * Reads the #inputOutputRatio, #cacheHitRate and #thinkingOverhead inputs
	 * and the checked `benchmark` radio. Takes no arguments, returns nothing.
	 */
	function updateChart() {
	// Get current settings
	const inputOutputRatio = parseInt(document.getElementById('inputOutputRatio').value);
	const cacheHitRate = parseInt(document.getElementById('cacheHitRate').value);
	const thinkingOverhead = parseInt(document.getElementById('thinkingOverhead').value);
	const benchmark = document.querySelector('input[name="benchmark"]:checked').value;

	// Calculate costs for each model (per 1M total tokens)
	// With slider value r the split is input = 1/(1+r), output = r/(1+r), i.e.
	// r output tokens per input token, matching the "1:r" read-out.
	// NOTE(review): presets such as "Coding Agent (1:50, 85% cache)" read like
	// input-heavy workloads, which would be the opposite split — confirm the
	// intended direction of the ratio.
	const inputRatio = 1 / (1 + inputOutputRatio);
	const outputRatio = inputOutputRatio / (1 + inputOutputRatio);

	// The 1M budget is before thinking overhead; calculateCost adds thinking
	// tokens on top of outputTokens.
	const inputTokens = 1000000 * inputRatio;
	const outputTokens = 1000000 * outputRatio;

	// Filter models based on benchmark and add cost
	// Any benchmark value other than 'general' or 'coding' selects sweScore.
	const scoreField = benchmark === 'general' ? 'generalElo' : benchmark === 'coding' ? 'codingElo' : 'sweScore';
	const allDataPoints = models.map(model => ({
	...model,
	cost: calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead),
	score: model[scoreField]
	}));

	// Filter out models without scores for selected benchmark
	const dataPoints = allDataPoints.filter(d => d.score !== null);

	// Update Y-axis label based on benchmark
	const yLabels = { general: 'Arena Elo Score (General Chat)', coding: 'Arena Elo Score (Coding)', swe: 'SWE-Bench Verified Score (%)' };
	yAxisLabel.text(yLabels[benchmark]);

	// Update scales
	const minCost = d3.min(dataPoints, d => d.cost);
	const maxCost = d3.max(dataPoints, d => d.cost);
	const minScore = d3.min(dataPoints, d => d.score);
	const maxScore = d3.max(dataPoints, d => d.score);

	// Log scale needs positive values, add padding
	// 20% padding on each side; the lower bound never drops below $0.01.
	xScale.domain([Math.max(minCost * 0.8, 0.01), maxCost * 1.2]);
	// Snap the y domain outward to multiples of 10 (SWE-Bench %) or 50 (Elo).
	if (benchmark === 'swe') {
	yScale.domain([Math.floor(minScore / 10) * 10, Math.ceil(maxScore / 10) * 10]);
	} else {
	yScale.domain([Math.floor(minScore / 50) * 50, Math.ceil(maxScore / 50) * 50]);
	}

	// Update axes
	// Tick labels show fewer decimals as the price grows: $0.05, $0.5, $5.
	xAxis.transition().duration(500)
	.call(d3.axisBottom(xScale)
	.ticks(10)
	.tickFormat(d => {
	if (d >= 1) return `$${d.toFixed(0)}`;
	if (d >= 0.1) return `$${d.toFixed(1)}`;
	return `$${d.toFixed(2)}`;
	}))
	// Recolor tick text, tick marks and the axis line for the dark background
	.call(g => g.selectAll('.tick text').style('fill', '#ccc'))
	.call(g => g.selectAll('.tick line').style('stroke', '#555'))
	.call(g => g.select('.domain').style('stroke', '#555'));

	yAxis.transition().duration(500)
	.call(d3.axisLeft(yScale).ticks(10))
	.call(g => g.selectAll('.tick text').style('fill', '#ccc'))
	.call(g => g.selectAll('.tick line').style('stroke', '#555'))
	.call(g => g.select('.domain').style('stroke', '#555'));

	// Calculate Pareto frontier
	const frontierPoints = calculateParetoFrontier(dataPoints);

	// Update Pareto line
	// Straight segments through the frontier points, in the order returned
	const lineGenerator = d3.line()
	.x(d => xScale(d.cost))
	.y(d => yScale(d.score));

	paretoLine.transition().duration(500)
	.attr('d', lineGenerator(frontierPoints));

	// Update circles
	// Joined by model name, so a model keeps its circle across redraws
	const circles = svg.selectAll('.model-circle')
	.data(dataPoints, d => d.name);

	// Models with no score for the new benchmark lose their circle immediately
	circles.exit().remove();

	// Static styling and hover handlers are set once, on newly created circles
	const circlesEnter = circles.enter()
	.append('circle')
	.attr('class', 'model-circle')
	.attr('r', 6)
	.attr('opacity', 0.8)
	.attr('stroke', '#fff')
	.attr('stroke-width', 2)
	.on('mouseover', function(event, d) {
	// Enlarge and fully opaque while hovered
	d3.select(this)
	.attr('r', 8)
	.attr('opacity', 1);

	// The benchmark is re-read from the DOM at hover time rather than taken
	// from the enclosing updateChart() call
	const benchmark = document.querySelector('input[name="benchmark"]:checked').value;
	const scoreLabel = benchmark === 'general' ? 'General Elo' : benchmark === 'coding' ? 'Coding Elo' : 'SWE-Bench';
	// Show the tooltip 10px right of and 10px above the pointer (page coordinates)
	tooltip
	.style('opacity', 1)
	.html(`
	<strong>${d.name}</strong><br>
	Provider: ${d.provider}<br>
	${scoreLabel}: ${d.score}<br>
	Cost: $${d.cost.toFixed(2)}/M tokens<br>
	Input: $${d.inputPrice}/M · Output: $${d.outputPrice}/M<br>
	${d.cachePrice !== null ? `Cache: $${d.cachePrice}/M` : 'Cache: N/A'}
	${d.thinkingMode ? ` · Thinking: ${d.thinkingMode}` : ''}
	`)
	.style('left', (event.pageX + 10) + 'px')
	.style('top', (event.pageY - 10) + 'px');
	})
	.on('mouseout', function() {
	// Restore the resting size/opacity and hide the tooltip
	d3.select(this)
	.attr('r', 6)
	.attr('opacity', 0.8);

	tooltip.style('opacity', 0);
	});

	// Position and color every circle (new and existing) with a transition
	circles.merge(circlesEnter)
	.transition()
	.duration(500)
	.attr('cx', d => xScale(d.cost))
	.attr('cy', d => yScale(d.score))
	.attr('fill', d => providerColors[d.provider]);

	// Update labels for all models (make frontier models more prominent)
	const frontierSet = new Set(frontierPoints.map(d => d.name));

	const labels = svg.selectAll('.model-label')
	.data(dataPoints, d => d.name);

	labels.exit().remove();

	// Labels ignore pointer events so they do not block hovering the circles
	const labelsEnter = labels.enter()
	.append('text')
	.attr('class', 'model-label')
	.attr('font-size', '11px')
	.style('pointer-events', 'none');

	// Each label sits 10px right of and 4px below its circle's center;
	// frontier models are bold and bright, the rest dimmed
	labels.merge(labelsEnter)
	.transition()
	.duration(500)
	.attr('x', d => xScale(d.cost) + 10)
	.attr('y', d => yScale(d.score) + 4)
	.attr('font-weight', d => frontierSet.has(d.name) ? '600' : '400')
	.attr('fill', d => frontierSet.has(d.name) ? '#e0e0e0' : '#888')
	.attr('opacity', d => frontierSet.has(d.name) ? 1 : 0.6)
	.text(d => d.name);
	}

	/**
	 * Move the ratio and cache sliders to a named workload preset, then refresh
	 * the read-outs and the chart. The thinking-overhead slider is left as is.
	 *
	 * @param {string} preset - One of 'casual', 'coding', 'rag', 'docs', 'google'.
	 *     Any other value logs a warning and changes nothing.
	 */
	function applyPreset(preset) {
		// ratio -> #inputOutputRatio slider, cache -> #cacheHitRate slider (%)
		const presets = {
			casual: { ratio: 1, cache: 0 },
			coding: { ratio: 50, cache: 85 },
			rag: { ratio: 10, cache: 70 },
			// NOTE(review): the button for this preset is labelled
			// "Document Analysis (100:1, 95% cache)" but the ratio applied is 1 —
			// confirm which of the two is intended.
			docs: { ratio: 1, cache: 95 },
			google: { ratio: 5, cache: 0 }
		};

		// Own-property check: an unknown name used to throw on `config.ratio`,
		// and an inherited one (e.g. 'toString') would have written `undefined`
		// into the sliders.
		if (!Object.prototype.hasOwnProperty.call(presets, preset)) {
			console.warn(`applyPreset: unknown preset "${preset}"`);
			return;
		}

		const config = presets[preset];
		document.getElementById('inputOutputRatio').value = config.ratio;
		document.getElementById('cacheHitRate').value = config.cache;

		updateDisplays();
		updateChart();
	}

	/**
	 * Copy the three slider positions into their text read-outs:
	 * "1:<ratio>", "<cache>%" and "<thinking>%".
	 */
	function updateDisplays() {
		// Read every slider first, then write, so a missing control aborts
		// before any read-out has been touched
		const [ratio, cache, thinking] = ['inputOutputRatio', 'cacheHitRate', 'thinkingOverhead']
			.map(sliderId => parseInt(document.getElementById(sliderId).value));

		const readouts = [
			['ratioDisplay', `1:${ratio}`],
			['cacheDisplay', `${cache}%`],
			['thinkingDisplay', `${thinking}%`],
		];
		for (const [displayId, text] of readouts) {
			document.getElementById(displayId).textContent = text;
		}
	}

	/**
	 * Fill #legend with one entry per distinct provider in the models table,
	 * in order of first appearance: a colored swatch followed by the provider
	 * name. Appends to whatever #legend already contains.
	 */
	function createLegend() {
		const container = document.getElementById('legend');

		// A Set iterates in insertion order, which de-duplicates providers
		// while keeping the order they first appear in `models`
		for (const providerName of new Set(models.map(entry => entry.provider))) {
			const row = document.createElement('div');
			row.className = 'legend-item';

			const swatch = document.createElement('div');
			swatch.className = 'legend-circle';
			swatch.style.backgroundColor = providerColors[providerName];

			const caption = document.createElement('span');
			caption.textContent = providerName;

			row.appendChild(swatch);
			row.appendChild(caption);
			container.appendChild(row);
		}
	}

	// Event wiring: moving any slider refreshes both the text read-outs and the chart
	for (const sliderId of ['inputOutputRatio', 'cacheHitRate', 'thinkingOverhead']) {
		document.getElementById(sliderId).addEventListener('input', () => {
			updateDisplays();
			updateChart();
		});
	}

	// Switching benchmark only needs a redraw; it has no read-out of its own
	for (const radio of document.querySelectorAll('input[name="benchmark"]')) {
		radio.addEventListener('change', updateChart);
	}

	// First render: legend once, then read-outs and chart from the controls' initial values
	createLegend();
	updateDisplays();
	updateChart();
	</script>
	</body>
	</html>