Skip to content

Instantly share code, notes, and snippets.

@lizthegrey
Last active March 4, 2026 20:23
Show Gist options
  • Select an option

  • Save lizthegrey/eef57b3d40430ecea339604275b4f1d6 to your computer and use it in GitHub Desktop.

Select an option

Save lizthegrey/eef57b3d40430ecea339604275b4f1d6 to your computer and use it in GitHub Desktop.
Interactive LLM Performance vs Cost Pareto Frontier - Compare models across different usage patterns (input:output ratios, cache hit rates, thinking overhead)
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LLM Performance vs Cost - Interactive Pareto Frontier (March 2026)</title>
<script src="https://d3js.org/d3.v7.min.js"></script>
<style>
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
margin: 0;
padding: 20px;
background: #1a1a1a;
}
.container {
max-width: 1400px;
margin: 0 auto;
background: #2a2a2a;
padding: 30px;
border-radius: 8px;
box-shadow: 0 2px 8px rgba(0,0,0,0.3);
}
h1 {
margin-top: 0;
color: #e0e0e0;
}
.controls {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 20px;
margin: 20px 0;
padding: 20px;
background: #333;
border-radius: 4px;
}
.control-group {
display: flex;
flex-direction: column;
}
label {
font-weight: 600;
margin-bottom: 8px;
color: #ccc;
font-size: 14px;
}
input[type="range"] {
width: 100%;
}
.value-display {
color: #0066cc;
font-weight: 600;
margin-top: 4px;
font-size: 13px;
}
.presets {
display: flex;
gap: 10px;
flex-wrap: wrap;
}
button {
padding: 8px 16px;
border: 1px solid #555;
background: #3a3a3a;
color: #e0e0e0;
border-radius: 4px;
cursor: pointer;
font-size: 13px;
transition: all 0.2s;
}
button:hover {
background: #4a4a4a;
}
button.active {
background: #0066cc;
color: white;
border-color: #0066cc;
}
.chart-container {
margin-top: 30px;
}
.tooltip {
position: absolute;
padding: 10px;
background: rgba(0, 0, 0, 0.8);
color: white;
border-radius: 4px;
pointer-events: none;
font-size: 12px;
opacity: 0;
transition: opacity 0.2s;
}
.legend {
display: flex;
gap: 20px;
margin-top: 20px;
flex-wrap: wrap;
}
.legend-item {
display: flex;
align-items: center;
gap: 8px;
font-size: 13px;
color: #ccc;
}
.legend-circle {
width: 12px;
height: 12px;
border-radius: 50%;
}
.info {
margin-top: 20px;
padding: 15px;
background: #333;
border-left: 4px solid #0066cc;
border-radius: 4px;
font-size: 14px;
color: #ccc;
}
</style>
</head>
<body>
<div class="container">
<h1>LLM Performance vs Cost: Interactive Pareto Frontier</h1>
<div class="info">
<strong>Why this matters:</strong> Vendor benchmark charts assume fixed token ratios with no caching.
Real-world usage varies dramatically&mdash;coding agents often run 50:1+ output:input ratios with 80-90% cache hits,
completely changing the cost picture and Pareto frontier.<br><br>
<strong>Data:</strong> Arena Elo scores from <a href="https://arena.ai/leaderboard" style="color: #6ba3e0;">arena.ai</a> (March 2026).
Prices are interactive/real-time; batch discounts excluded (too slow for agentic use).
</div>
<div class="controls">
<div class="control-group">
<label>Benchmark</label>
<div style="display: flex; gap: 10px;">
<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
<input type="radio" name="benchmark" value="general" checked style="width: auto; margin: 0;">
General Chat (Arena)
</label>
<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
<input type="radio" name="benchmark" value="coding" style="width: auto; margin: 0;">
Coding (Arena)
</label>
<label style="display: flex; align-items: center; gap: 6px; font-weight: normal;">
<input type="radio" name="benchmark" value="swe" style="width: auto; margin: 0;">
SWE-Bench Verified
</label>
</div>
</div>
<div class="control-group">
<label for="inputOutputRatio">Input:Output Token Ratio</label>
<input type="range" id="inputOutputRatio" min="0" max="100" value="5" step="1">
<div class="value-display" id="ratioDisplay">1:5</div>
</div>
<div class="control-group">
<label for="cacheHitRate">Cache Hit Rate (%)</label>
<input type="range" id="cacheHitRate" min="0" max="100" value="0" step="5">
<div class="value-display" id="cacheDisplay">0%</div>
</div>
<div class="control-group">
<label for="thinkingOverhead">Thinking Token Overhead (%)</label>
<input type="range" id="thinkingOverhead" min="0" max="300" value="100" step="25">
<div class="value-display" id="thinkingDisplay">100%</div>
</div>
</div>
<div class="presets">
<button onclick="applyPreset('casual')">Casual Chat (1:1, no cache)</button>
<button onclick="applyPreset('coding')">Coding Agent (1:50, 85% cache)</button>
<button onclick="applyPreset('rag')">RAG System (1:10, 70% cache)</button>
<button onclick="applyPreset('docs')">Document Analysis (100:1, 95% cache)</button>
<button onclick="applyPreset('google')">Vendor Default (1:5, no cache)</button>
</div>
<div class="chart-container">
<svg id="chart"></svg>
</div>
<div class="legend" id="legend"></div>
</div>
<div class="tooltip" id="tooltip"></div>
<script>
// Model data: Arena Elo scores and API pricing (per million tokens)
// One record per model variant plotted on the chart. Fields:
//   name         - label drawn next to the point; also the key used for the D3 data joins in updateChart
//   provider     - key into providerColors (point fill and legend swatch)
//   generalElo   - Arena General Chat score
//   codingElo    - Arena Coding score; null where unavailable (model is hidden on that benchmark)
//   sweScore     - SWE-Bench Verified %; null where unavailable (model is hidden on that benchmark)
//   inputPrice   - $ per million input tokens
//   outputPrice  - $ per million output tokens
//   cachePrice   - $ per million cached input tokens; null makes calculateCost ignore the
//                  cache-hit slider and bill all input at inputPrice
//   thinkingMode - how calculateCost applies the thinking-overhead slider (see below)
// Arena scores: https://arena.ai/leaderboard (General Chat + Coding, March 4 2026)
// Pricing: provider docs (Anthropic, OpenAI, Google, xAI, DeepSeek, Alibaba/Qwen)
// thinkingMode: 'full' = full thinking overhead, 'minimal' = reduced overhead, null = no thinking
// sweScore: SWE-Bench Verified % (https://www.swebench.com/, Feb 2026) — null where unavailable
const models = [
// Anthropic — https://docs.anthropic.com/en/docs/about-claude/models
{ name: 'Opus 4.6 (thinking)', provider: 'Anthropic', generalElo: 1501, codingElo: 1553, sweScore: 80.8, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, thinkingMode: 'full' },
{ name: 'Opus 4.6', provider: 'Anthropic', generalElo: 1505, codingElo: 1547, sweScore: 80.8, inputPrice: 5.0, outputPrice: 25.0, cachePrice: 0.50, thinkingMode: null },
{ name: 'Sonnet 4.6', provider: 'Anthropic', generalElo: 1460, codingElo: 1528, sweScore: 79.6, inputPrice: 3.0, outputPrice: 15.0, cachePrice: 0.30, thinkingMode: null },
{ name: 'Haiku 4.5', provider: 'Anthropic', generalElo: 1406, codingElo: 1477, sweScore: 73.3, inputPrice: 1.0, outputPrice: 5.0, cachePrice: 0.10, thinkingMode: null },
// Google — https://ai.google.dev/gemini-api/docs/pricing
{ name: 'Gemini 3.1 Pro', provider: 'Google', generalElo: 1500, codingElo: 1541, sweScore: null, inputPrice: 2.0, outputPrice: 12.0, cachePrice: 0.20, thinkingMode: 'full' },
{ name: 'Gemini 3 Pro', provider: 'Google', generalElo: 1486, codingElo: 1519, sweScore: 76.2, inputPrice: 2.0, outputPrice: 12.0, cachePrice: 0.20, thinkingMode: 'full' },
{ name: 'Gemini 3 Flash', provider: 'Google', generalElo: 1473, codingElo: 1507, sweScore: 76.2, inputPrice: 0.50, outputPrice: 3.0, cachePrice: 0.05, thinkingMode: 'full' },
{ name: 'Gemini 3 Flash (thinking-min)', provider: 'Google', generalElo: 1461, codingElo: 1499, sweScore: null, inputPrice: 0.50, outputPrice: 3.0, cachePrice: 0.05, thinkingMode: 'minimal' },
{ name: 'Gemini 3.1 Flash-Lite', provider: 'Google', generalElo: 1436, codingElo: 1457, sweScore: null, inputPrice: 0.25, outputPrice: 1.50, cachePrice: 0.025, thinkingMode: 'minimal' },
{ name: 'Gemini 2.5 Pro', provider: 'Google', generalElo: 1449, codingElo: 1467, sweScore: 53.6, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: 'full' },
{ name: 'Gemini 2.5 Flash', provider: 'Google', generalElo: 1410, codingElo: null, sweScore: null, inputPrice: 0.30, outputPrice: 2.50, cachePrice: 0.03, thinkingMode: 'full' },
// xAI
{ name: 'Grok 4.1 (thinking)', provider: 'xAI', generalElo: 1473, codingElo: 1506, sweScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, thinkingMode: 'full' },
{ name: 'Grok 4.1', provider: 'xAI', generalElo: 1463, codingElo: 1494, sweScore: null, inputPrice: 3.0, outputPrice: 15.0, cachePrice: null, thinkingMode: null },
{ name: 'Grok 4.1 Fast', provider: 'xAI', generalElo: 1430, codingElo: 1457, sweScore: null, inputPrice: 0.20, outputPrice: 0.50, cachePrice: null, thinkingMode: 'full' },
// OpenAI — https://openai.com/api/pricing/
{ name: 'GPT-5.2', provider: 'OpenAI', generalElo: 1438, codingElo: 1498, sweScore: 80.0, inputPrice: 1.75, outputPrice: 14.0, cachePrice: 0.175, thinkingMode: null },
{ name: 'GPT-5.1 (high)', provider: 'OpenAI', generalElo: 1456, codingElo: 1491, sweScore: 76.3, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: 'full' },
{ name: 'GPT-5', provider: 'OpenAI', generalElo: 1426, codingElo: 1463, sweScore: null, inputPrice: 1.25, outputPrice: 10.0, cachePrice: 0.125, thinkingMode: null },
{ name: 'o3', provider: 'OpenAI', generalElo: 1432, codingElo: 1457, sweScore: null, inputPrice: 2.0, outputPrice: 8.0, cachePrice: 1.0, thinkingMode: 'full' },
{ name: 'o4-mini', provider: 'OpenAI', generalElo: 1391, codingElo: 1430, sweScore: null, inputPrice: 1.10, outputPrice: 4.40, cachePrice: 0.275, thinkingMode: 'full' },
{ name: 'GPT-4.1', provider: 'OpenAI', generalElo: 1413, codingElo: 1455, sweScore: null, inputPrice: 2.0, outputPrice: 8.0, cachePrice: 0.50, thinkingMode: null },
{ name: 'GPT-4.1 Mini', provider: 'OpenAI', generalElo: 1382, codingElo: 1432, sweScore: null, inputPrice: 0.40, outputPrice: 1.60, cachePrice: 0.10, thinkingMode: null },
// DeepSeek — https://api-docs.deepseek.com/quick_start/pricing
{ name: 'DeepSeek V3.2', provider: 'DeepSeek', generalElo: 1421, codingElo: 1467, sweScore: 73.0, inputPrice: 0.14, outputPrice: 0.28, cachePrice: null, thinkingMode: null },
{ name: 'DeepSeek V3.2 (thinking)', provider: 'DeepSeek', generalElo: 1420, codingElo: 1470, sweScore: 73.0, inputPrice: 0.14, outputPrice: 0.28, cachePrice: null, thinkingMode: 'full' },
{ name: 'DeepSeek R1', provider: 'DeepSeek', generalElo: 1419, codingElo: 1463, sweScore: null, inputPrice: 0.55, outputPrice: 2.19, cachePrice: null, thinkingMode: 'full' },
// Qwen/Alibaba — https://help.aliyun.com/zh/model-studio/getting-started/models
{ name: 'Qwen3 Max', provider: 'Qwen', generalElo: 1434, codingElo: 1482, sweScore: null, inputPrice: 1.20, outputPrice: 6.0, cachePrice: null, thinkingMode: null },
{ name: 'Qwen3-235B', provider: 'Qwen', generalElo: 1422, codingElo: 1471, sweScore: null, inputPrice: 0.20, outputPrice: 1.0, cachePrice: null, thinkingMode: null },
];
// Fill colour per provider, shared by the chart points and the legend swatches.
// Keys must match the `provider` strings used in the model records.
const providerColors = {
  Google: '#4285f4',
  Anthropic: '#d4a373',
  xAI: '#1da1f2',
  OpenAI: '#10a37f',
  DeepSeek: '#7c3aed',
  Qwen: '#ef4444',
};
// Chart dimensions: the plot area is a 1200x600 canvas minus the margins
const margin = { top: 20, right: 120, bottom: 60, left: 60 };
const width = 1200 - margin.left - margin.right;
const height = 600 - margin.top - margin.bottom;

// Create SVG: the <svg> spans the full canvas; all drawing happens inside a
// <g> shifted by the left/top margins so plot coordinates start at (0, 0)
const svg = d3.select('#chart')
  .attr('width', margin.left + width + margin.right)
  .attr('height', margin.top + height + margin.bottom)
  .append('g')
  .attr('transform', `translate(${margin.left},${margin.top})`);

// Scales: domains are set on every redraw. The x range is reversed, so on the
// log cost axis expensive models sit on the left and cheap ones on the right
// (like Google's chart); y maps score upwards.
const xScale = d3.scaleLog().range([width, 0]);
const yScale = d3.scaleLinear().range([height, 0]);

// Axis containers, filled in by updateChart (x axis sits at the bottom edge)
const xAxis = svg.append('g')
  .attr('transform', `translate(0,${height})`);
const yAxis = svg.append('g');

// Typography shared by both axis titles
const styleAxisTitle = (title) => title
  .attr('text-anchor', 'middle')
  .style('font-size', '14px')
  .style('font-weight', '600')
  .style('fill', '#ccc');

// X axis title, centred below the axis
svg.append('text')
  .attr('x', width / 2)
  .attr('y', height + 50)
  .call(styleAxisTitle)
  .text('$ Price per million tokens (log scale)');

// Y axis title, rotated to read bottom-to-top; its text is replaced whenever
// the benchmark selection changes
const yAxisLabel = svg.append('text')
  .attr('transform', 'rotate(-90)')
  .attr('x', -height / 2)
  .attr('y', -45)
  .call(styleAxisTitle)
  .text('Arena Elo Score (General Chat)');

// Pareto frontier line: dashed red path whose geometry is set on each redraw
const paretoLine = svg.append('path')
  .attr('stroke', '#ff6b6b')
  .attr('stroke-width', 2)
  .attr('stroke-dasharray', '5,5')
  .attr('fill', 'none');

// Tooltip element shared by every point's hover handlers
const tooltip = d3.select('#tooltip');
/**
 * Price a workload of input and output tokens for one model.
 *
 * @param {object} model - Model record. Reads `inputPrice`, `outputPrice`
 *   and `cachePrice` (all per million tokens) and `thinkingMode`.
 * @param {number} inputTokens - Number of input tokens in the workload.
 * @param {number} outputTokens - Number of visible output tokens, before any
 *   thinking overhead is added.
 * @param {number} cacheHitRate - Percentage (0-100) of input tokens billed at
 *   the cached rate. Ignored when the model has no `cachePrice`.
 * @param {number} thinkingOverhead - Extra output tokens spent on thinking, as
 *   a percentage of `outputTokens`. Applied in full for `thinkingMode: 'full'`,
 *   at a quarter for `'minimal'`, and not at all otherwise.
 * @returns {number} Cost in the same currency unit as the model's prices.
 */
function calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead) {
  // `!= null` rejects both null and undefined, so a record that omits
  // cachePrice is priced as "no caching" instead of producing NaN.
  const hasCachePricing = model.cachePrice != null;
  const cachedTokens = hasCachePricing ? inputTokens * (cacheHitRate / 100) : 0;
  const regularInputTokens = inputTokens - cachedTokens;

  // Thinking tokens are billed as output tokens.
  let effectiveOutputTokens = outputTokens;
  if (model.thinkingMode === 'full') {
    // Full thinking models: full slider overhead
    effectiveOutputTokens = outputTokens * (1 + thinkingOverhead / 100);
  } else if (model.thinkingMode === 'minimal') {
    // Minimal thinking models: 25% of slider overhead
    effectiveOutputTokens = outputTokens * (1 + thinkingOverhead / 400);
  }
  // Non-thinking models (any other thinkingMode): no overhead applied

  // Prices are per million tokens, so divide token counts by 1M
  let cost = (regularInputTokens * model.inputPrice + effectiveOutputTokens * model.outputPrice) / 1000000;
  if (cachedTokens > 0) {
    cost += (cachedTokens * model.cachePrice) / 1000000;
  }
  return cost;
}
/**
 * Select the Pareto-optimal points: those for which no other point is both
 * at most as expensive and higher scoring.
 *
 * @param {Array<{cost: number, score: number}>} points - Candidate points.
 *   The array is not mutated.
 * @returns {Array<{cost: number, score: number}>} Frontier points ordered from
 *   cheapest to most expensive, with strictly increasing scores.
 */
function calculateParetoFrontier(points) {
  // Sort by cost ASCENDING (cheap to expensive); on the inverted x axis this
  // runs right to left. Equal-cost points are ordered best score first, so a
  // cheaper-or-equal but lower-scoring point can never slip onto the frontier
  // just because it appeared earlier in the input.
  const byCost = [...points].sort((a, b) => a.cost - b.cost || b.score - a.score);

  // Keep only points that improve on the best score seen so far.
  const frontier = [];
  let bestScore = -Infinity;
  for (const point of byCost) {
    if (point.score > bestScore) {
      frontier.push(point);
      bestScore = point.score;
    }
  }
  return frontier;
}
/**
 * Recompute every model's cost from the current control values and redraw the
 * whole chart: scales, axes, Pareto frontier line, points and labels.
 *
 * Reads the three sliders and the selected benchmark radio from the DOM, and
 * mutates the shared xScale/yScale domains plus the SVG built at start-up.
 * Takes no arguments and returns nothing.
 */
function updateChart() {
// Get current settings
const inputOutputRatio = parseInt(document.getElementById('inputOutputRatio').value);
const cacheHitRate = parseInt(document.getElementById('cacheHitRate').value);
const thinkingOverhead = parseInt(document.getElementById('thinkingOverhead').value);
const benchmark = document.querySelector('input[name="benchmark"]:checked').value;
// Calculate costs for each model (per 1M total tokens)
// The slider value N is the output side of a 1:N input:output ratio, so the
// input share is 1/(1+N) and the output share is N/(1+N) of one million tokens.
const inputRatio = 1 / (1 + inputOutputRatio);
const outputRatio = inputOutputRatio / (1 + inputOutputRatio);
const inputTokens = 1000000 * inputRatio;
const outputTokens = 1000000 * outputRatio;
// Filter models based on benchmark and add cost
// scoreField names the model property that supplies the y value.
const scoreField = benchmark === 'general' ? 'generalElo' : benchmark === 'coding' ? 'codingElo' : 'sweScore';
// Each data point is a copy of the model record plus its computed cost and selected score.
const allDataPoints = models.map(model => ({
...model,
cost: calculateCost(model, inputTokens, outputTokens, cacheHitRate, thinkingOverhead),
score: model[scoreField]
}));
// Filter out models without scores for selected benchmark
const dataPoints = allDataPoints.filter(d => d.score !== null);
// Update Y-axis label based on benchmark
const yLabels = { general: 'Arena Elo Score (General Chat)', coding: 'Arena Elo Score (Coding)', swe: 'SWE-Bench Verified Score (%)' };
yAxisLabel.text(yLabels[benchmark]);
// Update scales
const minCost = d3.min(dataPoints, d => d.cost);
const maxCost = d3.max(dataPoints, d => d.cost);
const minScore = d3.min(dataPoints, d => d.score);
const maxScore = d3.max(dataPoints, d => d.score);
// Log scale needs positive values, add padding
// (20% either side of the cost range, with the lower bound floored at 0.01)
xScale.domain([Math.max(minCost * 0.8, 0.01), maxCost * 1.2]);
// Round the score range outward: to multiples of 10 for SWE-Bench percentages,
// to multiples of 50 for Elo scores.
if (benchmark === 'swe') {
yScale.domain([Math.floor(minScore / 10) * 10, Math.ceil(maxScore / 10) * 10]);
} else {
yScale.domain([Math.floor(minScore / 50) * 50, Math.ceil(maxScore / 50) * 50]);
}
// Update axes
// Both axes are redrawn over a 500ms transition and then recoloured for the dark theme.
xAxis.transition().duration(500)
.call(d3.axisBottom(xScale)
.ticks(10)
// Fewer decimals for larger prices: $N at or above 1, $0.N at or above 0.1, $0.NN below that
.tickFormat(d => {
if (d >= 1) return `$${d.toFixed(0)}`;
if (d >= 0.1) return `$${d.toFixed(1)}`;
return `$${d.toFixed(2)}`;
}))
.call(g => g.selectAll('.tick text').style('fill', '#ccc'))
.call(g => g.selectAll('.tick line').style('stroke', '#555'))
.call(g => g.select('.domain').style('stroke', '#555'));
yAxis.transition().duration(500)
.call(d3.axisLeft(yScale).ticks(10))
.call(g => g.selectAll('.tick text').style('fill', '#ccc'))
.call(g => g.selectAll('.tick line').style('stroke', '#555'))
.call(g => g.select('.domain').style('stroke', '#555'));
// Calculate Pareto frontier
const frontierPoints = calculateParetoFrontier(dataPoints);
// Update Pareto line
// The path is drawn through the frontier points in the order calculateParetoFrontier returns them.
const lineGenerator = d3.line()
.x(d => xScale(d.cost))
.y(d => yScale(d.score));
paretoLine.transition().duration(500)
.attr('d', lineGenerator(frontierPoints));
// Update circles
// Data join keyed by model name, so an existing circle is moved rather than recreated.
const circles = svg.selectAll('.model-circle')
.data(dataPoints, d => d.name);
// Drop circles for models that have no score on the newly selected benchmark.
circles.exit().remove();
// New circles get their static styling and hover handlers once, at creation.
const circlesEnter = circles.enter()
.append('circle')
.attr('class', 'model-circle')
.attr('r', 6)
.attr('opacity', 0.8)
.attr('stroke', '#fff')
.attr('stroke-width', 2)
// Hover: enlarge the point and show the tooltip next to the pointer.
.on('mouseover', function(event, d) {
d3.select(this)
.attr('r', 8)
.attr('opacity', 1);
// Re-read the benchmark at hover time: this handler is attached once but
// the selection can change afterwards.
const benchmark = document.querySelector('input[name="benchmark"]:checked').value;
const scoreLabel = benchmark === 'general' ? 'General Elo' : benchmark === 'coding' ? 'Coding Elo' : 'SWE-Bench';
tooltip
.style('opacity', 1)
.html(`
<strong>${d.name}</strong><br>
Provider: ${d.provider}<br>
${scoreLabel}: ${d.score}<br>
Cost: $${d.cost.toFixed(2)}/M tokens<br>
Input: $${d.inputPrice}/M &middot; Output: $${d.outputPrice}/M<br>
${d.cachePrice !== null ? `Cache: $${d.cachePrice}/M` : 'Cache: N/A'}
${d.thinkingMode ? ` &middot; Thinking: ${d.thinkingMode}` : ''}
`)
.style('left', (event.pageX + 10) + 'px')
.style('top', (event.pageY - 10) + 'px');
})
// Hover end: restore the resting size/opacity and hide the tooltip.
.on('mouseout', function() {
d3.select(this)
.attr('r', 6)
.attr('opacity', 0.8);
tooltip.style('opacity', 0);
});
// New and existing circles animate together to their current position and provider colour.
circles.merge(circlesEnter)
.transition()
.duration(500)
.attr('cx', d => xScale(d.cost))
.attr('cy', d => yScale(d.score))
.attr('fill', d => providerColors[d.provider]);
// Update labels for all models (make frontier models more prominent)
// Set of frontier model names for constant-time membership checks below.
const frontierSet = new Set(frontierPoints.map(d => d.name));
// Same name-keyed join as the circles.
const labels = svg.selectAll('.model-label')
.data(dataPoints, d => d.name);
labels.exit().remove();
// Labels ignore pointer events so they do not block hovering the circles.
const labelsEnter = labels.enter()
.append('text')
.attr('class', 'model-label')
.attr('font-size', '11px')
.style('pointer-events', 'none');
// Labels are offset right of and slightly below the point centre; frontier models are
// bold, brighter and fully opaque, all others are dimmed.
labels.merge(labelsEnter)
.transition()
.duration(500)
.attr('x', d => xScale(d.cost) + 10)
.attr('y', d => yScale(d.score) + 4)
.attr('font-weight', d => frontierSet.has(d.name) ? '600' : '400')
.attr('fill', d => frontierSet.has(d.name) ? '#e0e0e0' : '#888')
.attr('opacity', d => frontierSet.has(d.name) ? 1 : 0.6)
.text(d => d.name);
}
/**
 * Apply a named usage preset: set the ratio and cache-hit sliders, then
 * refresh the read-outs and the chart. The thinking-overhead slider is left
 * as it is.
 *
 * @param {string} preset - One of 'casual', 'coding', 'rag', 'docs', 'google'.
 *   An unknown name is reported with console.warn and otherwise ignored.
 */
function applyPreset(preset) {
  // ratio is the N of a 1:N input:output ratio; cache is a hit rate in percent.
  const presets = {
    casual: { ratio: 1, cache: 0 },
    coding: { ratio: 50, cache: 85 },
    rag: { ratio: 10, cache: 70 },
    // NOTE(review): the "Document Analysis" button is labelled 100:1, but the
    // slider holds the output side of a 1:N ratio in whole steps from 0, so
    // this applies 1:1. Confirm which value is intended.
    docs: { ratio: 1, cache: 95 },
    google: { ratio: 5, cache: 0 },
  };

  // Own-property check so names such as 'toString' are not mistaken for a
  // preset, and an unknown name no longer throws on `config.ratio`.
  if (!Object.prototype.hasOwnProperty.call(presets, preset)) {
    console.warn(`Unknown preset: ${preset}`);
    return;
  }

  const config = presets[preset];
  document.getElementById('inputOutputRatio').value = config.ratio;
  document.getElementById('cacheHitRate').value = config.cache;
  updateDisplays();
  updateChart();
}
/**
 * Copy the current slider values into the read-outs beneath them:
 * the ratio as "1:N", the cache hit rate and the thinking overhead as "N%".
 */
function updateDisplays() {
  // Slider values arrive as strings; parse them as base-10 integers explicitly.
  const readSlider = (id) => Number.parseInt(document.getElementById(id).value, 10);

  const ratio = readSlider('inputOutputRatio');
  const cache = readSlider('cacheHitRate');
  const thinking = readSlider('thinkingOverhead');

  document.getElementById('ratioDisplay').textContent = `1:${ratio}`;
  document.getElementById('cacheDisplay').textContent = `${cache}%`;
  document.getElementById('thinkingDisplay').textContent = `${thinking}%`;
}
/**
 * Fill the #legend container with one swatch + name entry per provider,
 * in the order providers first appear in `models`.
 */
function createLegend() {
  const container = document.getElementById('legend');

  // Create an element of the given tag carrying the given class name.
  const make = (tag, className) => {
    const el = document.createElement(tag);
    el.className = className;
    return el;
  };

  const seen = new Set();
  for (const { provider } of models) {
    // Several models share a provider; emit each provider only once.
    if (seen.has(provider)) {
      continue;
    }
    seen.add(provider);

    const entry = make('div', 'legend-item');

    const swatch = make('div', 'legend-circle');
    swatch.style.backgroundColor = providerColors[provider];

    const caption = document.createElement('span');
    caption.textContent = provider;

    entry.appendChild(swatch);
    entry.appendChild(caption);
    container.appendChild(entry);
  }
}
// Event listeners: moving any slider refreshes the read-outs, then the chart
for (const sliderId of ['inputOutputRatio', 'cacheHitRate', 'thinkingOverhead']) {
  document.getElementById(sliderId).addEventListener('input', () => {
    updateDisplays();
    updateChart();
  });
}

// Switching benchmark only redraws the chart; the slider read-outs are unaffected
for (const radio of document.querySelectorAll('input[name="benchmark"]')) {
  radio.addEventListener('change', updateChart);
}

// Initialize: build the legend once, then render from the controls' default values
createLegend();
updateDisplays();
updateChart();
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment