This dataset compiles benchmark results for the leading large language models (LLMs) released in late 2024 and 2025 by Anthropic, OpenAI, xAI, and Google. It includes intelligence scores, performance on key academic and coding tasks (MMLU-Pro/MMMLU, GPQA Diamond, SWE-Bench, MATH-500, AIME), context window sizes, and expert notes for each model. …
"Model","Provider","Release Date","Intelligence Index","MMLU-Pro/MMMLU (%)","GPQA Diamond (%)","SWE-Bench Verified/LCB (%)","MATH-500 (%)","AIME (%)","Context Window","Notes" | |
"Claude Opus 4","Anthropic","May 2025","72","87.4","74.9","72.5 (SWE)","-","-","200K","Most intelligent; excels in coding, long-running tasks, memory capabilities." | |
"Grok 3 (Think)","xAI","Feb 2025","70","79.9","84.6","79.4 (LCB)","-","93.3 (2025)","1M","Tops Chatbot Arena (Elo 1402), strong reasoning, AIME cons@64 controversy." | |
"o4-mini (high)","OpenAI","Apr 2025","70","83","78","-","99","94 (2024)","128K","Top in math (MATH-500: 99%), visual reasoning (MMMU: 82.9%), cost-efficient." | |
"Gemini 2.5 Pro","Google","Mar 2025","69","84.1","83.0","63.8 (SWE)","-","83.0 (2025)","1M","Advanced reasoning, leads WebDevidemia, LMArena, multimodal support." | |
"Claude Sonnet 4","Anthropic","May 2025","68","80.2","79.6","72.7 (SWE)","92","33.1 (2024)","200K","Cost-efficient, strong coding (SWE-Bench: 72.7%), high-volume tasks." | |
"o3","OpenAI","Apr 2025","67","-","-","69.1 (SWE)","-","-","128K","Strong coding, ARC-AGI-Pub (87.5% high-compute), hallucination concerns (33% PersonQA)." | |
"Claude 3.7 Sonnet","Anthropic","Feb 2025","60","-","84.8","70.3 (SWE)","-","-","128K (256K beta)","Hybrid reasoning, strong in Pokémon Red, coding-focused." | |
"Grok 3","xAI","Feb 2025","58","79.9","75.4","79.4 (LCB)","-","87 (2025)","1M","Non-reasoning, excels in real-world knowledge, lags in SimpleQA (43.6%)." | |
"Claude 3.5 Sonnet","Anthropic","Oct 2024","55","90.4","59.4","49.0 (SWE)","-","-","200K","Improved coding, computer use feature, outperforms Claude 3 Opus." | |
"GPT-4.1","OpenAI","Apr 2025","53","90.2","67","-","91","44 (2024)","1M","Balanced, strong in MMLU and math, large context window." | |
"GPT-4.1 mini","OpenAI","Apr 2025","53","-","-","-","-","-","1M","Matches GPT-4o in many evals, low latency, cost-efficient." | |
"Gemini 2.5 Flash","Google","Apr 2025","53","80.9","83.0","63.2 (SWE)","-","83.0 (2025)","1M","Balances price/performance, well-rounded, multimodal." | |
"Claude 3.5 Haiku","Anthropic","Oct 2024","50","86.6","-","-","-","-","200K","Cost-efficient, text-only initially, excels in user-facing apps." | |
"GPT-4o (Mar '25)","OpenAI","Mar 2025","50","-","-","-","-","-","128K","Competitive, lower intelligence index, multimodal." | |
"Grok 3 mini (Think)","xAI","Feb 2025","45","-","-","-","-","93 (2025)","1M","Reasoning model, high AIME (unverified), cost-efficient, lags in SimpleQA (21.7%)." | |
"Grok 3 mini","xAI","Feb 2025","42","-","-","-","-","87 (2025)","1M","Non-reasoning, cost-efficient, AIME score unverified, lower performance." | |
"GPT-4.1 nano","OpenAI","Apr 2025","41","80.1 (MMLU, unverified)","50.3 (GPQA, unverified)","-","-","-","1M","Fastest (0.33s latency), cost-efficient, MMLU/GPQA scores likely not Pro/Diamond." | |
"Grok 2","xAI","Aug 2024","40","70.3 (unverified)","-","-","-","-","128K","Enhanced reasoning over Grok 1.5, to be open-sourced, MMLU-Pro score unverified." | |
"Gemma 3","Google","Mar 2025","-","-","-","-","-","-","128K","Lightweight open model, runs on single TPU/GPU, Elo 1338 on LMArena, context window verified." |