Little CLI to estimate the compute budget for an LLM research project.
"""
Grant Budget & GPU Resource Estimator
A script for estimating compute requirements and costs for LLM training:
1. Compute budget estimation (FLOP-based calculations):
- GPU-hours and costs for fine-tuning runs
- Hyperparameter search
- Ablations
2. Memory requirement estimation using EleutherAI's formula (https://blog.eleuther.ai/transformer-math/):
Total Memory = Model Memory + Optimizer Memory + Gradient Memory + Activation Memory
3. Synthetic data generation cost estimation:
- Based on input/output token counts and provider pricing tiers
- Cost = (tokens/1e6) × (input_fraction × input_price + output_fraction × output_price)
- Prices in USD per million tokens, varying by provider/model
- synth_tokens: Total number of synthetic tokens to generate
- synth_tier: Provider/model tier for generation
- synth_input_fraction: Fraction of tokens that are inputs
- synth_output_fraction: Fraction of tokens that are outputs
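
   For example, at the default "DeepSeek Reasoner" prices ($0.55 input /
   $2.19 output per million tokens) with the default 10%/90% input/output
   split, the blended price is 0.1 × 0.55 + 0.9 × 2.19 = $2.026 per million
   tokens, so the default 4B synthetic tokens cost about 4,000 × 2.026 ≈ $8,100.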

Supports dense transformer models and Mixture-of-Experts (MoE) models from Hugging Face.

Default assumptions (configurable via JSON or CLI):
- Model parameters: FP16 (2 bytes/param)
- Optimizer state: 8 bytes/param (AdamW: momentum & variance in FP32)
- Gradient storage: same as model memory
- Activation memory: batch_size × seq_length × hidden_size × num_layers × activation_bytes
- FLOPs per parameter-token: 6
- Fine-tuning compute: 1/fine_tuning_scaling_factor of full training compute
- Synthetic data: 4B tokens total (10% input, 90% output) using the DeepSeek Reasoner tier
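
For example, under these defaults a 7B-parameter model trained on 54B tokens
would need 6 × 7e9 × 54e9 ≈ 2.27e21 FLOPs for full training; with the built-in
fine_tuning_scaling_factor of 100, one fine-tuning run is ≈ 2.27e19 FLOPs,
i.e. about 6.3 hours on two GPUs sustaining 5e14 FLOP/s each (≈ 12.6 GPU-hours).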

Usage:
    Single model:
        python grant_budget_estimator.py --model_name <HuggingFace model name>
    Multiple models:
        python grant_budget_estimator.py --config <config.json>
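    CLI overrides can be combined with either mode and take precedence over
    values in the JSON config:
        python grant_budget_estimator.py --config config.json --gpu_cost_usd 2.50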

Example config.json:
    {
        "models": [
            {"model_name": "deepseek-ai/DeepSeek-R1", "weight": 1},
            {"model_name": "mistralai/Mistral-Small-24B-Instruct-2501", "weight": 2},
            {"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "weight": 2}
        ],
        "total_tokens": 54e9,
        "flops_per_param_token": 6,
        "fine_tuning_scaling_factor": 1000,
        "gpu_vram_gb": 80,
        "fine_tuning_precision_bytes": 2,
        "batch_size": 16,
        "seq_length": 4096,
        "activation_bytes": 2,
        "ft_runs": 100,
        "hp_configs": 200,
        "hp_hours_per_config": 50,
        "abl_experiments": 50,
        "abl_hours_per_experiment": 200,
        "synth_tokens": 4e9,
        "synth_tier": "DeepSeek Reasoner",
        "synth_input_fraction": 0.1,
        "synth_output_fraction": 0.9,
        "gpu_cost_usd": 3.00,
        "usd_to_gbp": 0.81,
        "gpu_flops": 5.0e14,
        "num_ft_gpus": null
    }
"""
import argparse
import math
import os
import json
from transformers import AutoConfig
# synthetic data generation prices (USD per million tokens)
PRESET_SYNTH_PRICES = {
    "Gemini 1.5 Flash ≤128k": {"input": 0.07, "output": 0.3},
    "Gemini 1.5 Flash >128k": {"input": 0.15, "output": 0.6},
    "Gemini 1.5 Flash-8B ≤128k": {"input": 0.04, "output": 0.15},
    "Gemini 1.5 Flash-8B >128k": {"input": 0.07, "output": 0.3},
    "Gemini 1.5 Pro ≤128k": {"input": 1.25, "output": 5},
    "Gemini 1.5 Pro >128k": {"input": 2.5, "output": 10},
    "Claude 3.5 Sonnet": {"input": 3, "output": 15},
    "Claude 3 Opus": {"input": 15, "output": 75},
    "Claude 3 Haiku": {"input": 0.25, "output": 1.25},
    "Claude 3.5 Haiku": {"input": 0.8, "output": 4},
    "GPT-4o": {"input": 2.5, "output": 10},
    "GPT-4o Mini": {"input": 0.15, "output": 0.6},
    "o1 and o1-preview": {"input": 15, "output": 60},
    "o1-mini": {"input": 1.1, "output": 4.4},
    "o3-mini": {"input": 1.1, "output": 4.4},
    "Amazon Nova Micro": {"input": 0.04, "output": 0.14},
    "Amazon Nova Lite": {"input": 0.06, "output": 0.24},
    "Amazon Nova Pro": {"input": 0.8, "output": 3.2},
    "DeepSeek Chat": {"input": 0.14, "output": 0.28},
    "DeepSeek Reasoner": {"input": 0.55, "output": 2.19},
}


def load_json_config(filepath):
    """Load and parse a JSON configuration file."""
    if os.path.exists(filepath):
        with open(filepath, "r") as f:
            return json.load(f)
    else:
        raise FileNotFoundError(f"Configuration file '{filepath}' not found. 🚫")


def load_model_config(model_name):
    """Fetch model configuration from Hugging Face."""
    return AutoConfig.from_pretrained(model_name)


def estimate_activated_moe_params_general(config: dict):
    """
    Estimate the per-token activated parameter count and the full parameter count
    for a MoE-based HF model like DeepSeek-R1.

    Returns a 2-tuple: (activated_params, full_params)
    - 'activated_params' is computed with a fraction (experts_per_token / n_routed_experts)
      of the MoE feed-forward cost.
    - 'full_params' assumes that all experts are loaded in memory (i.e. the full cost).
    """
    hidden_size = config.get("hidden_size", 7168)
    vocab_size = config.get("vocab_size", 129280)
    n_layers = config.get("num_hidden_layers", 61)
    first_k_dense = config.get("first_k_dense_replace", 3)
    int_size_dense = config.get("intermediate_size", 4 * hidden_size)
    int_size_moe = config.get("moe_intermediate_size", 2048)
    n_routed_experts = config.get("n_routed_experts", 256)
    n_shared_experts = config.get("n_shared_experts", 1)
    experts_per_token = config.get("num_experts_per_tok", 8)
    moe_layer_freq = config.get("moe_layer_freq", 1)
    q_lora_rank = config.get("q_lora_rank", 1536)
    kv_lora_rank = config.get("kv_lora_rank", 512)
    num_nextn_predict_layers = config.get("num_nextn_predict_layers", 1)
    tie_embeddings = config.get("tie_word_embeddings", False)

    # embedding parameters
    if tie_embeddings:
        emb_params = hidden_size * vocab_size
    else:
        emb_params = 2.0 * hidden_size * vocab_size

    # attention cost
    attn_base = 4.0 * (hidden_size**2)
    mla_overhead = 30e6 * ((hidden_size / 7168.0) ** 2)
    attn_cost = attn_base + mla_overhead

    def lora_cost(h, rank):
        return 2.0 * h * rank

    lora_overhead = lora_cost(hidden_size, q_lora_rank) + 2 * lora_cost(
        hidden_size, kv_lora_rank
    )
    total_attn_cost = attn_cost + lora_overhead

    # MLP cost
    dense_ffn_cost = 2.0 * hidden_size * int_size_dense
    moe_expert_cost = 2.0 * hidden_size * int_size_moe

    # for the activated parameter count, only a fraction of experts is active:
    fraction_active = (
        experts_per_token / float(n_routed_experts) if n_routed_experts > 0 else 0.0
    )
    activated_expert_cost = (
        moe_expert_cost * n_routed_experts * fraction_active
        + moe_expert_cost * n_shared_experts
    )
    # for the full parameter count, all experts are loaded:
    full_expert_cost = (
        moe_expert_cost * n_routed_experts + moe_expert_cost * n_shared_experts
    )
    # gating cost for MoE layers
    gating_cost = hidden_size * n_routed_experts if n_routed_experts > 0 else 0.0

    total_layer_cost_activated = 0.0
    total_layer_cost_full = 0.0
    for layer_idx in range(n_layers):
        layer_cost_attn = total_attn_cost
        if layer_idx < first_k_dense:
            layer_cost_dense = dense_ffn_cost
            layer_cost_moe_activated = 0.0
            layer_cost_moe_full = 0.0
        else:
            # if this layer is designated as an MoE layer:
            if n_routed_experts > 0 and (
                (layer_idx - first_k_dense) % moe_layer_freq == 0
            ):
                layer_cost_moe_activated = activated_expert_cost + gating_cost
                layer_cost_moe_full = full_expert_cost + gating_cost
                layer_cost_dense = 0.0
            else:
                layer_cost_dense = dense_ffn_cost
                layer_cost_moe_activated = 0.0
                layer_cost_moe_full = 0.0
        total_layer_cost_activated += (
            layer_cost_attn + layer_cost_dense + layer_cost_moe_activated
        )
        total_layer_cost_full += (
            layer_cost_attn + layer_cost_dense + layer_cost_moe_full
        )

    # multi-token prediction overhead (applies equally to both counts)
    mtp_overhead = 300e6 * ((hidden_size / 7168.0) ** 2) * num_nextn_predict_layers
    # fudge factor for extra parameters (norms, biases, etc.)
    fudge = 2.0e9

    total_activated = emb_params + total_layer_cost_activated + mtp_overhead + fudge
    total_full = emb_params + total_layer_cost_full + mtp_overhead + fudge
    return (total_activated, total_full)
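

# Example (hypothetical standalone use): rough parameter counts for an MoE checkpoint.
#     cfg = AutoConfig.from_pretrained("deepseek-ai/DeepSeek-R1").to_dict()
#     activated, full = estimate_activated_moe_params_general(cfg)
#     print(f"~{activated / 1e9:.0f}B activated / ~{full / 1e9:.0f}B total")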


def compute_training_memory(
    est_params,
    defaults,
    hidden_size,
    n_layers,
    batch_size,
    seq_length,
    activation_bytes,
):
    """
    Compute total training memory (in GB) using Eleuther's formula:
        Total Memory = Model Memory + Optimiser Memory + Gradient Memory + Activation Memory
    Assumes:
    - Model Memory = N * param_bytes (FP16: 2 bytes)
    - Gradient Memory = N * param_bytes
    - Optimiser Memory = N * 8 (AdamW: two FP32 copies)
    - Activation Memory = batch_size * seq_length * hidden_size * n_layers * activation_bytes
    """
    param_bytes = defaults["fine_tuning_precision_bytes"]
    model_memory = est_params * param_bytes
    gradient_memory = est_params * param_bytes
    optimiser_memory = est_params * 8
    activation_memory = (
        batch_size * seq_length * hidden_size * n_layers * activation_bytes
    )
    total_bytes = model_memory + gradient_memory + optimiser_memory + activation_memory
    total_gb = total_bytes / (1024**3)
    return total_gb
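

# Rough sanity check (assuming a 7B dense model with hidden_size 4096 and
# 32 layers, plus the defaults above): model 14 GB + gradients 14 GB +
# optimiser 56 GB, and activations 16 * 4096 * 4096 * 32 * 2 bytes = 16 GiB,
# for a total of roughly 94 GB, i.e. two 80 GB GPUs.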


def compute_model_estimates(model_name, defaults, cli_overrides):
    """
    Compute estimates for a single model:
    - Effective parameter count (using the MoE-aware function if applicable)
    - Detailed training memory (using Eleuther's formula)
    - FLOP-based compute estimates
    """
    config = load_model_config(model_name)
    config_dict = config.to_dict()
    if "n_routed_experts" in config_dict:
        # NB: all downstream estimates use full_params, since every expert must
        # be resident in memory; activated_params is the per-token count.
        activated_params, full_params = estimate_activated_moe_params_general(
            config_dict
        )
    else:
        if hasattr(config, "num_parameters"):
            full_params = config.num_parameters()
        else:
            import re

            # fall back to a size hint in the model name (e.g. "...-24B-...")
            match = re.search(r"(\d+)[bB]", config.name_or_path)
            if match:
                full_params = float(match.group(1)) * 1e9
            else:
                full_params = 7e9  # default to 7B
    # override if provided via config/CLI
    if cli_overrides.get("model_parameters") is not None:
        full_params = cli_overrides["model_parameters"]

    # memory estimation using EleutherAI's formula
    hidden_size = getattr(config, "hidden_size", 7168)
    n_layers = getattr(config, "num_hidden_layers", 61)
    batch_size = defaults.get("batch_size", 16)
    seq_length = defaults.get("seq_length", 4096)
    activation_bytes = defaults.get("activation_bytes", 2)  # FP16 activations
    training_memory_gb = compute_training_memory(
        full_params,
        defaults,
        hidden_size,
        n_layers,
        batch_size,
        seq_length,
        activation_bytes,
    )
    required_gpus = math.ceil(training_memory_gb / defaults["gpu_vram_gb"])

    # FLOP-based compute estimation
    full_training_flops = (
        defaults["flops_per_param_token"] * full_params * defaults["total_tokens"]
    )
    ft_flops = full_training_flops / defaults["fine_tuning_scaling_factor"]
    num_ft_gpus = (
        defaults["num_ft_gpus"]
        if defaults["num_ft_gpus"] is not None
        else required_gpus
    )
    cluster_throughput = num_ft_gpus * defaults["gpu_flops"]
    ft_time_sec = ft_flops / cluster_throughput
    ft_time_hr = ft_time_sec / 3600
    # note: the GPU count cancels out here, so GPU-hours per run is simply
    # ft_flops / gpu_flops / 3600; adding GPUs only shortens wall-clock time
    gpu_hours_per_ft_run = ft_time_hr * num_ft_gpus
    base_ft_total_gpu_hours = defaults["ft_runs"] * gpu_hours_per_ft_run
    return {
        "model_name": model_name,
        "num_params": full_params,
        "hidden_size": hidden_size,
        "n_layers": n_layers,
        "training_memory_gb": training_memory_gb,
        "required_gpus": required_gpus,
        "num_ft_gpus": num_ft_gpus,
        "ft_time_hr": ft_time_hr,
        "gpu_hours_per_ft_run": gpu_hours_per_ft_run,
        "base_ft_total_gpu_hours": base_ft_total_gpu_hours,
    }


def main():
    parser = argparse.ArgumentParser(
        description="Estimate compute budget and GPU requirements for fine-tuning LLMs.",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Hugging Face model name (for single-model mode).",
    )
    parser.add_argument(
        "--config", type=str, help="Optional JSON config file (for multi-model mode)."
    )
    # optional CLI overrides
    parser.add_argument(
        "--ft_runs", type=int, help="Override: number of fine-tuning runs."
    )
    parser.add_argument(
        "--hp_configs",
        type=int,
        help="Override: number of hyperparameter configurations.",
    )
    parser.add_argument(
        "--abl_experiments", type=int, help="Override: number of ablation experiments."
    )
    parser.add_argument(
        "--gpu_cost_usd", type=float, help="Override: cost per GPU-hour in USD."
    )
    args = parser.parse_args()

    defaults = {
        "total_tokens": 54e9,  # cf. FineMath
        "flops_per_param_token": 6,
        "fine_tuning_scaling_factor": 100,
        "gpu_vram_gb": 80,
        "fine_tuning_precision_bytes": 2,
        "batch_size": 16,
        "seq_length": 4096,
        "activation_bytes": 2,
        "ft_runs": 100,
        "hp_configs": 200,
        "hp_hours_per_config": 50,
        "abl_experiments": 50,
        "abl_hours_per_experiment": 200,
        "gpu_cost_usd": 3.00,
        "usd_to_gbp": 0.81,
        "gpu_flops": 5.0e14,
        "num_ft_gpus": None,
        "synth_tokens": 4e9,
        "synth_tier": "DeepSeek Reasoner",
        "synth_input_fraction": 0.1,
        "synth_output_fraction": 0.9,
    }

    # merge order: built-in defaults < JSON config < CLI overrides
    config_data = None
    if args.config:
        config_data = load_json_config(args.config)
        # merge global defaults from JSON if present
        for key in defaults.keys():
            if key in config_data:
                defaults[key] = config_data[key]
    # apply CLI overrides last so they take precedence over the JSON config
    if args.ft_runs is not None:
        defaults["ft_runs"] = args.ft_runs
    if args.hp_configs is not None:
        defaults["hp_configs"] = args.hp_configs
    if args.abl_experiments is not None:
        defaults["abl_experiments"] = args.abl_experiments
    if args.gpu_cost_usd is not None:
        defaults["gpu_cost_usd"] = args.gpu_cost_usd

    # per-model parameter-count override (set from each model entry below)
    cli_overrides = {"model_parameters": None}
    models_info = []
    if config_data is not None:
        if "models" not in config_data:
            raise ValueError(
                "JSON config must include a 'models' key with a list of models (with weights)."
            )
        # normalize the weights
        total_weight = sum(model.get("weight", 1) for model in config_data["models"])
        if total_weight <= 0:
            raise ValueError("Sum of 'weight' fields must be > 0 for normalization.")
        for model_entry in config_data["models"]:
            raw_weight = model_entry.get("weight", 1)
            model_entry["weight"] = raw_weight / total_weight
        for model_entry in config_data["models"]:
            model_name = model_entry.get("model_name")
            if not model_name:
                continue
            weight = model_entry.get("weight", 1)
            # reset per model so one entry's override does not leak into the next
            cli_overrides["model_parameters"] = model_entry.get("model_parameters")
            est = compute_model_estimates(model_name, defaults, cli_overrides)
            est["weight"] = weight
            models_info.append(est)
    elif args.model_name:
        est = compute_model_estimates(args.model_name, defaults, cli_overrides)
        est["weight"] = 1
        models_info.append(est)
    else:
        raise ValueError(
            "You must supply either --model_name (single-model mode) or --config (multi-model mode)."
        )

    print("\n📊 Model Estimates:")
    overall_weighted_ft_gpu_hours = 0
    max_gpus_required = 0
    for info in models_info:
        print("--------------------------------------------")
        print(f"Model: {info['model_name']}")
        print(f" - Estimated parameters: {info['num_params']:,.0f}")
        print(f" - Hidden size: {info['hidden_size']}")
        print(f" - Number of layers: {info['n_layers']}")
        print(
            f" - Training memory estimate: {info['training_memory_gb']:.2f} GB (Eleuther's formula)"
        )
        print(f" - Minimum GPUs required (memory): {info['required_gpus']}")
        print(f" - GPUs used for compute: {info['num_ft_gpus']}")
        print(f" - Time per FT run: {info['ft_time_hr']:.2f} hours")
        print(f" - GPU-hours per FT run: {info['gpu_hours_per_ft_run']:.0f} GPU-hours")
        print(
            f" - Base FT total GPU-hours ({defaults['ft_runs']} runs): {info['base_ft_total_gpu_hours']:.0f} GPU-hours"
        )
        print(
            f" - Weight (usage frequency): {info['weight']:.3f}"
        )  # a normalized fraction
        overall_weighted_ft_gpu_hours += (
            info["weight"] * info["base_ft_total_gpu_hours"]
        )
        max_gpus_required = max(max_gpus_required, info["required_gpus"])

    total_weight = sum(info["weight"] for info in models_info)
    additional_gpu_hours = (
        defaults["hp_configs"] * defaults["hp_hours_per_config"]
        + defaults["abl_experiments"] * defaults["abl_hours_per_experiment"]
    )

    # synthetic data generation cost
    synth_tier = defaults.get("synth_tier", "DeepSeek Reasoner")
    if synth_tier not in PRESET_SYNTH_PRICES:
        raise ValueError(
            f"Synthetic tier '{synth_tier}' not recognized. Available tiers: {list(PRESET_SYNTH_PRICES.keys())}"
        )
    prices = PRESET_SYNTH_PRICES[synth_tier]
    input_frac = defaults.get("synth_input_fraction", 0.1)
    output_frac = defaults.get("synth_output_fraction", 0.9)
    cost_per_million = input_frac * prices["input"] + output_frac * prices["output"]
    synth_tokens = defaults.get("synth_tokens", 1e9)  # total number of tokens
    synth_cost = (
        synth_tokens / 1e6
    ) * cost_per_million  # convert to millions for pricing

    overall_total_gpu_hours = (
        overall_weighted_ft_gpu_hours + additional_gpu_hours * total_weight
    )
    gpu_compute_cost_usd = overall_total_gpu_hours * defaults["gpu_cost_usd"]
    total_cost_usd = gpu_compute_cost_usd + synth_cost
    total_cost_gbp = total_cost_usd * defaults["usd_to_gbp"]

    print("\n🧮 Overall Compute Estimation:")
    print(
        f" Weighted fine-tuning GPU-hours (base): {overall_weighted_ft_gpu_hours:.0f} GPU-hours"
    )
    print(
        f" Additional experiments (HP + Ablation) GPU-hours (weighted): {additional_gpu_hours * total_weight:.0f} GPU-hours"
    )
    print(f" Total GPU-hours: {overall_total_gpu_hours:.0f} GPU-hours")
    print(f" GPU Compute Cost: ${gpu_compute_cost_usd:,.2f} USD")
    print(f" Synthetic Data Generation Cost: ${synth_cost:,.2f} USD")
    print(f" Total Cost: ${total_cost_usd:,.2f} USD / £{total_cost_gbp:,.2f} GBP")
    print(f"\n🖥️ Minimum GPUs required (by memory): {max_gpus_required} GPUs\n")


if __name__ == "__main__":
    main()