Little CLI to estimate the compute budget for an LLM research project.
"""
Grant Budget & GPU Resource Estimator

A script for estimating compute requirements and costs for LLM training:

1. Compute budget estimation (FLOP-based calculations):
   - GPU-hours and costs for fine-tuning runs
   - Hyperparameter search
   - Ablations

2. Memory requirement estimation using EleutherAI's formula
   (https://blog.eleuther.ai/transformer-math/):
   Total Memory = Model Memory + Optimizer Memory + Gradient Memory + Activation Memory

3. Synthetic data generation cost estimation:
   - Based on input/output token counts and provider pricing tiers
   - Cost = (tokens / 1e6) × (input_fraction × input_price + output_fraction × output_price)
   - Prices in USD per million tokens, varying by provider/model
   - synth_tokens: Total number of synthetic tokens to generate
   - synth_tier: Provider/model tier for generation
   - synth_input_fraction: Fraction of tokens that are inputs
   - synth_output_fraction: Fraction of tokens that are outputs

Supports dense transformer models and Mixture-of-Experts (MoE) models from Hugging Face.

Default assumptions (configurable via JSON or CLI; see the back-of-envelope example below):
- Model parameters: FP16 (2 bytes/param)
- Optimizer state: 8 bytes/param (AdamW - momentum & variance in FP32)
- Gradient storage: Same as model memory
- Activation memory: batch_size × seq_length × hidden_size × num_layers × activation_bytes
- FLOPs per parameter-token: 6
- Fine-tuning compute ratio: 1/fine_tuning_scaling_factor of full training
- Synthetic data: 4B tokens total (10% input, 90% output) using the DeepSeek Reasoner tier
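Back-of-envelope example (an illustrative sketch under the in-script defaults, assuming
a 7B dense model fine-tuned on two 80 GB GPUs and fine_tuning_scaling_factor = 100):
one fine-tuning run needs roughly 6 FLOPs/param-token × 7e9 params × 54e9 tokens / 100
≈ 2.3e19 FLOPs, which at 2 GPUs × 5e14 FLOP/s ≈ 1e15 FLOP/s takes about 6.3 hours,
i.e. ~13 GPU-hours per run.
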
Usage:
    Single model:
        python grant_budget_estimator.py --model_name <HuggingFace model name>
    Multiple models:
        python grant_budget_estimator.py --config <config.json>

Example config.json:
{
    "models": [
        {"model_name": "deepseek-ai/DeepSeek-R1", "weight": 1},
        {"model_name": "mistralai/Mistral-Small-24B-Instruct-2501", "weight": 2},
        {"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "weight": 2}
    ],
    "total_tokens": 54e9,
    "flops_per_param_token": 6,
    "fine_tuning_scaling_factor": 1000,
    "gpu_vram_gb": 80,
    "fine_tuning_precision_bytes": 2,
    "batch_size": 16,
    "seq_length": 4096,
    "activation_bytes": 2,
    "ft_runs": 100,
    "hp_configs": 200,
    "hp_hours_per_config": 50,
    "abl_experiments": 50,
    "abl_hours_per_experiment": 200,
    "synth_tokens": 4e9,
    "synth_tier": "DeepSeek Reasoner",
    "synth_input_fraction": 0.1,
    "synth_output_fraction": 0.9,
    "gpu_cost_usd": 3.00,
    "usd_to_gbp": 0.81,
    "gpu_flops": 5.0e14,
    "num_ft_gpus": null
}
"""
import argparse
import json
import math
import os
import re

from transformers import AutoConfig

# synthetic data generation prices (USD per million tokens)
PRESET_SYNTH_PRICES = {
    "Gemini 1.5 Flash ≤128k": {"input": 0.07, "output": 0.3},
    "Gemini 1.5 Flash >128k": {"input": 0.15, "output": 0.6},
    "Gemini 1.5 Flash-8B ≤128k": {"input": 0.04, "output": 0.15},
    "Gemini 1.5 Flash-8B >128k": {"input": 0.07, "output": 0.3},
    "Gemini 1.5 Pro ≤128k": {"input": 1.25, "output": 5},
    "Gemini 1.5 Pro >128k": {"input": 2.5, "output": 10},
    "Claude 3.5 Sonnet": {"input": 3, "output": 15},
    "Claude 3 Opus": {"input": 15, "output": 75},
    "Claude 3 Haiku": {"input": 0.25, "output": 1.25},
    "Claude 3.5 Haiku": {"input": 0.8, "output": 4},
    "GPT-4o": {"input": 2.5, "output": 10},
    "GPT-4o Mini": {"input": 0.15, "output": 0.6},
    "o1 and o1-preview": {"input": 15, "output": 60},
    "o1-mini": {"input": 1.1, "output": 4.4},
    "o3-mini": {"input": 1.1, "output": 4.4},
    "Amazon Nova Micro": {"input": 0.04, "output": 0.14},
    "Amazon Nova Lite": {"input": 0.06, "output": 0.24},
    "Amazon Nova Pro": {"input": 0.8, "output": 3.2},
    "DeepSeek Chat": {"input": 0.14, "output": 0.28},
    "DeepSeek Reasoner": {"input": 0.55, "output": 2.19},
}
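
# Worked example (a sketch using the script's own defaults, not provider-verified
# pricing): with the "DeepSeek Reasoner" tier and a 10%/90% input/output split,
#   cost_per_million = 0.1 * 0.55 + 0.9 * 2.19 = 2.026 USD per million tokens,
# so the default 4e9 synthetic tokens come to roughly 4,000 * 2.026 ≈ $8,100.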
def load_json_config(filepath):
    """Load and parse a JSON configuration file."""
    if os.path.exists(filepath):
        with open(filepath, "r") as f:
            return json.load(f)
    else:
        raise FileNotFoundError(f"Configuration file '{filepath}' not found. 🚫")


def load_model_config(model_name):
    """Fetch model configuration from Hugging Face."""
    return AutoConfig.from_pretrained(model_name)
def estimate_activated_moe_params_general(config: dict):
    """
    Estimate the per-token activated parameter count and the full parameter count
    for a MoE-based HF model like DeepSeek-R1.
    Returns a 2-tuple: (activated_params, full_params)
    - 'activated_params' is computed with a fraction (experts_per_token/n_routed_experts)
      of the MoE feed-forward cost.
    - 'full_params' assumes that all experts are loaded in memory (i.e. the full cost).
    """
    hidden_size = config.get("hidden_size", 7168)
    vocab_size = config.get("vocab_size", 129280)
    n_layers = config.get("num_hidden_layers", 61)
    first_k_dense = config.get("first_k_dense_replace", 3)
    int_size_dense = config.get("intermediate_size", 4 * hidden_size)
    int_size_moe = config.get("moe_intermediate_size", 2048)
    n_routed_experts = config.get("n_routed_experts", 256)
    n_shared_experts = config.get("n_shared_experts", 1)
    experts_per_token = config.get("num_experts_per_tok", 8)
    moe_layer_freq = config.get("moe_layer_freq", 1)
    q_lora_rank = config.get("q_lora_rank", 1536)
    kv_lora_rank = config.get("kv_lora_rank", 512)
    num_nextn_predict_layers = config.get("num_nextn_predict_layers", 1)
    tie_embeddings = config.get("tie_word_embeddings", False)

    # embedding parameters
    if tie_embeddings:
        emb_params = hidden_size * vocab_size
    else:
        emb_params = 2.0 * hidden_size * vocab_size

    # attention cost
    attn_base = 4.0 * (hidden_size**2)
    mla_overhead = 30e6 * ((hidden_size / 7168.0) ** 2)
    attn_cost = attn_base + mla_overhead

    def lora_cost(h, rank):
        return 2.0 * h * rank

    lora_overhead = lora_cost(hidden_size, q_lora_rank) + 2 * lora_cost(
        hidden_size, kv_lora_rank
    )
    total_attn_cost = attn_cost + lora_overhead

    # MLP cost
    dense_ffn_cost = 2.0 * hidden_size * int_size_dense
    moe_expert_cost = 2.0 * hidden_size * int_size_moe
    # for activated parameter count, only a fraction of experts is active:
    fraction_active = (
        experts_per_token / float(n_routed_experts) if n_routed_experts > 0 else 0.0
    )
    activated_expert_cost = (
        moe_expert_cost * n_routed_experts * fraction_active
        + moe_expert_cost * n_shared_experts
    )
    # for full parameter count, all experts are loaded:
    full_expert_cost = (
        moe_expert_cost * n_routed_experts + moe_expert_cost * n_shared_experts
    )
    # gating cost for MoE layers
    gating_cost = hidden_size * n_routed_experts if n_routed_experts > 0 else 0.0

    total_layer_cost_activated = 0.0
    total_layer_cost_full = 0.0
    for layer_idx in range(n_layers):
        layer_cost_attn = total_attn_cost
        if layer_idx < first_k_dense:
            layer_cost_dense = dense_ffn_cost
            layer_cost_moe_activated = 0.0
            layer_cost_moe_full = 0.0
        else:
            # if this layer is designated as an MoE layer:
            if n_routed_experts > 0 and (
                (layer_idx - first_k_dense) % moe_layer_freq == 0
            ):
                layer_cost_moe_activated = activated_expert_cost + gating_cost
                layer_cost_moe_full = full_expert_cost + gating_cost
                layer_cost_dense = 0.0
            else:
                layer_cost_dense = dense_ffn_cost
                layer_cost_moe_activated = 0.0
                layer_cost_moe_full = 0.0
        total_layer_cost_activated += (
            layer_cost_attn + layer_cost_dense + layer_cost_moe_activated
        )
        total_layer_cost_full += (
            layer_cost_attn + layer_cost_dense + layer_cost_moe_full
        )

    # multi-Token Prediction overhead (applies equally to both)
    mtp_overhead = 300e6 * ((hidden_size / 7168.0) ** 2) * num_nextn_predict_layers
    # fudge factor for extra parameters (norms, biases, etc.)
    fudge = 2.0e9
    total_activated = emb_params + total_layer_cost_activated + mtp_overhead + fudge
    total_full = emb_params + total_layer_cost_full + mtp_overhead + fudge
    return (total_activated, total_full)
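
# Worked example (a sketch using the DeepSeek-R1-style defaults hard-coded above; an
# actual model config may differ): moe_expert_cost = 2 * 7168 * 2048 ≈ 29.4M params per
# expert, so each MoE layer contributes (8 routed + 1 shared) * 29.4M ≈ 0.26B activated
# params but 257 * 29.4M ≈ 7.5B full params (ignoring the small gating term) — the gap
# captured by the two return values.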
def compute_training_memory(
    est_params,
    defaults,
    hidden_size,
    n_layers,
    batch_size,
    seq_length,
    activation_bytes,
):
    """
    Compute total training memory (in GB) using Eleuther's formula:
        Total Memory = Model Memory + Optimiser Memory + Gradient Memory + Activation Memory
    Assumes:
      - Model Memory = N * param_bytes (FP16: 2 bytes)
      - Gradient Memory = N * param_bytes
      - Optimiser Memory = N * 8 (AdamW: two FP32 copies)
      - Activation Memory = batch_size * seq_length * hidden_size * n_layers * activation_bytes
    """
    param_bytes = defaults["fine_tuning_precision_bytes"]
    model_memory = est_params * param_bytes
    gradient_memory = est_params * param_bytes
    optimiser_memory = est_params * 8
    activation_memory = (
        batch_size * seq_length * hidden_size * n_layers * activation_bytes
    )
    total_bytes = model_memory + gradient_memory + optimiser_memory + activation_memory
    total_gb = total_bytes / (1024**3)
    return total_gb
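
# Worked example (a sketch assuming a roughly Llama-2-7B-shaped dense model: 7e9 params,
# hidden_size 4096, 32 layers — assumed here for illustration — with the script defaults
# of batch 16, seq 4096, FP16 weights and activations):
#   model ≈ 13 GiB, gradients ≈ 13 GiB, optimiser ≈ 52 GiB, activations ≈ 16 GiB,
#   so ≈ 94 GiB in total, i.e. at least two 80 GB GPUs by this estimate.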
def compute_model_estimates(model_name, defaults, cli_overrides):
    """
    Compute estimates for a single model:
      - Effective parameter count (using the MoE-aware function if applicable)
      - Detailed training memory (using Eleuther's formula)
      - FLOP-based compute estimates.
    """
    config = load_model_config(model_name)
    config_dict = config.to_dict()
    if "n_routed_experts" in config_dict:
        activated_params, full_params = estimate_activated_moe_params_general(
            config_dict
        )
    else:
        if hasattr(config, "num_parameters"):
            full_params = config.num_parameters()
        else:
            match = re.search(r"(\d+)[bB]", config.name_or_path)
            if match:
                full_params = float(match.group(1)) * 1e9
            else:
                full_params = 7e9  # default to 7B

    # override if provided via CLI
    if cli_overrides.get("model_parameters") is not None:
        full_params = cli_overrides["model_parameters"]

    # memory estimation using Eleuther AI's formula
    hidden_size = getattr(config, "hidden_size", 7168)
    n_layers = getattr(config, "num_hidden_layers", 61)
    batch_size = defaults.get("batch_size", 16)
    seq_length = defaults.get("seq_length", 4096)
    activation_bytes = defaults.get("activation_bytes", 2)  # FP16 activations
    training_memory_gb = compute_training_memory(
        full_params,
        defaults,
        hidden_size,
        n_layers,
        batch_size,
        seq_length,
        activation_bytes,
    )
    required_gpus = math.ceil(training_memory_gb / defaults["gpu_vram_gb"])

    # FLOP-based compute estimation:
    full_training_flops = (
        defaults["flops_per_param_token"] * full_params * defaults["total_tokens"]
    )
    ft_flops = full_training_flops / defaults["fine_tuning_scaling_factor"]
    num_ft_gpus = (
        defaults["num_ft_gpus"]
        if defaults["num_ft_gpus"] is not None
        else required_gpus
    )
    cluster_throughput = num_ft_gpus * defaults["gpu_flops"]
    ft_time_sec = ft_flops / cluster_throughput
    ft_time_hr = ft_time_sec / 3600
    gpu_hours_per_ft_run = ft_time_hr * num_ft_gpus
    base_ft_total_gpu_hours = defaults["ft_runs"] * gpu_hours_per_ft_run

    return {
        "model_name": model_name,
        "num_params": full_params,
        "hidden_size": hidden_size,
        "n_layers": n_layers,
        "training_memory_gb": training_memory_gb,
        "required_gpus": required_gpus,
        "num_ft_gpus": num_ft_gpus,
        "ft_time_hr": ft_time_hr,
        "gpu_hours_per_ft_run": gpu_hours_per_ft_run,
        "base_ft_total_gpu_hours": base_ft_total_gpu_hours,
    }
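
# Example standalone use (a sketch only — the model id below is illustrative and the
# `defaults` dict must contain the same keys as the one built in main()):
#
#   est = compute_model_estimates("meta-llama/Llama-2-7b-hf", defaults, {"model_parameters": None})
#   print(f"{est['training_memory_gb']:.1f} GB -> {est['required_gpus']} GPU(s), "
#         f"{est['gpu_hours_per_ft_run']:.1f} GPU-hours per fine-tuning run")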
def main():
    parser = argparse.ArgumentParser(
        description="Estimate compute budget and GPU requirements for fine-tuning LLMs.",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Hugging Face model name (for single-model mode).",
    )
    parser.add_argument(
        "--config", type=str, help="Optional JSON config file (for multi-model mode)."
    )
    # optional CLI overrides
    parser.add_argument(
        "--ft_runs", type=int, help="Override: number of fine-tuning runs."
    )
    parser.add_argument(
        "--hp_configs",
        type=int,
        help="Override: number of hyperparameter configurations.",
    )
    parser.add_argument(
        "--abl_experiments", type=int, help="Override: number of ablation experiments."
    )
    parser.add_argument(
        "--gpu_cost_usd", type=float, help="Override: cost per GPU-hour in USD."
    )
    args = parser.parse_args()

    defaults = {
        "total_tokens": 54e9,  # cf. FineMath
        "flops_per_param_token": 6,
        "fine_tuning_scaling_factor": 100,
        "gpu_vram_gb": 80,
        "fine_tuning_precision_bytes": 2,
        "batch_size": 16,
        "seq_length": 4096,
        "activation_bytes": 2,
        "ft_runs": 100,
        "hp_configs": 200,
        "hp_hours_per_config": 50,
        "abl_experiments": 50,
        "abl_hours_per_experiment": 200,
        "gpu_cost_usd": 3.00,
        "usd_to_gbp": 0.81,
        "gpu_flops": 5.0e14,
        "num_ft_gpus": None,
        "synth_tokens": 4e9,
        "synth_tier": "DeepSeek Reasoner",
        "synth_input_fraction": 0.1,
        "synth_output_fraction": 0.9,
    }

    # override defaults from CLI if provided
    if args.ft_runs is not None:
        defaults["ft_runs"] = args.ft_runs
    if args.hp_configs is not None:
        defaults["hp_configs"] = args.hp_configs
    if args.abl_experiments is not None:
        defaults["abl_experiments"] = args.abl_experiments
    if args.gpu_cost_usd is not None:
        defaults["gpu_cost_usd"] = args.gpu_cost_usd

    # CLI overrides dict for model_parameters if wanted
    cli_overrides = {"model_parameters": None}

    models_info = []
    if args.config:
        config_data = load_json_config(args.config)
        # merge global defaults from JSON if present
        for key in defaults.keys():
            if key in config_data:
                defaults[key] = config_data[key]
        if "models" not in config_data:
            raise ValueError(
                "JSON config must include a 'models' key with a list of models (with weights)."
            )
        # normalize the weights
        total_weight = sum(model.get("weight", 1) for model in config_data["models"])
        if total_weight <= 0:
            raise ValueError("Sum of 'weight' fields must be > 0 for normalization.")
        for model_entry in config_data["models"]:
            raw_weight = model_entry.get("weight", 1)
            model_entry["weight"] = raw_weight / total_weight
        for model_entry in config_data["models"]:
            model_name = model_entry.get("model_name")
            if not model_name:
                continue
            weight = model_entry.get("weight", 1)
            # reset per model so one entry's override does not leak into the next
            cli_overrides["model_parameters"] = model_entry.get("model_parameters")
            est = compute_model_estimates(model_name, defaults, cli_overrides)
            est["weight"] = weight
            models_info.append(est)
    elif args.model_name:
        est = compute_model_estimates(args.model_name, defaults, cli_overrides)
        est["weight"] = 1
        models_info.append(est)
    else:
        raise ValueError(
            "You must supply either --model_name (single-model mode) or --config (multi-model mode)."
        )

    print("\n📊 Model Estimates:")
    overall_weighted_ft_gpu_hours = 0
    max_gpus_required = 0
    for info in models_info:
        print("--------------------------------------------")
        print(f"Model: {info['model_name']}")
        print(f" - Estimated parameters (full): {info['num_params']:.0f}")
        print(f" - Hidden size: {info['hidden_size']}")
        print(f" - Number of layers: {info['n_layers']}")
        print(
            f" - Training memory estimate: {info['training_memory_gb']:.2f} GB (Eleuther's formula)"
        )
        print(f" - Minimum GPUs required (memory): {info['required_gpus']}")
        print(f" - GPUs used for compute: {info['num_ft_gpus']}")
        print(f" - Time per FT run: {info['ft_time_hr']:.2f} hours")
        print(f" - GPU-hours per FT run: {info['gpu_hours_per_ft_run']:.0f} GPU-hours")
        print(
            f" - Base FT total GPU-hours ({defaults['ft_runs']} runs): {info['base_ft_total_gpu_hours']:.0f} GPU-hours"
        )
        print(
            f" - Weight (usage frequency): {info['weight']:.3f}"
        )  # now a normalized fraction
        overall_weighted_ft_gpu_hours += (
            info["weight"] * info["base_ft_total_gpu_hours"]
        )
        max_gpus_required = max(max_gpus_required, info["required_gpus"])

    total_weight = sum(info["weight"] for info in models_info)
    additional_gpu_hours = (
        defaults["hp_configs"] * defaults["hp_hours_per_config"]
        + defaults["abl_experiments"] * defaults["abl_hours_per_experiment"]
    )

    # synthetic data generation cost
    synth_tier = defaults.get("synth_tier", "DeepSeek Reasoner")
    if synth_tier not in PRESET_SYNTH_PRICES:
        raise ValueError(
            f"Synthetic tier '{synth_tier}' not recognized. Available tiers: {list(PRESET_SYNTH_PRICES.keys())}"
        )
    prices = PRESET_SYNTH_PRICES[synth_tier]
    input_frac = defaults.get("synth_input_fraction", 0.1)
    output_frac = defaults.get("synth_output_fraction", 0.9)
    cost_per_million = input_frac * prices["input"] + output_frac * prices["output"]
    synth_tokens = defaults.get("synth_tokens", 1e9)  # total number of tokens
    synth_cost = (
        synth_tokens / 1e6
    ) * cost_per_million  # convert to millions for pricing

    overall_total_gpu_hours = (
        overall_weighted_ft_gpu_hours + additional_gpu_hours * total_weight
    )
    gpu_compute_cost_usd = overall_total_gpu_hours * defaults["gpu_cost_usd"]
    total_cost_usd = gpu_compute_cost_usd + synth_cost
    total_cost_gbp = total_cost_usd * defaults["usd_to_gbp"]

    print("\n🧮 Overall Compute Estimation:")
    print(
        f" Weighted fine-tuning GPU-hours (base): {overall_weighted_ft_gpu_hours:.0f} GPU-hours"
    )
    print(
        f" Additional experiments (HP + Ablation) GPU-hours (weighted): {additional_gpu_hours * total_weight:.0f} GPU-hours"
    )
    print(f" Total GPU-hours: {overall_total_gpu_hours:.0f} GPU-hours")
    print(f" GPU Compute Cost: ${gpu_compute_cost_usd:,.2f} USD")
    print(f" Synthetic Data Generation Cost: ${synth_cost:,.2f} USD")
    print(f" Total Cost: ${total_cost_usd:,.2f} USD / £{total_cost_gbp:,.2f} GBP")
    print(f"\n🖥️ Minimum GPUs required (by memory): {max_gpus_required} GPUs\n")


if __name__ == "__main__":
    main()