Little CLI to estimate the compute budget for an LLM research project.
"""
Grant Budget & GPU Resource Estimator

A script for estimating compute requirements and costs for LLM training:

1. Compute budget estimation (FLOP-based calculations):
   - GPU-hours and costs for fine-tuning runs
   - Hyperparameter search
   - Ablations

2. Memory requirement estimation using EleutherAI's formula
   (https://blog.eleuther.ai/transformer-math/):
   Total Memory = Model Memory + Optimizer Memory + Gradient Memory + Activation Memory

3. Synthetic data generation cost estimation:
   - Based on input/output token counts and provider pricing tiers
   - Cost = (tokens / 1e6) × (input_fraction × input_price + output_fraction × output_price)
   - Prices in USD per million tokens, varying by provider/model
   - synth_tokens: Total number of synthetic tokens to generate
   - synth_tier: Provider/model tier for generation
   - synth_input_fraction: Fraction of tokens that are inputs
   - synth_output_fraction: Fraction of tokens that are outputs

Supports dense transformer models and Mixture-of-Experts (MoE) models from Hugging Face.

Default assumptions (configurable via JSON or CLI; see the back-of-envelope example below):
- Model parameters: FP16 (2 bytes/param)
- Optimizer state: 8 bytes/param (AdamW - momentum & variance in FP32)
- Gradient storage: Same as model memory
- Activation memory: batch_size × seq_length × hidden_size × num_layers × activation_bytes
- FLOPs per parameter-token: 6
- Fine-tuning compute ratio: 1/fine_tuning_scaling_factor of full training
- Synthetic data: 4B tokens total (10% input, 90% output) using the DeepSeek Reasoner tier
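Back-of-envelope example (an illustrative sketch under the in-script defaults, assuming
a 7B dense model fine-tuned on two 80 GB GPUs and fine_tuning_scaling_factor = 100):
one fine-tuning run needs roughly 6 FLOPs/param-token × 7e9 params × 54e9 tokens / 100
≈ 2.3e19 FLOPs, which at 2 GPUs × 5e14 FLOP/s ≈ 1e15 FLOP/s takes about 6.3 hours,
i.e. ~13 GPU-hours per run.
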
Usage:
    Single model:
        python grant_budget_estimator.py --model_name <HuggingFace model name>
    Multiple models:
        python grant_budget_estimator.py --config <config.json>

Example config.json:
{
    "models": [
        {"model_name": "deepseek-ai/DeepSeek-R1", "weight": 1},
        {"model_name": "mistralai/Mistral-Small-24B-Instruct-2501", "weight": 2},
        {"model_name": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", "weight": 2}
    ],
    "total_tokens": 54e9,
    "flops_per_param_token": 6,
    "fine_tuning_scaling_factor": 1000,
    "gpu_vram_gb": 80,
    "fine_tuning_precision_bytes": 2,
    "batch_size": 16,
    "seq_length": 4096,
    "activation_bytes": 2,
    "ft_runs": 100,
    "hp_configs": 200,
    "hp_hours_per_config": 50,
    "abl_experiments": 50,
    "abl_hours_per_experiment": 200,
    "synth_tokens": 4e9,
    "synth_tier": "DeepSeek Reasoner",
    "synth_input_fraction": 0.1,
    "synth_output_fraction": 0.9,
    "gpu_cost_usd": 3.00,
    "usd_to_gbp": 0.81,
    "gpu_flops": 5.0e14,
    "num_ft_gpus": null
}
"""
import argparse
import json
import math
import os
import re

from transformers import AutoConfig

# synthetic data generation prices (USD per million tokens)
PRESET_SYNTH_PRICES = {
    "Gemini 1.5 Flash ≤128k": {"input": 0.07, "output": 0.3},
    "Gemini 1.5 Flash >128k": {"input": 0.15, "output": 0.6},
    "Gemini 1.5 Flash-8B ≤128k": {"input": 0.04, "output": 0.15},
    "Gemini 1.5 Flash-8B >128k": {"input": 0.07, "output": 0.3},
    "Gemini 1.5 Pro ≤128k": {"input": 1.25, "output": 5},
    "Gemini 1.5 Pro >128k": {"input": 2.5, "output": 10},
    "Claude 3.5 Sonnet": {"input": 3, "output": 15},
    "Claude 3 Opus": {"input": 15, "output": 75},
    "Claude 3 Haiku": {"input": 0.25, "output": 1.25},
    "Claude 3.5 Haiku": {"input": 0.8, "output": 4},
    "GPT-4o": {"input": 2.5, "output": 10},
    "GPT-4o Mini": {"input": 0.15, "output": 0.6},
    "o1 and o1-preview": {"input": 15, "output": 60},
    "o1-mini": {"input": 1.1, "output": 4.4},
    "o3-mini": {"input": 1.1, "output": 4.4},
    "Amazon Nova Micro": {"input": 0.04, "output": 0.14},
    "Amazon Nova Lite": {"input": 0.06, "output": 0.24},
    "Amazon Nova Pro": {"input": 0.8, "output": 3.2},
    "DeepSeek Chat": {"input": 0.14, "output": 0.28},
    "DeepSeek Reasoner": {"input": 0.55, "output": 2.19},
}
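
# Worked example (a sketch using the script's own defaults, not provider-verified
# pricing): with the "DeepSeek Reasoner" tier and a 10%/90% input/output split,
#   cost_per_million = 0.1 * 0.55 + 0.9 * 2.19 = 2.026 USD per million tokens,
# so the default 4e9 synthetic tokens come to roughly 4,000 * 2.026 ≈ $8,100.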
def load_json_config(filepath):
    """Load and parse a JSON configuration file."""
    if os.path.exists(filepath):
        with open(filepath, "r") as f:
            return json.load(f)
    else:
        raise FileNotFoundError(f"Configuration file '{filepath}' not found. 🚫")


def load_model_config(model_name):
    """Fetch model configuration from Hugging Face."""
    return AutoConfig.from_pretrained(model_name)
def estimate_activated_moe_params_general(config: dict):
    """
    Estimate the per-token activated parameter count and the full parameter count
    for a MoE-based HF model like DeepSeek-R1.
    Returns a 2-tuple: (activated_params, full_params)
    - 'activated_params' is computed with a fraction (experts_per_token/n_routed_experts)
      of the MoE feed-forward cost.
    - 'full_params' assumes that all experts are loaded in memory (i.e. the full cost).
    """
    hidden_size = config.get("hidden_size", 7168)
    vocab_size = config.get("vocab_size", 129280)
    n_layers = config.get("num_hidden_layers", 61)
    first_k_dense = config.get("first_k_dense_replace", 3)
    int_size_dense = config.get("intermediate_size", 4 * hidden_size)
    int_size_moe = config.get("moe_intermediate_size", 2048)
    n_routed_experts = config.get("n_routed_experts", 256)
    n_shared_experts = config.get("n_shared_experts", 1)
    experts_per_token = config.get("num_experts_per_tok", 8)
    moe_layer_freq = config.get("moe_layer_freq", 1)
    q_lora_rank = config.get("q_lora_rank", 1536)
    kv_lora_rank = config.get("kv_lora_rank", 512)
    num_nextn_predict_layers = config.get("num_nextn_predict_layers", 1)
    tie_embeddings = config.get("tie_word_embeddings", False)

    # embedding parameters
    if tie_embeddings:
        emb_params = hidden_size * vocab_size
    else:
        emb_params = 2.0 * hidden_size * vocab_size

    # attention cost
    attn_base = 4.0 * (hidden_size**2)
    mla_overhead = 30e6 * ((hidden_size / 7168.0) ** 2)
    attn_cost = attn_base + mla_overhead

    def lora_cost(h, rank):
        return 2.0 * h * rank

    lora_overhead = lora_cost(hidden_size, q_lora_rank) + 2 * lora_cost(
        hidden_size, kv_lora_rank
    )
    total_attn_cost = attn_cost + lora_overhead

    # MLP cost
    dense_ffn_cost = 2.0 * hidden_size * int_size_dense
    moe_expert_cost = 2.0 * hidden_size * int_size_moe
    # for activated parameter count, only a fraction of experts is active:
    fraction_active = (
        experts_per_token / float(n_routed_experts) if n_routed_experts > 0 else 0.0
    )
    activated_expert_cost = (
        moe_expert_cost * n_routed_experts * fraction_active
        + moe_expert_cost * n_shared_experts
    )
    # for full parameter count, all experts are loaded:
    full_expert_cost = (
        moe_expert_cost * n_routed_experts + moe_expert_cost * n_shared_experts
    )
    # gating cost for MoE layers
    gating_cost = hidden_size * n_routed_experts if n_routed_experts > 0 else 0.0

    total_layer_cost_activated = 0.0
    total_layer_cost_full = 0.0
    for layer_idx in range(n_layers):
        layer_cost_attn = total_attn_cost
        if layer_idx < first_k_dense:
            layer_cost_dense = dense_ffn_cost
            layer_cost_moe_activated = 0.0
            layer_cost_moe_full = 0.0
        else:
            # if this layer is designated as an MoE layer:
            if n_routed_experts > 0 and (
                (layer_idx - first_k_dense) % moe_layer_freq == 0
            ):
                layer_cost_moe_activated = activated_expert_cost + gating_cost
                layer_cost_moe_full = full_expert_cost + gating_cost
                layer_cost_dense = 0.0
            else:
                layer_cost_dense = dense_ffn_cost
                layer_cost_moe_activated = 0.0
                layer_cost_moe_full = 0.0
        total_layer_cost_activated += (
            layer_cost_attn + layer_cost_dense + layer_cost_moe_activated
        )
        total_layer_cost_full += (
            layer_cost_attn + layer_cost_dense + layer_cost_moe_full
        )

    # multi-Token Prediction overhead (applies equally to both)
    mtp_overhead = 300e6 * ((hidden_size / 7168.0) ** 2) * num_nextn_predict_layers
    # fudge factor for extra parameters (norms, biases, etc.)
    fudge = 2.0e9
    total_activated = emb_params + total_layer_cost_activated + mtp_overhead + fudge
    total_full = emb_params + total_layer_cost_full + mtp_overhead + fudge
    return (total_activated, total_full)
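
# Worked example (a sketch using the DeepSeek-R1-style defaults hard-coded above; an
# actual model config may differ): moe_expert_cost = 2 * 7168 * 2048 ≈ 29.4M params per
# expert, so each MoE layer contributes (8 routed + 1 shared) * 29.4M ≈ 0.26B activated
# params but 257 * 29.4M ≈ 7.5B full params (ignoring the small gating term) — the gap
# captured by the two return values.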
def compute_training_memory(
    est_params,
    defaults,
    hidden_size,
    n_layers,
    batch_size,
    seq_length,
    activation_bytes,
):
    """
    Compute total training memory (in GB) using Eleuther's formula:
        Total Memory = Model Memory + Optimiser Memory + Gradient Memory + Activation Memory
    Assumes:
      - Model Memory = N * param_bytes (FP16: 2 bytes)
      - Gradient Memory = N * param_bytes
      - Optimiser Memory = N * 8 (AdamW: two FP32 copies)
      - Activation Memory = batch_size * seq_length * hidden_size * n_layers * activation_bytes
    """
    param_bytes = defaults["fine_tuning_precision_bytes"]
    model_memory = est_params * param_bytes
    gradient_memory = est_params * param_bytes
    optimiser_memory = est_params * 8
    activation_memory = (
        batch_size * seq_length * hidden_size * n_layers * activation_bytes
    )
    total_bytes = model_memory + gradient_memory + optimiser_memory + activation_memory
    total_gb = total_bytes / (1024**3)
    return total_gb
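
# Worked example (a sketch assuming a roughly Llama-2-7B-shaped dense model: 7e9 params,
# hidden_size 4096, 32 layers — assumed here for illustration — with the script defaults
# of batch 16, seq 4096, FP16 weights and activations):
#   model ≈ 13 GiB, gradients ≈ 13 GiB, optimiser ≈ 52 GiB, activations ≈ 16 GiB,
#   so ≈ 94 GiB in total, i.e. at least two 80 GB GPUs by this estimate.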
def compute_model_estimates(model_name, defaults, cli_overrides):
    """
    Compute estimates for a single model:
      - Effective parameter count (using the MoE-aware function if applicable)
      - Detailed training memory (using Eleuther's formula)
      - FLOP-based compute estimates.
    """
    config = load_model_config(model_name)
    config_dict = config.to_dict()
    if "n_routed_experts" in config_dict:
        activated_params, full_params = estimate_activated_moe_params_general(
            config_dict
        )
    else:
        if hasattr(config, "num_parameters"):
            full_params = config.num_parameters()
        else:
            match = re.search(r"(\d+)[bB]", config.name_or_path)
            if match:
                full_params = float(match.group(1)) * 1e9
            else:
                full_params = 7e9  # default to 7B

    # override if provided via CLI
    if cli_overrides.get("model_parameters") is not None:
        full_params = cli_overrides["model_parameters"]

    # memory estimation using Eleuther AI's formula
    hidden_size = getattr(config, "hidden_size", 7168)
    n_layers = getattr(config, "num_hidden_layers", 61)
    batch_size = defaults.get("batch_size", 16)
    seq_length = defaults.get("seq_length", 4096)
    activation_bytes = defaults.get("activation_bytes", 2)  # FP16 activations
    training_memory_gb = compute_training_memory(
        full_params,
        defaults,
        hidden_size,
        n_layers,
        batch_size,
        seq_length,
        activation_bytes,
    )
    required_gpus = math.ceil(training_memory_gb / defaults["gpu_vram_gb"])

    # FLOP-based compute estimation:
    full_training_flops = (
        defaults["flops_per_param_token"] * full_params * defaults["total_tokens"]
    )
    ft_flops = full_training_flops / defaults["fine_tuning_scaling_factor"]
    num_ft_gpus = (
        defaults["num_ft_gpus"]
        if defaults["num_ft_gpus"] is not None
        else required_gpus
    )
    cluster_throughput = num_ft_gpus * defaults["gpu_flops"]
    ft_time_sec = ft_flops / cluster_throughput
    ft_time_hr = ft_time_sec / 3600
    gpu_hours_per_ft_run = ft_time_hr * num_ft_gpus
    base_ft_total_gpu_hours = defaults["ft_runs"] * gpu_hours_per_ft_run

    return {
        "model_name": model_name,
        "num_params": full_params,
        "hidden_size": hidden_size,
        "n_layers": n_layers,
        "training_memory_gb": training_memory_gb,
        "required_gpus": required_gpus,
        "num_ft_gpus": num_ft_gpus,
        "ft_time_hr": ft_time_hr,
        "gpu_hours_per_ft_run": gpu_hours_per_ft_run,
        "base_ft_total_gpu_hours": base_ft_total_gpu_hours,
    }
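
# Example standalone use (a sketch only — the model id below is illustrative and the
# `defaults` dict must contain the same keys as the one built in main()):
#
#   est = compute_model_estimates("meta-llama/Llama-2-7b-hf", defaults, {"model_parameters": None})
#   print(f"{est['training_memory_gb']:.1f} GB -> {est['required_gpus']} GPU(s), "
#         f"{est['gpu_hours_per_ft_run']:.1f} GPU-hours per fine-tuning run")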
def main():
    parser = argparse.ArgumentParser(
        description="Estimate compute budget and GPU requirements for fine-tuning LLMs.",
    )
    parser.add_argument(
        "--model_name",
        type=str,
        help="Hugging Face model name (for single-model mode).",
    )
    parser.add_argument(
        "--config", type=str, help="Optional JSON config file (for multi-model mode)."
    )
    # optional CLI overrides
    parser.add_argument(
        "--ft_runs", type=int, help="Override: number of fine-tuning runs."
    )
    parser.add_argument(
        "--hp_configs",
        type=int,
        help="Override: number of hyperparameter configurations.",
    )
    parser.add_argument(
        "--abl_experiments", type=int, help="Override: number of ablation experiments."
    )
    parser.add_argument(
        "--gpu_cost_usd", type=float, help="Override: cost per GPU-hour in USD."
    )
    args = parser.parse_args()

    defaults = {
        "total_tokens": 54e9,  # cf. FineMath
        "flops_per_param_token": 6,
        "fine_tuning_scaling_factor": 100,
        "gpu_vram_gb": 80,
        "fine_tuning_precision_bytes": 2,
        "batch_size": 16,
        "seq_length": 4096,
        "activation_bytes": 2,
        "ft_runs": 100,
        "hp_configs": 200,
        "hp_hours_per_config": 50,
        "abl_experiments": 50,
        "abl_hours_per_experiment": 200,
        "gpu_cost_usd": 3.00,
        "usd_to_gbp": 0.81,
        "gpu_flops": 5.0e14,
        "num_ft_gpus": None,
        "synth_tokens": 4e9,
        "synth_tier": "DeepSeek Reasoner",
        "synth_input_fraction": 0.1,
        "synth_output_fraction": 0.9,
    }

    # override defaults from CLI if provided
    if args.ft_runs is not None:
        defaults["ft_runs"] = args.ft_runs
    if args.hp_configs is not None:
        defaults["hp_configs"] = args.hp_configs
    if args.abl_experiments is not None:
        defaults["abl_experiments"] = args.abl_experiments
    if args.gpu_cost_usd is not None:
        defaults["gpu_cost_usd"] = args.gpu_cost_usd

    # CLI overrides dict for model_parameters if wanted
    cli_overrides = {"model_parameters": None}

    models_info = []
    if args.config:
        config_data = load_json_config(args.config)
        # merge global defaults from JSON if present
        for key in defaults.keys():
            if key in config_data:
                defaults[key] = config_data[key]
        if "models" not in config_data:
            raise ValueError(
                "JSON config must include a 'models' key with a list of models (with weights)."
            )
        # normalize the weights
        total_weight = sum(model.get("weight", 1) for model in config_data["models"])
        if total_weight <= 0:
            raise ValueError("Sum of 'weight' fields must be > 0 for normalization.")
        for model_entry in config_data["models"]:
            raw_weight = model_entry.get("weight", 1)
            model_entry["weight"] = raw_weight / total_weight
        for model_entry in config_data["models"]:
            model_name = model_entry.get("model_name")
            if not model_name:
                continue
            weight = model_entry.get("weight", 1)
            # reset per model so one entry's override does not leak into the next
            cli_overrides["model_parameters"] = model_entry.get("model_parameters")
            est = compute_model_estimates(model_name, defaults, cli_overrides)
            est["weight"] = weight
            models_info.append(est)
    elif args.model_name:
        est = compute_model_estimates(args.model_name, defaults, cli_overrides)
        est["weight"] = 1
        models_info.append(est)
    else:
        raise ValueError(
            "You must supply either --model_name (single-model mode) or --config (multi-model mode)."
        )

    print("\n📊 Model Estimates:")
    overall_weighted_ft_gpu_hours = 0
    max_gpus_required = 0
    for info in models_info:
        print("--------------------------------------------")
        print(f"Model: {info['model_name']}")
        print(f" - Estimated parameters (full): {info['num_params']:.0f}")
        print(f" - Hidden size: {info['hidden_size']}")
        print(f" - Number of layers: {info['n_layers']}")
        print(
            f" - Training memory estimate: {info['training_memory_gb']:.2f} GB (Eleuther's formula)"
        )
        print(f" - Minimum GPUs required (memory): {info['required_gpus']}")
        print(f" - GPUs used for compute: {info['num_ft_gpus']}")
        print(f" - Time per FT run: {info['ft_time_hr']:.2f} hours")
        print(f" - GPU-hours per FT run: {info['gpu_hours_per_ft_run']:.0f} GPU-hours")
        print(
            f" - Base FT total GPU-hours ({defaults['ft_runs']} runs): {info['base_ft_total_gpu_hours']:.0f} GPU-hours"
        )
        print(
            f" - Weight (usage frequency): {info['weight']:.3f}"
        )  # now a normalized fraction
        overall_weighted_ft_gpu_hours += (
            info["weight"] * info["base_ft_total_gpu_hours"]
        )
        max_gpus_required = max(max_gpus_required, info["required_gpus"])

    total_weight = sum(info["weight"] for info in models_info)
    additional_gpu_hours = (
        defaults["hp_configs"] * defaults["hp_hours_per_config"]
        + defaults["abl_experiments"] * defaults["abl_hours_per_experiment"]
    )

    # synthetic data generation cost
    synth_tier = defaults.get("synth_tier", "DeepSeek Reasoner")
    if synth_tier not in PRESET_SYNTH_PRICES:
        raise ValueError(
            f"Synthetic tier '{synth_tier}' not recognized. Available tiers: {list(PRESET_SYNTH_PRICES.keys())}"
        )
    prices = PRESET_SYNTH_PRICES[synth_tier]
    input_frac = defaults.get("synth_input_fraction", 0.1)
    output_frac = defaults.get("synth_output_fraction", 0.9)
    cost_per_million = input_frac * prices["input"] + output_frac * prices["output"]
    synth_tokens = defaults.get("synth_tokens", 1e9)  # total number of tokens
    synth_cost = (
        synth_tokens / 1e6
    ) * cost_per_million  # convert to millions for pricing

    overall_total_gpu_hours = (
        overall_weighted_ft_gpu_hours + additional_gpu_hours * total_weight
    )
    gpu_compute_cost_usd = overall_total_gpu_hours * defaults["gpu_cost_usd"]
    total_cost_usd = gpu_compute_cost_usd + synth_cost
    total_cost_gbp = total_cost_usd * defaults["usd_to_gbp"]

    print("\n🧮 Overall Compute Estimation:")
    print(
        f" Weighted fine-tuning GPU-hours (base): {overall_weighted_ft_gpu_hours:.0f} GPU-hours"
    )
    print(
        f" Additional experiments (HP + Ablation) GPU-hours (weighted): {additional_gpu_hours * total_weight:.0f} GPU-hours"
    )
    print(f" Total GPU-hours: {overall_total_gpu_hours:.0f} GPU-hours")
    print(f" GPU Compute Cost: ${gpu_compute_cost_usd:,.2f} USD")
    print(f" Synthetic Data Generation Cost: ${synth_cost:,.2f} USD")
    print(f" Total Cost: ${total_cost_usd:,.2f} USD / £{total_cost_gbp:,.2f} GBP")
    print(f"\n🖥️ Minimum GPUs required (by memory): {max_gpus_required} GPUs\n")


if __name__ == "__main__":
    main()