Skip to content

Instantly share code, notes, and snippets.

@w32zhong
Created October 5, 2024 17:14
Show Gist options
  • Select an option

  • Save w32zhong/652ac50634dc3bed14b461c5c696cb50 to your computer and use it in GitHub Desktop.

Select an option

Save w32zhong/652ac50634dc3bed14b461c5c696cb50 to your computer and use it in GitHub Desktop.
GPU VRAM estimate for pre-training LLMs.
import math
def act_mem(layers, seqlen, h_dim, heads, precision=2, bs=1):
    """Estimate the GPU VRAM, in (decimal) GB, needed to hold the
    intermediate activations of a standard Transformer stack.

    Args:
        layers: number of Transformer blocks.
        seqlen: training sequence length.
        h_dim: model hidden dimension.
        heads: number of attention heads.
        precision: bytes per activation element (2 = fp16/bf16).
        bs: micro-batch size.

    Returns:
        Activation memory in GB, rounded to two decimals.
    """
    # Per-(layer, token, hidden-unit) byte factor; at precision=2 this
    # reduces to the familiar sbh*(34 + 5*a*s/h) activation estimate.
    factor = (
        16
        + 2 / precision
        + 2 * heads * seqlen / h_dim
        + heads * seqlen / (precision * h_dim)
    )
    total_bytes = layers * precision * seqlen * bs * h_dim * factor
    return round(total_bytes / 10**9, 2)
from transformers import AutoConfig

# Training sequence length assumed when estimating activation memory.
SEQLEN = 2048

# GPU card under consideration (alternatives kept for quick switching):
#   dict(name='NVIDIA RTX 4000 SFF Ada', vram=20, watts=70)
#   dict(name='NVIDIA RTX 3060', vram=12, watts=170)
#   dict(name='NVIDIA RTX 5880 ADA', vram=48, watts=285)
card = dict(name='NVIDIA RTX 4060', vram=16, watts=165)
# Hoisted out of the loop: the card choice never changes per model.
name, watts, vram = card['name'], card['watts'], card['vram']

for qbits in [16]:  # weight precision, in bits
    for x in [7, 13, 30, 65]:  # LLaMA model sizes, in billions of parameters
        #config = AutoConfig.from_pretrained(f"Qwen/Qwen1.5-{x}B")
        config = AutoConfig.from_pretrained(f"huggyllama/llama-{x}b")
        # Per-parameter bytes: weights at qbits precision, fp16 gradients,
        # and Adam optimizer state (fp32 master copy + two fp32 moments).
        param_bytes = qbits / 8
        grads_bytes = 2
        optim_bytes = 4 * 3
        # x is already in billions of params, so `state` comes out in GB.
        state = x * (param_bytes + grads_bytes + optim_bytes)
        act = act_mem(
            config.num_hidden_layers,
            SEQLEN,
            config.hidden_size,
            config.num_attention_heads,
        )
        total = state + act
        # Number of cards needed to fit the full training state.
        cards = math.ceil(total / vram)
        print(f'# of {name} ({watts}W): {cards} ({cards * vram}G, {watts * cards}W)')
        print(f'{qbits}bit-{x}B: {state:.1f} GB + {act:.1f} GB, total = {total:.1f} GB')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment