Created
October 5, 2024 17:14
-
-
Save w32zhong/652ac50634dc3bed14b461c5c696cb50 to your computer and use it in GitHub Desktop.
GPU vram estimate for pre-training LLMs.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import math | |
def act_mem(layers, seqlen, h_dim, heads, precision=2, bs=1):
    """Estimate the GPU VRAM (in GB, decimal) needed to hold the
    intermediate activations of a vanilla Transformer stack.

    With the default precision=2 (fp16/bf16) the per-layer cost reduces
    to the well-known s*b*h*(34 + 5*a*s/h) bytes from Korthikanti et al.,
    "Reducing Activation Recomputation in Large Transformer Models".

    Args:
        layers: number of Transformer blocks.
        seqlen: sequence length s.
        h_dim: hidden dimension h.
        heads: number of attention heads a.
        precision: bytes per activation element (2 for half precision).
        bs: micro-batch size b.

    Returns:
        Activation memory in GB, rounded to 2 decimal places.
    """
    # Per-element factor: MLP/attention projections (16), softmax dropout
    # mask (2/precision), attention scores (2*a*s/h), and the attention
    # dropout mask (a*s/(precision*h)).
    per_elem = (
        16
        + 2 / precision
        + 2 * heads * seqlen / h_dim
        + heads * seqlen / (precision * h_dim)
    )
    total_bytes = layers * precision * seqlen * bs * h_dim * per_elem
    return round(total_bytes / 10**9, 2)
from transformers import AutoConfig

# GPU card being sized for; alternatives kept for quick comparison.
# Hoisted out of the loop below: the card choice is loop-invariant.
#card = dict(name='NVIDIA RTX 4000 SFF Ada', vram=20, watts=70)
#card = dict(name='NVIDIA RTX 3060', vram=12, watts=170)
#card = dict(name='NVIDIA RTX 5880 ADA', vram=48, watts=285)
card = dict(name='NVIDIA RTX 4060', vram=16, watts=165)
name, watts, vram = card['name'], card['watts'], card['vram']

for qbits in [16]:
    for x in [7, 13, 30, 65]:  # model sizes in billions of parameters
        #config = AutoConfig.from_pretrained(f"Qwen/Qwen1.5-{x}B")
        config = AutoConfig.from_pretrained(f"huggyllama/llama-{x}b")
        # Training-state bytes per parameter:
        param_bytes = qbits / 8  # the weights themselves
        grads_bytes = 2          # half-precision gradients
        optim_bytes = 4 * 3      # Adam: fp32 master copy + two fp32 moments
        # GB of optimizer/weight/gradient state for an x-billion-param model.
        state = x * (param_bytes + grads_bytes + optim_bytes)
        act = act_mem(
            config.num_hidden_layers,
            2048,  # assumed training sequence length
            config.hidden_size,
            config.num_attention_heads
        )
        total = state + act
        cards = math.ceil(total / vram)
        print(f'# of {name} ({watts}W): {cards} ({cards * vram}G, {watts * cards}W)')
        print(f'{qbits}bit-{x}B: {state:.1f} GB + {act:.1f} GB, total = {total:.1f} GB')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment