Skip to content

Instantly share code, notes, and snippets.

@w32zhong
Created October 5, 2024 17:14
Show Gist options
  • Select an option

  • Save w32zhong/652ac50634dc3bed14b461c5c696cb50 to your computer and use it in GitHub Desktop.

Select an option

Save w32zhong/652ac50634dc3bed14b461c5c696cb50 to your computer and use it in GitHub Desktop.
GPU VRAM estimate for pre-training LLMs.
import math
def act_mem(layers, seqlen, h_dim, heads, precision=2, bs=1):
    """Estimate the GPU VRAM, in (decimal) GB, needed to hold the
    intermediate activations of a standard Transformer stack.

    Args:
        layers: number of Transformer blocks.
        seqlen: training sequence length.
        h_dim: model hidden dimension.
        heads: number of attention heads.
        precision: bytes per activation element (2 = fp16/bf16).
        bs: micro-batch size.

    Returns:
        Activation memory in GB, rounded to two decimals.
    """
    # Per-(layer, token, hidden-unit) byte factor; at precision=2 this
    # reduces to the familiar sbh*(34 + 5*a*s/h) activation estimate.
    factor = (
        16
        + 2 / precision
        + 2 * heads * seqlen / h_dim
        + heads * seqlen / (precision * h_dim)
    )
    total_bytes = layers * precision * seqlen * bs * h_dim * factor
    return round(total_bytes / 10**9, 2)
from transformers import AutoConfig

# Training sequence length assumed when estimating activation memory.
SEQLEN = 2048

# GPU card under consideration (alternatives kept for quick switching):
#   dict(name='NVIDIA RTX 4000 SFF Ada', vram=20, watts=70)
#   dict(name='NVIDIA RTX 3060', vram=12, watts=170)
#   dict(name='NVIDIA RTX 5880 ADA', vram=48, watts=285)
card = dict(name='NVIDIA RTX 4060', vram=16, watts=165)
# Hoisted out of the loop: the card choice never changes per model.
name, watts, vram = card['name'], card['watts'], card['vram']

for qbits in [16]:  # weight precision, in bits
    for x in [7, 13, 30, 65]:  # LLaMA model sizes, in billions of parameters
        #config = AutoConfig.from_pretrained(f"Qwen/Qwen1.5-{x}B")
        config = AutoConfig.from_pretrained(f"huggyllama/llama-{x}b")
        # Per-parameter bytes: weights at qbits precision, fp16 gradients,
        # and Adam optimizer state (fp32 master copy + two fp32 moments).
        param_bytes = qbits / 8
        grads_bytes = 2
        optim_bytes = 4 * 3
        # x is already in billions of params, so `state` comes out in GB.
        state = x * (param_bytes + grads_bytes + optim_bytes)
        act = act_mem(
            config.num_hidden_layers,
            SEQLEN,
            config.hidden_size,
            config.num_attention_heads,
        )
        total = state + act
        # Number of cards needed to fit the full training state.
        cards = math.ceil(total / vram)
        print(f'# of {name} ({watts}W): {cards} ({cards * vram}G, {watts * cards}W)')
        print(f'{qbits}bit-{x}B: {state:.1f} GB + {act:.1f} GB, total = {total:.1f} GB')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment