alvarobartt · March 27, 2025 17:04 · gabormadarasz2117 · Mar 27, 2025
diff --git a/kv_cache_computation.py b/kv_cache_computation.py
 from transformers import AutoConfig

 def kv_cache_size_bytes(config, tokens_in_cache, precision_in_bytes=2):
     """Return the size in bytes of the KV cache holding ``tokens_in_cache`` tokens.

     Args:
         config: Model configuration exposing ``num_hidden_layers``,
             ``num_key_value_heads``, ``hidden_size`` and ``num_attention_heads``
             (and optionally ``head_dim``).
         tokens_in_cache: Number of tokens currently held in the cache.
         precision_in_bytes: Bytes per cached element (2 for float16 / bfloat16).

     Returns:
         The cache size in bytes as an integer.
     """
     # Prefer an explicit per-head dimension when the config declares one; it is
     # not always equal to hidden_size // num_attention_heads.
     head_dim = getattr(config, "head_dim", None)
     if head_dim is None:
         head_dim = config.hidden_size // config.num_attention_heads

     return (
         2  # one tensor for keys and one for values
         * config.num_hidden_layers
         * config.num_key_value_heads
         * head_dim
         * tokens_in_cache
         * precision_in_bytes
     )


 def format_cache_size(cache_size_bytes):
     """Format a byte count as ``"X.XX MB"``, or ``"X.XX GB"`` from 1024 MB upwards."""
     cache_size_mb = cache_size_bytes / (1024 * 1024)
     if cache_size_mb >= 1024:
         return f"{cache_size_mb / 1024:.2f} GB"
     return f"{cache_size_mb:.2f} MB"


 if __name__ == "__main__":
     # NOTE(review): "hf_..." looks like a placeholder access token - replace before running.
     config = AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", token="hf_...")

     tokens_in_cache = 1024  # this is the only arg that will change over time (as more requests are sent)
     precision_in_bytes = 2  # float16 or bfloat16

     cache_size_bytes = kv_cache_size_bytes(config, tokens_in_cache, precision_in_bytes)
     print(format_cache_size(cache_size_bytes))
	from transformers import AutoConfig

	def kv_cache_size_bytes(config, tokens_in_cache, precision_in_bytes=2):
	    """Return the size in bytes of the KV cache holding ``tokens_in_cache`` tokens.

	    Args:
	        config: Model configuration exposing ``num_hidden_layers``,
	            ``num_key_value_heads``, ``hidden_size`` and ``num_attention_heads``
	            (and optionally ``head_dim``).
	        tokens_in_cache: Number of tokens currently held in the cache.
	        precision_in_bytes: Bytes per cached element (2 for float16 / bfloat16).

	    Returns:
	        The cache size in bytes as an integer.
	    """
	    # Prefer an explicit per-head dimension when the config declares one; it is
	    # not always equal to hidden_size // num_attention_heads.
	    head_dim = getattr(config, "head_dim", None)
	    if head_dim is None:
	        head_dim = config.hidden_size // config.num_attention_heads

	    return (
	        2  # one tensor for keys and one for values
	        * config.num_hidden_layers
	        * config.num_key_value_heads
	        * head_dim
	        * tokens_in_cache
	        * precision_in_bytes
	    )


	def format_cache_size(cache_size_bytes):
	    """Format a byte count as ``"X.XX MB"``, or ``"X.XX GB"`` from 1024 MB upwards."""
	    cache_size_mb = cache_size_bytes / (1024 * 1024)
	    if cache_size_mb >= 1024:
	        return f"{cache_size_mb / 1024:.2f} GB"
	    return f"{cache_size_mb:.2f} MB"


	if __name__ == "__main__":
	    # NOTE(review): "hf_..." looks like a placeholder access token - replace before running.
	    config = AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", token="hf_...")

	    tokens_in_cache = 1024  # this is the only arg that will change over time (as more requests are sent)
	    precision_in_bytes = 2  # float16 or bfloat16

	    cache_size_bytes = kv_cache_size_bytes(config, tokens_in_cache, precision_in_bytes)
	    print(format_cache_size(cache_size_bytes))