@alvarobartt
Last active February 3, 2025 15:41
KV Cache Size Computation
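The KV cache holds one key tensor and one value tensor per transformer layer, each sized by the number of KV heads and the per-head dimension, so its footprint grows linearly with the number of cached tokens. The snippet below reads those dimensions straight from the model config: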
from transformers import AutoConfig

if __name__ == "__main__":
    config = AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B-Instruct", token="hf_...")

    tokens_in_cache = 1024  # the only value that changes over time (grows as more requests are served)
    precision_in_bytes = 2  # float16 or bfloat16

    # KV cache size = 2 (keys and values) x layers x KV heads x head dim x tokens x bytes per element
    cache_size_bytes = (
        2
        * config.num_hidden_layers
        * config.num_key_value_heads
        * (config.hidden_size // config.num_attention_heads)  # head dimension
        * tokens_in_cache
        * precision_in_bytes
    )

    cache_size_mb = cache_size_bytes / (1024 * 1024)
    print(f"{cache_size_mb / 1024:.2f} GB" if cache_size_mb >= 1024 else f"{cache_size_mb:.2f} MB")
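As a sanity check that needs no Hugging Face token, the same arithmetic can be run with hard-coded values. The numbers below (32 hidden layers, 32 attention heads, 8 KV heads, hidden size 4096) are assumptions taken from Llama-3.1-8B's public config rather than from this gist:

# Sanity check with hard-coded Llama-3.1-8B config values (assumed from the
# public config.json, not fetched from the Hub).
num_hidden_layers = 32
num_key_value_heads = 8
head_dim = 4096 // 32  # hidden_size // num_attention_heads = 128
tokens_in_cache = 1024
precision_in_bytes = 2  # bfloat16

cache_size_bytes = (
    2
    * num_hidden_layers
    * num_key_value_heads
    * head_dim
    * tokens_in_cache
    * precision_in_bytes
)
print(f"{cache_size_bytes / (1024 * 1024):.2f} MB")  # 128.00 MB, i.e. 128 KB of KV cache per token

At that rate, a full 128K-token context for this model would occupy roughly 16 GB of KV cache in bfloat16.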