Created January 16, 2025 13:53
Get the GPU memory needed per precision for a Hugging Face model ID
from typing import Dict, Union
from huggingface_hub import get_safetensors_metadata
import argparse
import sys

# Example:
# python get_gpu_memory.py Qwen/Qwen2.5-7B-Instruct

# Dictionary mapping dtype strings to their byte sizes
bytes_per_dtype: Dict[str, float] = {
    "int4": 0.5,
    "int8": 1,
    "float8": 1,
    "float16": 2,
    "float32": 4,
}
def calculate_gpu_memory(parameters: float, bytes: float) -> float:
    """Calculates the GPU memory required for serving a Large Language Model (LLM).

    This function estimates the GPU memory needed using the formula:
        M = (P * 4B) / (32 / Q) * 1.18
    where:
    - M is the GPU memory in Gigabytes
    - P is the number of parameters in billions (e.g., 7 for a 7B model)
    - 4B represents 4 bytes per parameter
    - 32 represents bits in 4 bytes
    - Q is the quantization bits (e.g., 16, 8, or 4 bits)
    - 1.18 represents ~18% overhead for additional GPU memory requirements

    Args:
        parameters: Number of model parameters in billions
        bytes: Number of bytes per parameter based on dtype

    Returns:
        Estimated GPU memory required in Gigabytes

    Examples:
        >>> calculate_gpu_memory(7, bytes_per_dtype["float16"])
        13.72
        >>> calculate_gpu_memory(13, bytes_per_dtype["int8"])
        12.74
    """
    memory = round((parameters * 4) / (32 / (bytes * 8)) * 1.18, 2)
    return memory
def get_model_size(model_id: str, dtype: str = "float16") -> Union[float, None]:
    """Get the estimated GPU memory requirement for a Hugging Face model.

    Args:
        model_id: Hugging Face model ID (e.g., "facebook/opt-350m")
        dtype: Data type for model loading ("float16", "int8", etc.)

    Returns:
        Estimated GPU memory in GB, or None if estimation fails

    Examples:
        >>> get_model_size("facebook/opt-350m")
        0.82
        >>> get_model_size("meta-llama/Llama-2-7b-hf", dtype="int8")
        6.86
    """
    try:
        if dtype not in bytes_per_dtype:
            raise ValueError(
                f"Unsupported dtype: {dtype}. Supported types: {list(bytes_per_dtype.keys())}"
            )

        metadata = get_safetensors_metadata(model_id)
        if not metadata or not metadata.parameter_count:
            raise ValueError(f"Could not fetch metadata for model: {model_id}")

        model_parameters = list(metadata.parameter_count.values())[0]
        model_parameters = int(model_parameters) / 1_000_000_000  # Convert to billions
        return calculate_gpu_memory(model_parameters, bytes_per_dtype[dtype])

    except Exception as e:
        print(f"Error estimating model size: {str(e)}", file=sys.stderr)
        return None
def main():
    """Command-line interface for GPU memory estimation."""
    parser = argparse.ArgumentParser(
        description="Estimate GPU memory requirements for Hugging Face models"
    )
    parser.add_argument(
        "model_id", help="Hugging Face model ID (e.g., Qwen/Qwen2.5-7B-Instruct)"
    )
    parser.add_argument(
        "--dtype",
        default="float16",
        choices=bytes_per_dtype.keys(),
        help="Data type for model loading",
    )
    args = parser.parse_args()

    size = get_model_size(args.model_id, args.dtype)
    if size is None:
        sys.exit(1)  # Error details were already printed to stderr
    print(
        f"Estimated GPU memory requirement for {args.model_id}: {size:.2f} GB ({args.dtype})"
    )


if __name__ == "__main__":
    main()
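The estimator can also be imported and called directly instead of going through the CLI. A minimal sketch, assuming the script is saved as get_gpu_memory.py (the filename used in the usage comment at the top) and huggingface_hub is installed; the printed value depends on the parameter count reported in the model's safetensors metadata:

# Sketch: use the estimator from Python rather than the command line.
# Assumes the gist is saved as get_gpu_memory.py in the current directory.
from get_gpu_memory import calculate_gpu_memory, get_model_size

# Query the Hub for the model's parameter count and estimate memory at float16.
size_fp16 = get_model_size("Qwen/Qwen2.5-7B-Instruct", dtype="float16")
print(f"float16: {size_fp16} GB")

# The pure calculation needs no network call, e.g. a hypothetical 7B model at int4.
print(f"int4, 7B parameters: {calculate_gpu_memory(7, 0.5)} GB")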
Hi, it seems like the example calculations in the calculate_gpu_memory docstring are off: I was a bit confused when I ran the numbers manually to verify the logic.
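Running the numbers with the function as written shows the discrepancy: the expression (parameters * 4) / (32 / (bytes * 8)) simplifies to parameters * bytes, so the estimate is just parameters * bytes_per_parameter * 1.18, and the two docstring examples evaluate to 16.52 and 15.34 rather than 13.72 and 12.74. A quick check against the code above:

# Re-running the docstring examples against the implementation:
print(calculate_gpu_memory(7, bytes_per_dtype["float16"]))  # 16.52, not 13.72
print(calculate_gpu_memory(13, bytes_per_dtype["int8"]))     # 15.34, not 12.74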