@secemp9 · Created December 29, 2024
def calculate_model_size(total_parameters, precision="fp32"):
    """
    Calculate the model size in GB given the total number of parameters.

    Parameters:
        total_parameters (int): Total number of parameters in the model.
        precision (str): Precision type ("fp32" or "fp16").

    Returns:
        float: Model size in GB.
    """
    bytes_per_parameter = 4 if precision == "fp32" else 2  # 4 bytes for FP32, 2 bytes for FP16
    model_size_gb = (total_parameters * bytes_per_parameter) / (1024 ** 3)
    return model_size_gb
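
# Quick sanity check of the size formula: 1e9 parameters in fp16 occupy
# 1e9 * 2 bytes / 1024**3 ≈ 1.86 GB, and fp32 doubles that to ≈ 3.73 GB.
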
def calculate_batch_size(memory_per_gpu, total_parameters, activation_size, base_lr,
                         base_batch_size, num_nodes, gpus_per_node, precision="fp32"):
    """
    Calculate the effective batch size and scaled learning rate for distributed training.

    Parameters:
        memory_per_gpu (float): Total memory available per GPU (in GB).
        total_parameters (int): Total number of parameters in the model.
        activation_size (float): Activation memory per sample (in GB).
        base_lr (float): Base learning rate for a reference batch size.
        base_batch_size (int): Base batch size for the reference learning rate.
        num_nodes (int): Number of nodes in the cluster.
        gpus_per_node (int): Number of GPUs per node.
        precision (str): Precision type ("fp32" or "fp16").

    Returns:
        dict: A dictionary with model size, local batch size, effective batch size,
        and scaled learning rate.
    """
    # Model size (in GB) from the parameter count and precision
    model_size = calculate_model_size(total_parameters, precision)

    # All quantities are already in GB, so no unit conversion is needed.
    # The weights are resident once per GPU; whatever memory they leave over
    # is filled with per-sample activations, which bounds the local batch size.
    local_batch_size = int((memory_per_gpu - model_size) / activation_size)

    # Total number of GPUs in the cluster
    total_gpus = num_nodes * gpus_per_node

    # Effective batch size across all data-parallel GPUs
    effective_batch_size = local_batch_size * total_gpus

    # Scale the learning rate linearly with the effective batch size
    scaled_lr = base_lr * (effective_batch_size / base_batch_size)
    return {
        "Model Size (GB)": model_size,
        "Local Batch Size": local_batch_size,
        "Effective Batch Size": effective_batch_size,
        "Scaled Learning Rate": scaled_lr,
    }
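
# Worked example with the inputs used below: a 1e9-parameter fp16 model
# (~1.86 GB of weights) on an 80 GB GPU leaves ~78.1 GB for activations, so at
# 0.1 GB per sample the local batch size is 781. Across 4 nodes x 8 GPUs the
# effective batch is 781 * 32 = 24992, and linear scaling gives
# lr = 0.001 * 24992 / 64 ≈ 0.39.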

# Example Usage
if __name__ == "__main__":
    # Define inputs
    memory_per_gpu = 80                # H100 memory in GB
    total_parameters = 1_000_000_000   # 1 billion parameters (e.g., a large Transformer)
    activation_size = 0.1              # Activation memory per sample in GB
    base_lr = 0.001                    # Base learning rate for the reference batch size
    base_batch_size = 64               # Reference batch size
    num_nodes = 4                      # Number of nodes
    gpus_per_node = 8                  # GPUs per node
    precision = "fp16"                 # Mixed precision training (FP16)

    # Calculate
    results = calculate_batch_size(
        memory_per_gpu, total_parameters, activation_size, base_lr,
        base_batch_size, num_nodes, gpus_per_node, precision
    )

    # Print results
    print("Calculation Results:")
    for key, value in results.items():
        print(f"{key}: {value}")
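
    # For comparison, rerun the same cluster at fp32: the doubled weight
    # footprint shrinks the local batch size and, with it, the scaled
    # learning rate produced by the linear scaling rule.
    results_fp32 = calculate_batch_size(
        memory_per_gpu, total_parameters, activation_size, base_lr,
        base_batch_size, num_nodes, gpus_per_node, precision="fp32"
    )
    print("\nFP32 comparison:")
    for key, value in results_fp32.items():
        print(f"{key}: {value}")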