def calculate_model_size(total_parameters, precision="fp32"):
    """
    Calculate the model size in GB given the total number of parameters.

    Parameters:
        total_parameters (int): Total number of parameters in the model.
        precision (str): Precision type ("fp32" or "fp16").

    Returns:
        float: Model size in GB.
    """
    bytes_per_parameter = 4 if precision == "fp32" else 2  # 4 bytes for FP32, 2 bytes for FP16
    model_size_gb = (total_parameters * bytes_per_parameter) / (1024 ** 3)
    return model_size_gb
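

# Note: calculate_model_size counts weights only. During training, gradients
# and optimizer state also occupy GPU memory; a common rule of thumb for
# mixed-precision Adam is ~16 bytes per parameter (fp16 weights + fp16
# gradients + fp32 master weights + two fp32 Adam moments). A minimal sketch,
# with the 16-byte multiplier as an assumption to adjust for your setup:
def estimate_training_memory(total_parameters, bytes_per_parameter=16):
    """Rough training-time memory footprint in GB (weights + grads + optimizer state)."""
    return (total_parameters * bytes_per_parameter) / (1024 ** 3)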


def calculate_batch_size(memory_per_gpu, total_parameters, activation_size, base_lr,
                         base_batch_size, num_nodes, gpus_per_node, precision="fp32"):
    """
    Calculate the effective batch size and scaled learning rate for distributed training.

    Parameters:
        memory_per_gpu (float): Total memory available per GPU (in GB).
        total_parameters (int): Total number of parameters in the model.
        activation_size (float): Activation memory per sample (in GB).
        base_lr (float): Base learning rate for a reference batch size.
        base_batch_size (int): Base batch size for the reference learning rate.
        num_nodes (int): Number of nodes in the cluster.
        gpus_per_node (int): Number of GPUs per node.
        precision (str): Precision type ("fp32" or "fp16").

    Returns:
        dict: A dictionary with model size, local batch size, effective batch size,
        and scaled learning rate.
    """
    # Model size (weights only) in GB; with data parallelism the full replica
    # lives on every GPU once, not once per sample
    model_size = calculate_model_size(total_parameters, precision)

    # Memory left for activations after the weights are loaded
    available_memory_gb = memory_per_gpu - model_size

    # Maximum batch size per GPU: each sample costs activation_size GB.
    # This is a rough estimate; gradients and optimizer states are ignored.
    local_batch_size = max(int(available_memory_gb / activation_size), 0)
    # Total number of GPUs in the cluster
    total_gpus = num_nodes * gpus_per_node

    # Effective batch size across all GPUs (pure data parallelism assumed)
    effective_batch_size = local_batch_size * total_gpus

    # Linear scaling rule: scale the learning rate in proportion to the batch size
    scaled_lr = base_lr * (effective_batch_size / base_batch_size)

    return {
        "Model Size (GB)": model_size,
        "Local Batch Size": local_batch_size,
        "Effective Batch Size": effective_batch_size,
        "Scaled Learning Rate": scaled_lr,
    }
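

# The linear scaling rule above is usually paired with a warmup phase
# (Goyal et al., 2017, "Accurate, Large Minibatch SGD"). A minimal sketch of
# a linear warmup schedule; the function name and the schedule shape are
# assumptions, not something the gist itself defines:
def warmup_lr(step, warmup_steps, base_lr, target_lr):
    """Ramp the learning rate linearly from base_lr to target_lr over warmup_steps."""
    if step >= warmup_steps:
        return target_lr
    return base_lr + (target_lr - base_lr) * (step / warmup_steps)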


# Example Usage
if __name__ == "__main__":
    # Define inputs
    memory_per_gpu = 80                # H100 memory in GB
    total_parameters = 1_000_000_000   # 1 billion parameters (e.g., a large Transformer)
    activation_size = 0.1              # Activation memory per sample in GB
    base_lr = 0.001                    # Base learning rate for the reference batch size
    base_batch_size = 64               # Reference batch size
    num_nodes = 4                      # Number of nodes
    gpus_per_node = 8                  # GPUs per node
    precision = "fp16"                 # Mixed-precision training (FP16)

    # Calculate
    results = calculate_batch_size(
        memory_per_gpu, total_parameters, activation_size, base_lr,
        base_batch_size, num_nodes, gpus_per_node, precision
    )

    # Print results
    print("Calculation Results:")
    for key, value in results.items():
        print(f"{key}: {value}")