def calculate_model_size(total_parameters, precision="fp32"):
    """
    Calculate the model size in GB given the total number of parameters.

    Parameters:
        total_parameters (int): Total number of parameters in the model.
        precision (str): Precision type ("fp32" or "fp16").

    Returns:
        float: Model size in GB.
    """
    bytes_per_parameter = 4 if precision == "fp32" else 2  # 4 bytes for FP32, 2 bytes for FP16
    model_size_gb = (total_parameters * bytes_per_parameter) / (1024 ** 3)
    return model_size_gb
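

# Note: calculate_model_size counts weights only. During training, gradients
# and optimizer state also occupy GPU memory; a common rule of thumb for
# mixed-precision Adam is ~16 bytes per parameter (fp16 weights + fp16
# gradients + fp32 master weights + two fp32 Adam moments). A minimal sketch,
# with the 16-byte multiplier as an assumption to adjust for your setup:
def estimate_training_memory(total_parameters, bytes_per_parameter=16):
    """Rough training-time memory footprint in GB (weights + grads + optimizer state)."""
    return (total_parameters * bytes_per_parameter) / (1024 ** 3)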


def calculate_batch_size(memory_per_gpu, total_parameters, activation_size, base_lr,
                         base_batch_size, num_nodes, gpus_per_node, precision="fp32"):
    """
    Calculate the effective batch size and scaled learning rate for distributed training.

    Parameters:
        memory_per_gpu (float): Total memory available per GPU (in GB).
        total_parameters (int): Total number of parameters in the model.
        activation_size (float): Activation memory per sample (in GB).
        base_lr (float): Base learning rate for a reference batch size.
        base_batch_size (int): Base batch size for the reference learning rate.
        num_nodes (int): Number of nodes in the cluster.
        gpus_per_node (int): Number of GPUs per node.
        precision (str): Precision type ("fp32" or "fp16").

    Returns:
        dict: A dictionary with model size, local batch size, effective batch size,
        and scaled learning rate.
    """
    # Model size (weights only) in GB; with data parallelism the full replica
    # lives on every GPU once, not once per sample
    model_size = calculate_model_size(total_parameters, precision)

    # Memory left for activations after the weights are loaded
    available_memory_gb = memory_per_gpu - model_size

    # Maximum batch size per GPU: each sample costs activation_size GB.
    # This is a rough estimate; gradients and optimizer states are ignored.
    local_batch_size = max(int(available_memory_gb / activation_size), 0)
    # Total number of GPUs in the cluster
    total_gpus = num_nodes * gpus_per_node

    # Effective batch size across all GPUs (pure data parallelism assumed)
    effective_batch_size = local_batch_size * total_gpus

    # Linear scaling rule: scale the learning rate in proportion to the batch size
    scaled_lr = base_lr * (effective_batch_size / base_batch_size)

    return {
        "Model Size (GB)": model_size,
        "Local Batch Size": local_batch_size,
        "Effective Batch Size": effective_batch_size,
        "Scaled Learning Rate": scaled_lr,
    }
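

# The linear scaling rule above is usually paired with a warmup phase
# (Goyal et al., 2017, "Accurate, Large Minibatch SGD"). A minimal sketch of
# a linear warmup schedule; the function name and the schedule shape are
# assumptions, not something the gist itself defines:
def warmup_lr(step, warmup_steps, base_lr, target_lr):
    """Ramp the learning rate linearly from base_lr to target_lr over warmup_steps."""
    if step >= warmup_steps:
        return target_lr
    return base_lr + (target_lr - base_lr) * (step / warmup_steps)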


# Example Usage
if __name__ == "__main__":
    # Define inputs
    memory_per_gpu = 80                # H100 memory in GB
    total_parameters = 1_000_000_000   # 1 billion parameters (e.g., a large Transformer)
    activation_size = 0.1              # Activation memory per sample in GB
    base_lr = 0.001                    # Base learning rate for the reference batch size
    base_batch_size = 64               # Reference batch size
    num_nodes = 4                      # Number of nodes
    gpus_per_node = 8                  # GPUs per node
    precision = "fp16"                 # Mixed-precision training (FP16)

    # Calculate
    results = calculate_batch_size(
        memory_per_gpu, total_parameters, activation_size, base_lr,
        base_batch_size, num_nodes, gpus_per_node, precision
    )

    # Print results
    print("Calculation Results:")
    for key, value in results.items():
        print(f"{key}: {value}")