Quentin Anthony Quentin-Anthony

ZeRO Stage	Data-Parallel	MP	PP	MP+PP	MoE	MoE+MP
1	✓	✓	✓	✓	✓	✓
2	✓	✓	N/A	N/A	✓	✓
3	✓	✓	N/A	N/A	N/A	N/A

	import argparse
	import math

	# Helper function to pretty-print parameter counts (e.g. K, M, B, T suffixes)
	def convert_params(params):
	    """Return *params* as a short human-readable string with a count suffix.

	    The value is scaled by the largest power of 1000 that does not exceed
	    its magnitude, rounded to two decimals, and followed by a space and the
	    matching suffix ("", K, M, B, T, P, E, Z, Y).

	    Args:
	        params: A number (int or float), typically a parameter count.

	    Returns:
	        "0" when ``params`` is zero; otherwise ``"<scaled value> <suffix>"``,
	        e.g. ``"1.5 K"``.  Values below 1000 get the empty suffix, so the
	        string ends with the separating space (``"500.0 "``).
	    """
	    if params == 0:
	        return "0"
	    size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y")
	    magnitude = abs(params)
	    # Find the suffix index by comparing against exact integer powers of
	    # 1000 rather than floor(log(params, 1000)): the float logarithm can
	    # land just under an integer at exact powers, goes negative for values
	    # below 1, and is unbounded above.  The loop also clamps to the last
	    # available suffix.
	    i = 0
	    while i < len(size_name) - 1 and magnitude >= 1000 ** (i + 1):
	        i += 1
	    # True division so the scaled value is always a float, then two decimals.
	    s = round(params / 1000 ** i, 2)
	    return f"{s} {size_name[i]}"

	#!/bin/bash
	# Slurm batch-script header. Each "#SBATCH" line below is a directive that
	# sbatch reads at submission time, so the directive text must stay verbatim;
	# explanatory comments are kept on their own lines.
	#
	# Submit to the partition named "gpu".
	#SBATCH --partition=gpu
	# Job name; it is also what %x expands to in the --output pattern below.
	#SBATCH --job-name=gputest
	# Request a single node.
	#SBATCH --nodes 1
	# Launch 8 tasks on that node (matches the 8 GPUs requested via --gres).
	#SBATCH --ntasks-per-node 8
	# Allocate 6 CPUs for every GPU in the job.
	#SBATCH --cpus-per-gpu=6
	# Request 8 GPUs as a generic resource.
	#SBATCH --gres=gpu:8
	# NOTE(review): the job is pinned to one specific host by name; it will wait
	# for that node rather than run elsewhere -- confirm this is intended.
	#SBATCH --nodelist gpu-st-p4d-24xlarge-42
	# Output file is named <job-name>_<job-id>.out (%x = job name, %j = job ID).
	#SBATCH --output=%x_%j.out
	# Append to an existing output file instead of truncating it.
	#SBATCH --open-mode=append