ZeRO Stage | Data-Parallel | MP | PP | MP+PP | MoE | MoE+MP |
---|---|---|---|---|---|---|
1 | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
2 | ✓ | ✓ | N/A | N/A | ✓ | ✓ |
3 | ✓ | ✓ | N/A | N/A | N/A | N/A |
#!/bin/bash | |
#SBATCH --partition=gpu | |
#SBATCH --job-name=gputest | |
#SBATCH --nodes 1 | |
#SBATCH --ntasks-per-node 8 | |
#SBATCH --cpus-per-gpu=6 | |
#SBATCH --gres=gpu:8 | |
#SBATCH --nodelist gpu-st-p4d-24xlarge-42 | |
#SBATCH --output=%x_%j.out | |
#SBATCH --open-mode=append |
import argparse | |
import math | |
# Helper function to pretty-print parameter counts (e.g. K, M, B, T suffixes) | |
def convert_params(params): | |
if params == 0: | |
return "0" | |
size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y") | |
i = int(math.floor(math.log(params, 1000))) | |
p = math.pow(1000, i) |
Thank you for your interest in contributing to open source software projects (“Projects”) made available by the Network-Based Computing Laboratory (NBCL) or its affiliates (“NBCL”). This Individual Contributor License Agreement (“Agreement”) sets out the terms governing any source code, object code, bug fixes, configuration changes, tools, specifications, documentation, data, materials, feedback, information or other works of authorship that you submit or have submitted, in any form and in any manner, to NBCL in respect of any of the Projects (collectively “Contributions”). If you have any questions respecting this Agreement, please contact [email protected].
You agree that the following terms apply to all of your past, present and future Contributions. Except for the licenses granted in this Agreement, you retain all of your right, title and interest in and to your Contributions.
Copyright License. You hereby grant, and agree to grant, to NB
import torch | |
from safetensors.torch import save_file, load_file | |
import numpy as np | |
import argparse | |
import os | |
import time | |
if __name__ == "__main__": | |
parser = argparse.ArgumentParser() | |
parser.add_argument("--no-save", action="store_false", help="disables saving initial tensors") |
import argparse | |
import math | |
# Helper function to pretty-print parameter counts (e.g. K, M, B, T suffixes) | |
def convert_params(params): | |
if params == 0: | |
return "0" | |
size_name = ("", "K", "M", "B", "T", "P", "E", "Z", "Y") | |
i = int(math.floor(math.log(params, 1000))) | |
p = math.pow(1000, i) |
import argparse | |
import math | |
# Helper function to pretty-print FLOP counts (e.g. KFLOPs, MFLOPs, GFLOPs, TFLOPs) | |
def convert_flops(params): | |
if params == 0: | |
return "0" | |
size_name = ("", "KFLOPs", "MFLOPs", "GFLOPs", "TFLOPs", "PFLOPs", "EFLOPs", "ZFLOPs", "YFLOPs") | |
i = int(math.floor(math.log(params, 1000))) | |
p = math.pow(1000, i) |
#!/bin/bash | |
# set tokenizer | |
TOKENIZER_TYPE=<TODO> | |
TOKENIZER_MODEL=<TODO> | |
# set up distributed | |
GPUS_PER_NODE=<TODO> | |
NNODES=<TODO> | |
export MASTER_ADDR=localhost #ONLY FOR SINGLE-NODE. CHANGE FOR MULTINODE. |