Template Megatron-LM config for a 70B model
#!/bin/bash

# set tokenizer
TOKENIZER_TYPE=<TODO>
TOKENIZER_MODEL=<TODO>

# set up distributed
GPUS_PER_NODE=<TODO>
NNODES=<TODO>
export MASTER_ADDR=localhost # ONLY FOR SINGLE-NODE. CHANGE FOR MULTI-NODE.
export MASTER_PORT=6000
NODE_RANK=0 # ONLY FOR SINGLE-NODE. CHANGE FOR MULTI-NODE.
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
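
# Example multi-node setup (ASSUMPTION: a SLURM cluster, launching this script
# once per node, e.g. `srun --nodes=$NNODES --ntasks-per-node=1 bash <this script>`).
# Uncomment and adapt for your scheduler; the only requirements are that every
# node agrees on MASTER_ADDR/MASTER_PORT and gets a unique NODE_RANK in [0, NNODES).
# export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# export MASTER_PORT=6000
# NODE_RANK=$SLURM_NODEID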

CHECKPOINT_PATH=<TODO>
DATA_PATH=<TODO>
DATA_CACHE_PATH=<TODO>
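
# Illustrative placeholder values only (names and paths below are hypothetical):
#   CHECKPOINT_PATH=/checkpoints/70b
#   DATA_PATH=/data/my_corpus_text_document   # prefix of the .bin/.idx pair produced by tools/preprocess_data.py
#   TOKENIZER_TYPE=SentencePieceTokenizer     # TOKENIZER_MODEL then points at the .model file;
#                                             # see the --tokenizer-type choices in your Megatron-LM version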

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

# Notes on the GPT_ARGS flags below:
#   --tensor-model-parallel-size: don't go above $GPUS_PER_NODE
#   --pipeline-model-parallel-size: only increase until the model fits in VRAM with mbs=1
#   --sequence-parallel: slightly more intra-node comms, slightly less VRAM. I tend to find it worth it.
#   --use-mcore-models: sometimes regresses compared to legacy? Worth toggling to check throughput.
#   --micro-batch-size: increase to fill VRAM once PP, TP, and SP are decided
#   --global-batch-size: sweep to find the max batch size that still converges. I like 2M-8M tokens.
#     (See the commented-out sanity checks at the bottom of this file for the arithmetic.)
#   --accumulate-allreduce-grads-in-fp32: slight perf hit, but improves stability. Not totally sure when exactly this is necessary.
#   --use-distributed-optimizer: ZeRO-1, i.e. shards only the optimizer state (params and grads stay replicated)
#   --overlap-param-gather: overlaps the all-gather for the distributed optimizer
#   --overlap-param-gather-with-optimizer-step: overlaps that all-gather with the optimizer step
#   --tp-comm-overlap: overlaps TP comms with GEMMs. I haven't tested this, but it's worth a check.
GPT_ARGS="
    --tensor-model-parallel-size <TODO> \
    --pipeline-model-parallel-size <TODO> \
    --sequence-parallel \
    --use-mcore-models \
    --num-layers 80 \
    --hidden-size 8192 \
    --num-attention-heads 64 \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --micro-batch-size 1 \
    --global-batch-size 32 \
    --train-iters <TODO> \
    --lr-decay-iters <TODO> \
    --lr-decay-style cosine \
    --lr <TODO> \
    --min-lr <TODO> \
    --weight-decay 0.1 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --bf16 \
    --recompute-granularity selective \
    --use-flash-attn \
    --accumulate-allreduce-grads-in-fp32 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --position-embedding-type rope \
    --rotary-percent 0.25 \
    --adam-beta2 0.95 \
    --untie-embeddings-and-output-weights \
    --transformer-impl transformer_engine \
    --use-distributed-optimizer \
    --overlap-param-gather \
    --overlap-param-gather-with-optimizer-step \
    --tp-comm-overlap \
"

# Notes on the DATA_ARGS flags below:
#   --num-workers: increase depending on your CPU
#   --tokenizer-model: wired in so the TOKENIZER_MODEL set at the top is actually used
#     (needed for sentencepiece/HF-style tokenizers; drop it if your tokenizer type doesn't take a model file)
DATA_ARGS="
    --data-path $DATA_PATH \
    --data-cache-path $DATA_CACHE_PATH \
    --split 1,0,0 \
    --num-workers 1 \
    --tokenizer-type $TOKENIZER_TYPE \
    --tokenizer-model $TOKENIZER_MODEL \
"

OUTPUT_ARGS="
    --log-interval 1 \
    --log-throughput \
    --save-interval <TODO> \
    --eval-interval <TODO> \
"

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
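
# Optional pre-launch sanity checks (commented out; the values here are
# placeholders that should mirror whatever you plug into GPT_ARGS above):
#
#   TP_SIZE=<TODO>; PP_SIZE=<TODO>; GBS=<TODO>; SEQ_LEN=4096
#
#   # WORLD_SIZE must be divisible by TP*PP; the quotient is the data-parallel size.
#   echo "Data-parallel size: $((WORLD_SIZE / (TP_SIZE * PP_SIZE)))"
#
#   # Tokens per step = global batch size * sequence length. Hitting the 2M-8M
#   # token target at seq-length 4096 means a global batch of roughly 512-2048.
#   echo "Tokens per step: $((GBS * SEQ_LEN))"
#
#   # Rough parameter count for the dims above, ignoring embeddings and assuming
#   # the default 4*hidden FFN: 12 * 80 * 8192^2 ~= 64.4B, i.e. a 70B-class model
#   # once embeddings and any FFN/attention variants are counted.
#   echo "Approx transformer params: $((12 * 80 * 8192 * 8192))"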