Template Megatron-LM config for a 70B model
#!/bin/bash

# set tokenizer
TOKENIZER_TYPE=<TODO>
TOKENIZER_MODEL=<TODO>

# set up distributed
GPUS_PER_NODE=<TODO>
NNODES=<TODO>
export MASTER_ADDR=localhost # ONLY FOR SINGLE-NODE. CHANGE FOR MULTI-NODE.
export MASTER_PORT=6000
NODE_RANK=0 # ONLY FOR SINGLE-NODE. CHANGE FOR MULTI-NODE.
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
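
# Example multi-node setup (ASSUMPTION: a SLURM cluster, launching this script
# once per node, e.g. `srun --nodes=$NNODES --ntasks-per-node=1 bash <this script>`).
# Uncomment and adapt for your scheduler; the only requirements are that every
# node agrees on MASTER_ADDR/MASTER_PORT and gets a unique NODE_RANK in [0, NNODES).
# export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
# export MASTER_PORT=6000
# NODE_RANK=$SLURM_NODEID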

CHECKPOINT_PATH=<TODO>
DATA_PATH=<TODO>
DATA_CACHE_PATH=<TODO>
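
# Illustrative placeholder values only (names and paths below are hypothetical):
#   CHECKPOINT_PATH=/checkpoints/70b
#   DATA_PATH=/data/my_corpus_text_document   # prefix of the .bin/.idx pair produced by tools/preprocess_data.py
#   TOKENIZER_TYPE=SentencePieceTokenizer     # TOKENIZER_MODEL then points at the .model file;
#                                             # see the --tokenizer-type choices in your Megatron-LM version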

DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NNODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

# Notes on the GPT_ARGS flags below:
#   --tensor-model-parallel-size: don't go above $GPUS_PER_NODE
#   --pipeline-model-parallel-size: only increase until the model fits in VRAM with mbs=1
#   --sequence-parallel: slightly more intra-node comms, slightly less VRAM. I tend to find it worth it.
#   --use-mcore-models: sometimes regresses compared to legacy? Worth toggling to check throughput.
#   --micro-batch-size: increase to fill VRAM once PP, TP, and SP are decided
#   --global-batch-size: sweep to find the max batch size that still converges. I like 2M-8M tokens.
#     (See the commented-out sanity checks at the bottom of this file for the arithmetic.)
#   --accumulate-allreduce-grads-in-fp32: slight perf hit, but improves stability. Not totally sure when exactly this is necessary.
#   --use-distributed-optimizer: ZeRO-1, i.e. shards only the optimizer state (params and grads stay replicated)
#   --overlap-param-gather: overlaps the all-gather for the distributed optimizer
#   --overlap-param-gather-with-optimizer-step: overlaps that all-gather with the optimizer step
#   --tp-comm-overlap: overlaps TP comms with GEMMs. I haven't tested this, but it's worth a check.
GPT_ARGS="
    --tensor-model-parallel-size <TODO> \
    --pipeline-model-parallel-size <TODO> \
    --sequence-parallel \
    --use-mcore-models \
    --num-layers 80 \
    --hidden-size 8192 \
    --num-attention-heads 64 \
    --seq-length 4096 \
    --max-position-embeddings 4096 \
    --micro-batch-size 1 \
    --global-batch-size 32 \
    --train-iters <TODO> \
    --lr-decay-iters <TODO> \
    --lr-decay-style cosine \
    --lr <TODO> \
    --min-lr <TODO> \
    --weight-decay 0.1 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --bf16 \
    --recompute-granularity selective \
    --use-flash-attn \
    --accumulate-allreduce-grads-in-fp32 \
    --attention-dropout 0.0 \
    --hidden-dropout 0.0 \
    --position-embedding-type rope \
    --rotary-percent 0.25 \
    --adam-beta2 0.95 \
    --untie-embeddings-and-output-weights \
    --transformer-impl transformer_engine \
    --use-distributed-optimizer \
    --overlap-param-gather \
    --overlap-param-gather-with-optimizer-step \
    --tp-comm-overlap \
"

# Notes on the DATA_ARGS flags below:
#   --num-workers: increase depending on your CPU
#   --tokenizer-model: wired in so the TOKENIZER_MODEL set at the top is actually used
#     (needed for sentencepiece/HF-style tokenizers; drop it if your tokenizer type doesn't take a model file)
DATA_ARGS="
    --data-path $DATA_PATH \
    --data-cache-path $DATA_CACHE_PATH \
    --split 1,0,0 \
    --num-workers 1 \
    --tokenizer-type $TOKENIZER_TYPE \
    --tokenizer-model $TOKENIZER_MODEL \
"

OUTPUT_ARGS="
    --log-interval 1 \
    --log-throughput \
    --save-interval <TODO> \
    --eval-interval <TODO> \
"

torchrun $DISTRIBUTED_ARGS pretrain_gpt.py \
    $GPT_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH
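
# Optional pre-launch sanity checks (commented out; the values here are
# placeholders that should mirror whatever you plug into GPT_ARGS above):
#
#   TP_SIZE=<TODO>; PP_SIZE=<TODO>; GBS=<TODO>; SEQ_LEN=4096
#
#   # WORLD_SIZE must be divisible by TP*PP; the quotient is the data-parallel size.
#   echo "Data-parallel size: $((WORLD_SIZE / (TP_SIZE * PP_SIZE)))"
#
#   # Tokens per step = global batch size * sequence length. Hitting the 2M-8M
#   # token target at seq-length 4096 means a global batch of roughly 512-2048.
#   echo "Tokens per step: $((GBS * SEQ_LEN))"
#
#   # Rough parameter count for the dims above, ignoring embeddings and assuming
#   # the default 4*hidden FFN: 12 * 80 * 8192^2 ~= 64.4B, i.e. a 70B-class model
#   # once embeddings and any FFN/attention variants are counted.
#   echo "Approx transformer params: $((12 * 80 * 8192 * 8192))"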