Sbatch example
#!/bin/bash
#SBATCH --job-name=oxw-bloom-1b7-twc-german
#SBATCH --ntasks-per-node=1   # crucial - only 1 task per node; torch.distributed.run spawns the per-GPU processes
#SBATCH --nodes=4
#SBATCH --gres=gpu:4          # ---> does not matter on JUWELS
#SBATCH --cpus-per-task=48    # number of cores per task
#SBATCH --hint=nomultithread  # we get physical cores not logical
#SBATCH --time=0-12:00:00
#SBATCH --output=%j.%x.out
#SBATCH --partition=booster
# Use `develbooster` for debugging, `booster` for "normal" jobs, and
# `largebooster` for jobs on more than 256 nodes.
# send signal 4 mins before the time limit
#SBATCH --signal=B:INT@240
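# A hedged sketch of the shutdown path this enables: ~4 minutes before the
# time limit, Slurm sends SIGINT to the batch shell only (the B: prefix), the
# trap near the end of this script touches the kill switch, and training
# exits at its next kill-switch check. The same path can be exercised by hand
# on a running job (hypothetical job id):
#   scancel --signal=INT --batch 1234567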
# Enable logging
#set -x -e
echo "START TIME: $(date)"
# copy this batch script into the log directory for reproducibility (TODO this only works when no extra sbatch args are defined)
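# Assumption: `slurmLog/` may not exist on a fresh checkout; create it so the
# copy below cannot silently fail.
mkdir -p slurmLog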
if [ -e "$0" ]; then | |
cp -p "$0" "slurmLog/${SLURM_JOB_ID}.${SLURM_JOB_NAME}.sh" | |
else | |
echo 'Please execute the sbatch script from the `run_scripts` directory.' | |
exit 1 | |
fi | |
if [[ -z "$EXP_DIR" ]] || [[ -z "$BASE_DIR" ]]; then | |
echo "Environment variables (BASE_DIR, EXP_DIR...) are not set!" 1>&2 | |
exit 1 | |
fi | |
# load experiment env | |
source ${BASE_DIR}/slurm/bloom-1b7-twc-german/exp.sh || exit 1 | |
if ["$EXP_NAME" != "bloom-1b7-twc-german"]; then | |
echo "Invalid EXP_NAME" | |
exit 1 | |
fi | |
# load juwels env
if ! [ -e activate.bash ]; then
    echo 'Please execute the sbatch script from the `run_scripts` directory.'
    exit 1
fi
source activate.bash || exit 1
# Hardware settings
GPUS_PER_NODE=4
NNODES=$SLURM_JOB_NUM_NODES
# Paths
BIGS_WORKING_DIR=$EXP_DIR/tr1
DATA_OUTPUT_PATH="$BIGS_WORKING_DIR"/output_dir/job$SLURM_JOBID
CHECKPOINT_PATH=$BIGS_WORKING_DIR/checkpoints
##CHECKPOINT_PATH=$EXP_DIR/tr1/checkpoints # for debugging: load old stage 2020 checkpoint
TENSORBOARD_PATH=$BIGS_WORKING_DIR/tensorboard
CODECARBON_PATH=$BIGS_WORKING_DIR/codecarbon
LOGS_PATH=$BIGS_WORKING_DIR/logs
KILL_SWITCH_PATH=$BIGS_WORKING_DIR/kill-switch$SLURM_JOBID
mkdir -p "$LOGS_PATH"
### the following is mostly copied from
# https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/smaller_models/tr11b-1B3-ml.slurm
# (slurm script corresponds to https://huggingface.co/bigscience/bloom-1b7)
PP_SIZE=1
TP_SIZE=1
MICRO_BATCH_SIZE=32 # TODO increase? MICRO_BATCH_SIZE=2 works; MICRO_BATCH_SIZE=3 used to work with the old setup
##GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE))
#GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
GAS=1
GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE * GAS))
# goal: 512 = same as gpt2-xl
echo "MICRO_BATCH_SIZE=$MICRO_BATCH_SIZE"
echo "GLOBAL_BATCH_SIZE=$GLOBAL_BATCH_SIZE"
# Model settings
NLAYERS=24
NHIDDEN=2048
NHEADS=16
MAX_POS_EMBEDDING=2048
SEQ_LEN=512 #$MAX_POS_EMBEDDING
## original script
# GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
# NLAYERS=24
# NHIDDEN=1024
# NHEADS=16
# SEQ_LEN=2048
SAVE_INTERVAL=6250 # (WECHSEL = 12500)
LOG_INTERVAL=10
EVAL_INTERVAL=1250 #2500 # (WECHSEL = 12500)
EVAL_ITERS=100 # depends on batch size
#SAVE_INTERVAL=1 # (WECHSEL = 12500)
#LOG_INTERVAL=1
#EVAL_INTERVAL=1 # (WECHSEL = 12500)
#EVAL_ITERS=-1 # depends on batch size
#TRAIN_SAMPLES=220_000_000 # 450B tokens ## 1.0
#LR_DECAY_SAMPLES=200_000_000 # Decay for the first 410B tokens then continue at fixed --min-lr ## 0.9111111111111111
#LR_WARMUP_SAMPLES=183_105 # 375M tokens ## 0.0008333333333333334
# Train on OSCAR-DE (same as GPT2-WECHSEL-German)
TRAIN_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/train_text_document
VALID_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/validation_text_document
TOKENIZER_NAME_OR_PATH=${EXP_DIR}
FROM_PRETRAINED=${EXP_DIR}
# total = 62234300 documents (split 998,1,1)
# 99.8% for training = 62_109_831
# 0.1% for validation
# 0.1% for test
TRAIN_SAMPLES=62_109_831
LR_DECAY_SAMPLES=55_898_848 # 0.9
LR_WARMUP_SAMPLES=51_758 # 0.0008333333333333334
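# Sanity check on the ratios noted above: 0.9 * 62_109_831 ≈ 55_898_848 decay
# samples, and 62_109_831 / 1200 ≈ 51_758 warmup samples (the 0.000833...
# fraction is 1/1200).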
# Network settings
MASTER_PORT=6000
MASTER_ADDR="$(hostname)"
command -v scontrol &> /dev/null && MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells. (JUWELS only)
if [ -d "$JUWELS_BASE_DIR" ]; then
    echo "JUWELS detected (${JUWELS_BASE_DIR} exists)"
    echo "Appending i to MASTER_ADDR=${MASTER_ADDR} ..."
    MASTER_ADDR="${MASTER_ADDR}i"
    echo "=> MASTER_ADDR=${MASTER_ADDR}"
fi
# NCCL related environment variables
# (from https://github.com/OpenGPTX/BigScience-Setup/blob/main/run_scripts/tr1-13B-round1_juwels_pipe.sbatch)
# do not remove: without this workaround the training will hang and nodes will be lost
#export CUDA_LAUNCH_BLOCKING=1
# hide duplicated errors using this hack - will be properly fixed in pt-1.12
export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast
#export NCCL_ASYNC_ERROR_HANDLING=1
# handle timeouts
export NCCL_IB_TIMEOUT=50
export UCX_RC_TIMEOUT=4s
export NCCL_IB_RETRY_CNT=10
# setting IB for out of band communication
export NCCL_SOCKET_IFNAME=ib0
# NCCL and Torch debug
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO
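# Optional sanity check (assumes the `ip` utility is available on the
# compute node): confirm the out-of-band interface named above exists.
#   ip -brief link show ib0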
# Change to code base directory
cd "$MEGATRON_DEEPSPEED_REPO"
# Log git status (oxw repo + obmd repo)
git -C ${BASE_DIR} branch -vv
git branch -vv
git remote -v
# Rebuild fused kernels
CLEAN_PREV_JIT_BUILD=0
rm -f megatron/fused_kernels/build/lock
((CLEAN_PREV_JIT_BUILD)) && rm -rf megatron/fused_kernels/{build,__pycache__}
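# `((CLEAN_PREV_JIT_BUILD))` is bash arithmetic evaluation: the rm -rf runs
# only when the flag is non-zero, so with 0 above only a stale build lock is
# removed and the previously JIT-compiled kernels are reused.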
OPTIMIZER_ARGS=" \ | |
--optimizer adam \ | |
--adam-beta1 0.9 \ | |
--adam-beta2 0.95 \ | |
--adam-eps 1e-8 \ | |
--lr 3.0e-4 \ | |
--min-lr 1e-5 \ | |
--lr-decay-style cosine \ | |
--lr-decay-samples $LR_DECAY_SAMPLES \ | |
--lr-warmup-samples $LR_WARMUP_SAMPLES \ | |
--clip-grad 1.0 \ | |
--weight-decay 1e-1 \ | |
" | |
# for 20h 1190, for 100h 5990 | |
# --exit-duration-in-mins 1190 \ | |
EXIT_OPTS=" \ | |
--exit-duration-in-mins 5990 \ | |
--kill-switch-path ${KILL_SWITCH_PATH} \ | |
" | |
# delete old kill switch | |
rm -f ${KILL_SWITCH_PATH} | |
# --pad-vocab-size-to 250880 \ | |
# --rampup-batch-size 192 32 9_765_625 \ | |
# --pp-partition-method 'type:transformer|embedding' \ | |
# | |
# | |
GPT_ARGS=" \ | |
--from-pretrained-hf $FROM_PRETRAINED \ | |
--pp-partition-method type:transformer|embedding \ | |
--num-layers $NLAYERS \ | |
--hidden-size $NHIDDEN \ | |
--num-attention-heads $NHEADS \ | |
--seq-length $SEQ_LEN \ | |
--max-position-embeddings $MAX_POS_EMBEDDING \ | |
--micro-batch-size $MICRO_BATCH_SIZE \ | |
--global-batch-size $GLOBAL_BATCH_SIZE \ | |
--train-samples $TRAIN_SAMPLES \ | |
--tokenizer-type PretrainedFromHF \ | |
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ | |
--init-method-std 0.0048 \ | |
--embed-layernorm \ | |
--checkpoint-activations \ | |
--bf16 \ | |
--seed 42 \ | |
--position-embedding-type alibi \ | |
--abort-on-unmet-fused-kernel-constraints \ | |
$OPTIMIZER_ARGS \ | |
$EXIT_OPTS \ | |
" | |
# TODO: decide on efficient eval-interval + eval-iters
OUTPUT_ARGS=" \
    --log-interval $LOG_INTERVAL \
    --save-interval $SAVE_INTERVAL \
    --eval-interval $EVAL_INTERVAL \
    --eval-iters $EVAL_ITERS \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "
ZERO_STAGE=0 # important: bf16 must use z0! it implements its own zero stage 1 equivalent
config_json="$BIGS_WORKING_DIR/ds_config.$SLURM_JOBID.json"
# DeepSpeed figures out GAS dynamically from the dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "bf16": {
    "enabled": true
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT
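# Optional: fail fast if variable expansion produced invalid JSON
# (assumes python3 is on PATH at this point).
python3 -m json.tool "$config_json" > /dev/null || exit 1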
DEEPSPEED_ARGS=" \ | |
--deepspeed \ | |
--deepspeed_config ${config_json} \ | |
--zero-stage ${ZERO_STAGE} \ | |
--deepspeed-activation-checkpointing \ | |
" | |
export LAUNCHER="python -u -m torch.distributed.run \ | |
--nproc_per_node $GPUS_PER_NODE \ | |
--nnodes $NNODES \ | |
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ | |
--rdzv_backend c10d \ | |
--max_restarts 0 \ | |
--tee 3 \ | |
" | |
export CMD=" \ | |
`pwd`/pretrain_gpt.py \ | |
--tensor-model-parallel-size $TP_SIZE \ | |
--pipeline-model-parallel-size $PP_SIZE \ | |
$GPT_ARGS \ | |
$OUTPUT_ARGS \ | |
--load $CHECKPOINT_PATH \ | |
--save $CHECKPOINT_PATH \ | |
--data-path $TRAIN_DATA_PATH \ | |
--split 998,1,1 \ | |
--data-impl mmap \ | |
--distributed-backend nccl \ | |
$DEEPSPEED_ARGS \ | |
" | |
echo $LAUNCHER
echo $CMD
# catch signals
SLEEP_BEFORE_KILL=180
trap 'echo "Signal received! Saving kill switch to $KILL_SWITCH_PATH and waiting for $SLEEP_BEFORE_KILL seconds"; touch $KILL_SWITCH_PATH; sleep $SLEEP_BEFORE_KILL; echo Done' USR1 SIGINT SIGTERM
(srun --jobid $SLURM_JOB_ID \
    bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 \
    | tee -a "$LOGS_PATH"/main_log.txt) & PID="$!"
wait "${PID}"
echo "END TIME: $(date)" |