Sbatch example
#!/bin/bash
#SBATCH --job-name=oxw-bloom-1b7-twc-german
#SBATCH --ntasks-per-node=1   # crucial - only 1 task per node; torch.distributed.run spawns the per-GPU processes
#SBATCH --nodes=4
#SBATCH --gres=gpu:4          # ---> does not matter on JUWELS
#SBATCH --cpus-per-task=48    # number of cores per task
#SBATCH --hint=nomultithread  # we get physical cores not logical
#SBATCH --time=0-12:00:00
#SBATCH --output=%j.%x.out
#SBATCH --partition=booster
# Use `develbooster` for debugging, `booster` for "normal" jobs, and
# `largebooster` for jobs on more than 256 nodes.
# send signal 4 mins before the time limit
#SBATCH --signal=B:INT@240
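# A hedged sketch of the shutdown path this enables: ~4 minutes before the
# time limit, Slurm sends SIGINT to the batch shell only (the B: prefix), the
# trap near the end of this script touches the kill switch, and training
# exits at its next kill-switch check. The same path can be exercised by hand
# on a running job (hypothetical job id):
#   scancel --signal=INT --batch 1234567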
# Enable logging
#set -x -e
echo "START TIME: $(date)"
# copy this batch script into the log directory for reproducibility (TODO this only works when no extra sbatch args are defined)
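# Assumption: `slurmLog/` may not exist on a fresh checkout; create it so the
# copy below cannot silently fail.
mkdir -p slurmLog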
if [ -e "$0" ]; then | |
cp -p "$0" "slurmLog/${SLURM_JOB_ID}.${SLURM_JOB_NAME}.sh" | |
else | |
echo 'Please execute the sbatch script from the `run_scripts` directory.' | |
exit 1 | |
fi | |
if [[ -z "$EXP_DIR" ]] || [[ -z "$BASE_DIR" ]]; then | |
echo "Environment variables (BASE_DIR, EXP_DIR...) are not set!" 1>&2 | |
exit 1 | |
fi | |
# load experiment env | |
source ${BASE_DIR}/slurm/bloom-1b7-twc-german/exp.sh || exit 1 | |
if ["$EXP_NAME" != "bloom-1b7-twc-german"]; then | |
echo "Invalid EXP_NAME" | |
exit 1 | |
fi | |
# load juwels env
if ! [ -e activate.bash ]; then
    echo 'Please execute the sbatch script from the `run_scripts` directory.'
    exit 1
fi
source activate.bash || exit 1
# Hardware settings
GPUS_PER_NODE=4
NNODES=$SLURM_JOB_NUM_NODES
# Paths
BIGS_WORKING_DIR=$EXP_DIR/tr1
DATA_OUTPUT_PATH="$BIGS_WORKING_DIR"/output_dir/job$SLURM_JOBID
CHECKPOINT_PATH=$BIGS_WORKING_DIR/checkpoints
##CHECKPOINT_PATH=$EXP_DIR/tr1/checkpoints # for debugging: load old stage 2020 checkpoint
TENSORBOARD_PATH=$BIGS_WORKING_DIR/tensorboard
CODECARBON_PATH=$BIGS_WORKING_DIR/codecarbon
LOGS_PATH=$BIGS_WORKING_DIR/logs
KILL_SWITCH_PATH=$BIGS_WORKING_DIR/kill-switch$SLURM_JOBID
mkdir -p "$LOGS_PATH"
### the following is mostly copied from
# https://github.com/bigscience-workshop/bigscience/blob/master/train/tr11-176B-ml/smaller_models/tr11b-1B3-ml.slurm
# (slurm script corresponds to https://huggingface.co/bigscience/bloom-1b7)
PP_SIZE=1
TP_SIZE=1
MICRO_BATCH_SIZE=32 # TODO increase? MICRO_BATCH_SIZE=2 works; MICRO_BATCH_SIZE=3 used to work with the old setup
##GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE))
#GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
GAS=1
GLOBAL_BATCH_SIZE=$(((NNODES * GPUS_PER_NODE / (PP_SIZE * TP_SIZE)) * MICRO_BATCH_SIZE * GAS))
# goal: 512 = same as gpt2-xl
echo "MICRO_BATCH_SIZE=$MICRO_BATCH_SIZE"
echo "GLOBAL_BATCH_SIZE=$GLOBAL_BATCH_SIZE"
# Model settings
NLAYERS=24
NHIDDEN=2048
NHEADS=16
MAX_POS_EMBEDDING=2048
SEQ_LEN=512 #$MAX_POS_EMBEDDING
## original script
# GLOBAL_BATCH_SIZE=240 # gradient accumulation steps = 5
# NLAYERS=24
# NHIDDEN=1024
# NHEADS=16
# SEQ_LEN=2048
SAVE_INTERVAL=6250 # (WECHSEL = 12500)
LOG_INTERVAL=10
EVAL_INTERVAL=1250 #2500 # (WECHSEL = 12500)
EVAL_ITERS=100 # depends on batch size
#SAVE_INTERVAL=1 # (WECHSEL = 12500)
#LOG_INTERVAL=1
#EVAL_INTERVAL=1 # (WECHSEL = 12500)
#EVAL_ITERS=-1 # depends on batch size
#TRAIN_SAMPLES=220_000_000 # 450B tokens ## 1.0
#LR_DECAY_SAMPLES=200_000_000 # Decay for the first 410B tokens then continue at fixed --min-lr ## 0.9111111111111111
#LR_WARMUP_SAMPLES=183_105 # 375M tokens ## 0.0008333333333333334
# Train on OSCAR-DE (same as GPT2-WECHSEL-German)
TRAIN_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/train_text_document
VALID_DATA_PATH=${BASE_DIR}/data/gpt2_oscar_unshuffled_deduplicated_de_without_4gb_valid/bigs/validation_text_document
TOKENIZER_NAME_OR_PATH=${EXP_DIR}
FROM_PRETRAINED=${EXP_DIR}
# total = 62234300 documents (split 998,1,1)
# 99.8% for training = 62_109_831
# 0.1% for validation
# 0.1% for test
TRAIN_SAMPLES=62_109_831
LR_DECAY_SAMPLES=55_898_848 # 0.9
LR_WARMUP_SAMPLES=51_758 # 0.0008333333333333334
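# Sanity check on the ratios noted above: 0.9 * 62_109_831 ≈ 55_898_848 decay
# samples, and 62_109_831 / 1200 ≈ 51_758 warmup samples (the 0.000833...
# fraction is 1/1200).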
# Network settings
MASTER_PORT=6000
MASTER_ADDR="$(hostname)"
command -v scontrol &> /dev/null && MASTER_ADDR="$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)"
# Allow communication over InfiniBand cells. (JUWELS only)
if [ -d "$JUWELS_BASE_DIR" ]; then
    echo "JUWELS detected (${JUWELS_BASE_DIR} exists)"
    echo "Appending i to MASTER_ADDR=${MASTER_ADDR} ..."
    MASTER_ADDR="${MASTER_ADDR}i"
    echo "=> MASTER_ADDR=${MASTER_ADDR}"
fi
# NCCL related environment variables
# (from https://github.com/OpenGPTX/BigScience-Setup/blob/main/run_scripts/tr1-13B-round1_juwels_pipe.sbatch)
# do not remove: without this workaround the training will hang and nodes will be lost
#export CUDA_LAUNCH_BLOCKING=1
# hide duplicated errors using this hack - will be properly fixed in pt-1.12
export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast
#export NCCL_ASYNC_ERROR_HANDLING=1
# handle timeouts
export NCCL_IB_TIMEOUT=50
export UCX_RC_TIMEOUT=4s
export NCCL_IB_RETRY_CNT=10
# setting IB for out of band communication
export NCCL_SOCKET_IFNAME=ib0
# NCCL and Torch debug
export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=ALL
# export TORCH_DISTRIBUTED_DEBUG=INFO
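# Optional sanity check (assumes the `ip` utility is available on the
# compute node): confirm the out-of-band interface named above exists.
#   ip -brief link show ib0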
# Change to code base directory
cd "$MEGATRON_DEEPSPEED_REPO"
# Log git status (oxw repo + obmd repo)
git -C ${BASE_DIR} branch -vv
git branch -vv
git remote -v
# Rebuild fused kernels
CLEAN_PREV_JIT_BUILD=0
rm -f megatron/fused_kernels/build/lock
((CLEAN_PREV_JIT_BUILD)) && rm -rf megatron/fused_kernels/{build,__pycache__}
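# `((CLEAN_PREV_JIT_BUILD))` is bash arithmetic evaluation: the rm -rf runs
# only when the flag is non-zero, so with 0 above only a stale build lock is
# removed and the previously JIT-compiled kernels are reused.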
OPTIMIZER_ARGS=" \ | |
--optimizer adam \ | |
--adam-beta1 0.9 \ | |
--adam-beta2 0.95 \ | |
--adam-eps 1e-8 \ | |
--lr 3.0e-4 \ | |
--min-lr 1e-5 \ | |
--lr-decay-style cosine \ | |
--lr-decay-samples $LR_DECAY_SAMPLES \ | |
--lr-warmup-samples $LR_WARMUP_SAMPLES \ | |
--clip-grad 1.0 \ | |
--weight-decay 1e-1 \ | |
" | |
# for 20h 1190, for 100h 5990 | |
# --exit-duration-in-mins 1190 \ | |
EXIT_OPTS=" \ | |
--exit-duration-in-mins 5990 \ | |
--kill-switch-path ${KILL_SWITCH_PATH} \ | |
" | |
# delete old kill switch | |
rm -f ${KILL_SWITCH_PATH} | |
# --pad-vocab-size-to 250880 \ | |
# --rampup-batch-size 192 32 9_765_625 \ | |
# --pp-partition-method 'type:transformer|embedding' \ | |
# | |
# | |
GPT_ARGS=" \ | |
--from-pretrained-hf $FROM_PRETRAINED \ | |
--pp-partition-method type:transformer|embedding \ | |
--num-layers $NLAYERS \ | |
--hidden-size $NHIDDEN \ | |
--num-attention-heads $NHEADS \ | |
--seq-length $SEQ_LEN \ | |
--max-position-embeddings $MAX_POS_EMBEDDING \ | |
--micro-batch-size $MICRO_BATCH_SIZE \ | |
--global-batch-size $GLOBAL_BATCH_SIZE \ | |
--train-samples $TRAIN_SAMPLES \ | |
--tokenizer-type PretrainedFromHF \ | |
--tokenizer-name-or-path $TOKENIZER_NAME_OR_PATH \ | |
--init-method-std 0.0048 \ | |
--embed-layernorm \ | |
--checkpoint-activations \ | |
--bf16 \ | |
--seed 42 \ | |
--position-embedding-type alibi \ | |
--abort-on-unmet-fused-kernel-constraints \ | |
$OPTIMIZER_ARGS \ | |
$EXIT_OPTS \ | |
" | |
# TODO: decide on efficient eval-interval + eval-iters
OUTPUT_ARGS=" \
    --log-interval $LOG_INTERVAL \
    --save-interval $SAVE_INTERVAL \
    --eval-interval $EVAL_INTERVAL \
    --eval-iters $EVAL_ITERS \
    --tensorboard-dir $TENSORBOARD_PATH \
    --tensorboard-queue-size 5 \
    --log-timers-to-tensorboard \
    --log-batch-size-to-tensorboard \
    --log-validation-ppl-to-tensorboard \
    "
ZERO_STAGE=0 # important: bf16 must use z0! it implements its own zero stage 1 equivalent
config_json="$BIGS_WORKING_DIR/ds_config.$SLURM_JOBID.json"
# DeepSpeed figures out GAS dynamically from the dynamic GBS via set_train_batch_size()
cat <<EOT > $config_json
{
  "train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
  "train_batch_size": $GLOBAL_BATCH_SIZE,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": $ZERO_STAGE
  },
  "bf16": {
    "enabled": true
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOT
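# Optional: fail fast if variable expansion produced invalid JSON
# (assumes python3 is on PATH at this point).
python3 -m json.tool "$config_json" > /dev/null || exit 1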
DEEPSPEED_ARGS=" \ | |
--deepspeed \ | |
--deepspeed_config ${config_json} \ | |
--zero-stage ${ZERO_STAGE} \ | |
--deepspeed-activation-checkpointing \ | |
" | |
export LAUNCHER="python -u -m torch.distributed.run \ | |
--nproc_per_node $GPUS_PER_NODE \ | |
--nnodes $NNODES \ | |
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \ | |
--rdzv_backend c10d \ | |
--max_restarts 0 \ | |
--tee 3 \ | |
" | |
export CMD=" \ | |
`pwd`/pretrain_gpt.py \ | |
--tensor-model-parallel-size $TP_SIZE \ | |
--pipeline-model-parallel-size $PP_SIZE \ | |
$GPT_ARGS \ | |
$OUTPUT_ARGS \ | |
--load $CHECKPOINT_PATH \ | |
--save $CHECKPOINT_PATH \ | |
--data-path $TRAIN_DATA_PATH \ | |
--split 998,1,1 \ | |
--data-impl mmap \ | |
--distributed-backend nccl \ | |
$DEEPSPEED_ARGS \ | |
" | |
echo $LAUNCHER
echo $CMD
# catch signals
SLEEP_BEFORE_KILL=180
trap 'echo "Signal received! Saving kill switch to $KILL_SWITCH_PATH and waiting for $SLEEP_BEFORE_KILL seconds"; touch $KILL_SWITCH_PATH; sleep $SLEEP_BEFORE_KILL; echo Done' USR1 SIGINT SIGTERM
(srun --jobid $SLURM_JOB_ID \
    bash -c '$LAUNCHER --node_rank $SLURM_PROCID $CMD' 2>&1 \
    | tee -a "$LOGS_PATH"/main_log.txt) & PID="$!"
wait "${PID}"
echo "END TIME: $(date)" |