export REPO=/p/project/laionize/cherti1/Megatron-LM # MODIFY!
git clone https://github.com/bigcode-project/Megatron-LM.git $REPO
cd $REPO
ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 PyTorch/2.1.0-CUDA-12-ALPHA
python -m venv $REPO/env
source $REPO/env/bin/activate
pip install -U pip
pip install wheel
pip install torchvision==0.16.0
pip install transformers
pip install wandb
pip install datasets
pip install nltk
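# Optional: quick check that the environment is consistent before moving on. torch comes from
# the PyTorch module, torchvision/transformers from pip; torch.cuda.is_available() may report
# False on a GPU-less login node, which is expected.
python -c "import torch, torchvision, transformers; print(torch.__version__, torchvision.__version__, torch.cuda.is_available())"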
cd ..
git clone https://github.com/NVIDIA/apex
cd apex
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ # takes some time
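# Optional sanity check that the apex C++/CUDA extensions were built (amp_C is one of the
# compiled modules produced by --cuda_ext; adjust the import if your apex version differs):
python -c "import amp_C; print('apex extensions OK')"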
cd $REPO
git clone https://huggingface.co/datasets/bigcode/starcoderdata
wget https://huggingface.co/bigcode/starcoder/resolve/main/tokenizer.json
mkdir tokenized_data
python tools/preprocess_data.py \
--input starcoderdata/c \
--output-prefix tokenized_data/preprocessed \
--tokenizer-type TokenizerFromFile \
--tokenizer-file tokenizer.json \
--dataset-impl mmap \
--append-eod \
--json-keys content \
--workers 64 \
--chunk-size 100 \
--log-interval 1000
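# Preprocessing writes an .idx/.bin pair named after the output prefix and the JSON key, i.e.
# tokenized_data/preprocessed_content_document.{bin,idx}; the training script below points
# --data-path at exactly this prefix.
ls tokenized_data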
Example SLURM batch script for the 34B model:
#!/bin/bash
#SBATCH --nodes=256
#SBATCH --time=00:20:00
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=48
#SBATCH --gres=gpu:4
#SBATCH --partition=booster
#SBATCH --account=transfernetx
#SBATCH --exclude=jwb[0059,0067,0069,0193,0284,0287,0294,0359,0418,0637,0829,0832,0838,0898,0907,0921,0971,1004,1023,1029,1213,1126,0392]
#SBATCH --threads-per-core=1
#SBATCH --mem=0
set -x -e
ml Stages/2024 GCC/12.3.0 OpenMPI CUDA/12 PyTorch
export REPO=/p/project/laionize/cherti1/Megatron-LM # MODIFY!!
source $REPO/env/bin/activate
export LIBRARY_PATH=$CUDA_HOME/lib:$LIBRARY_PATH
export LD_LIBRARY_PATH=$CUDA_HOME/lib:$LD_LIBRARY_PATH
export TRITON_CACHE_DIR=cache
export CUDA_LAUNCH_BLOCKING=1
export NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_IB_TIMEOUT=20
export NCCL_SOCKET_IFNAME=ib0
export NCCL_DEBUG=INFO
export GLOO_SOCKET_IFNAME=ib0 # necessary to avoid gloo issues
echo "START TIME: $(date)"
# File Path setup
pushd $REPO
LOG_PATH=$REPO/main_log.txt
# Training setup
GPUS_PER_NODE=4
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
#export MASTER_ADDR="${MASTER_ADDR}.juwels"
export MASTER_ADDR="${MASTER_ADDR}i"
MASTER_PORT=12345
NNODES=$SLURM_NNODES
NODE_RANK=$SLURM_PROCID
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
# File path setup
CHECKPOINT_PATH=$REPO/ckpts/34b
TOKENIZER_FILE=$REPO/tokenizer.json
#WEIGHTS_TRAIN=data/train_data_paths.txt.tmp
#WEIGHTS_VALID=data/valid_data_paths.txt.tmp
mkdir -p $CHECKPOINT_PATH/tensorboard
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPT_ARGS="\
--micro-batch-size 1 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 8 \
--sequence-parallel \
--num-layers 56 \
--hidden-size 7168 \
--num-attention-heads 56 \
--ffn-hidden-size 28672 \
--attention-head-type multihead \
--init-method-std 0.01275 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--attention-dropout 0.1 \
--hidden-dropout 0.1 \
--global-batch-size 32 \
--lr 0.0001 \
--min-lr 0.00001 \
--train-iters 250000 \
--lr-decay-iters 250000 \
--lr-decay-style cosine \
--lr-warmup-iters 2000 \
--weight-decay .1 \
--adam-beta2 .95 \
--clip-grad 1.0 \
--bf16 \
--fim-rate 0.5 \
--log-interval 10 \
--save-interval 2500 \
--eval-interval 2500 \
--eval-iters 2 \
--use-distributed-optimizer \
--valid-num-workers 0 \
"
TENSORBOARD_ARGS="--tensorboard-dir ${CHECKPOINT_PATH}/tensorboard"
CMD=" \
$REPO/pretrain_gpt.py \
$GPT_ARGS \
--tokenizer-type TokenizerFromFile \
--tokenizer-file $TOKENIZER_FILE \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path tokenized_data/preprocessed_content_document \
--structured-logs \
--structured-logs-dir $CHECKPOINT_PATH/logs \
$TENSORBOARD_ARGS \
"
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend static \
--max_restarts 0 \
--tee 3 \
"
echo $CMD
# hide duplicated errors using this hack - will be properly fixed in pt-1.12
# export TORCHELASTIC_ERROR_FILE=/tmp/torch-elastic-error.json
# force crashing on nccl issues like hanging broadcast
# export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO
# export NCCL_DEBUG_SUBSYS=COLL
# export NCCL_SOCKET_NTHREADS=1
# export NCCL_NSOCKS_PERTHREAD=1
# export CUDA_LAUNCH_BLOCKING=1
# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 --cpu_bind=v --cpus-per-task=48 --threads-per-core=1 \
--kill-on-bad-exit=1 \
"
clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER --node_rank \$SLURM_PROCID --role \$SLURMD_NODENAME: $CMD" 2>&1 | tee $LOG_PATH
echo "END TIME: $(date)"
Save the script as run.sbatch, then submit it with sbatch run.sbatch.
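Once submitted, the job can be monitored as follows (assuming LOG_PATH from the script above, i.e. $REPO/main_log.txt):
squeue -u $USER               # check that the job is queued or running
tail -f $REPO/main_log.txt    # follow the training output once the job starts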