Last active
January 16, 2023 19:20
-
-
Save cat-state/6c3c682ad4de3ee2c9f172024486df76 to your computer and use it in GitHub Desktop.
`sbatch trlx-nemo.sh`. Note that you have to edit `num_nodes` in the model yml to match the number of nodes set in this file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Slurm batch script: launches the trlx ILQL sentiments example with the
# NeMo Python environment. Submit with `sbatch trlx-nemo.sh`.
#
# Requirements:
#   - WANDB_API_KEY must be available, either exported in the submitting
#     shell or defined in nemo.env (do not hard-code it in this file).
#   - `num_nodes` in the model yml must match --nodes below.
#
#SBATCH --job-name=nemo-trlx
#SBATCH --partition=a100-cu117
#SBATCH --nodes=4
#SBATCH --gres=gpu:8
#SBATCH --ntasks-per-node=8
#SBATCH --output="%x.out"
#SBATCH --signal=B:INT@60

# HPC-X is optional: only initialise it when the cluster provides it, so
# clusters without it do not emit "No such file" / "command not found".
if [[ -f /opt/hpcx/hpcx-init.sh ]]; then
  # shellcheck disable=SC1091 -- cluster-provided file, not in this repo
  source /opt/hpcx/hpcx-init.sh
  hpcx_load
fi

# Load env vars from nemo.env (KEY=VALUE lines; lines starting with '#'
# are skipped). The assignments are collected first so that an empty or
# missing file never results in a bare `export`, which would print the
# whole environment (including secrets) into the job log.
if [[ -r nemo.env ]]; then
  # shellcheck disable=SC2207 -- word splitting of xargs output is intended
  env_assignments=($(grep -v '^#' nemo.env | xargs))
  if ((${#env_assignments[@]} > 0)); then
    export "${env_assignments[@]}"
  fi
else
  printf 'warning: nemo.env not found or unreadable; skipping\n' >&2
fi

# The W&B key is a secret: take it from the environment (or nemo.env)
# and abort early with a clear message when it is missing.
: "${WANDB_API_KEY:?set WANDB_API_KEY in the submitting shell or in nemo.env}"
export WANDB_API_KEY

# Python where you installed NeMo
PY=/mnt/nvme/home/uwu/conda/nemo-113/bin/python

# for debugging
# export HYDRA_FULL_ERROR=1
# export CUDA_LAUNCH_BLOCKING=1

# Print the interpreter path from every task (debugging aid).
srun echo "$PY"

# if using stability wandb
export WANDB_BASE_URL=https://stability.wandb.io/

# helps not OOM when close
export PYTORCH_CUDA_ALLOC_CONF='max_split_size_mb:512'

# -u: unbuffered Python stdout/stderr.
srun "$PY" -u /mnt/nvme/home/uwu/trlx/examples/ilql_sentiments.py
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment