Latest paths script
conda_setup.sh, the environment setup script sourced by the batch job below:
```bash
# Load MPI and CUDA 11.7, then put the conda env and cuDNN on the search paths.
module load openmpi cuda/11.7

#CONDA_HOME=/fsx/quentin/miniconda3
CONDA_HOME=/fsx/gpt-neox/conda/envs/neox
#CONDA_HOME=/fsx/gpt-neox/conda/envs/improved-t5
CUDNN_HOME=/fsx/quentin/cudnn-linux-x86_64-8.6.0.163_cuda11-archive

export LD_LIBRARY_PATH=$CUDNN_HOME/lib:$LD_LIBRARY_PATH
export CPATH=$CUDNN_HOME/include:$CPATH

export PATH=$CONDA_HOME/bin:$PATH
export LD_LIBRARY_PATH=$CONDA_HOME/lib:$LD_LIBRARY_PATH
export CPATH=$CONDA_HOME/include:$CPATH

# Put the EFA, NCCL, and CUDA 11.7 libraries ahead of everything else.
export LD_LIBRARY_PATH=/opt/aws-ofi-nccl/lib:/opt/amazon/efa/lib64:/usr/local/cuda-11.7/efa/lib:/usr/local/cuda-11.7/lib:/usr/local/cuda-11.7/lib64:/usr/local/cuda-11.7:/opt/nccl/build/lib:/opt/aws-ofi-nccl-install/lib:/opt/aws-ofi-nccl/lib:$LD_LIBRARY_PATH
export PATH=/opt/amazon/efa/bin:/opt/amazon/openmpi/bin:/usr/local/cuda-11.7/bin:$PATH
#export LD_PRELOAD="/opt/nccl/build/lib/libnccl.so"
```
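A quick way to confirm the script took effect is to check which binaries resolve first on the updated paths. This is a hedged sketch, not part of the original gist; it assumes the conda env provides `python`, that `nvcc` sits on the CUDA 11.7 path, and that torch is installed in the env.

```bash
# Hypothetical sanity check after sourcing the setup script (not in the gist).
source conda_setup.sh

which python   # expect $CONDA_HOME/bin/python
which nvcc     # expect /usr/local/cuda-11.7/bin/nvcc

# Show the search order the dynamic linker will use.
echo "$LD_LIBRARY_PATH" | tr ':' '\n'

# If torch is installed in the env, confirm it sees CUDA 11.7.
python -c 'import torch; print(torch.version.cuda, torch.cuda.is_available())'
```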
The SLURM batch script that sources it and launches gpt-neox training:
```bash
#!/bin/bash
#SBATCH --job-name="neox"
#SBATCH --partition=g40n404
#SBATCH --mem-per-cpu=16GB        # Amount of CPU memory
#SBATCH --nodes=8
#SBATCH --ntasks-per-node=8       # One task per GPU (8 GPUs per node)
#SBATCH --cpus-per-task=6         # Number of cores per task
#SBATCH --hint=nomultithread      # We get physical cores, not logical
#SBATCH --gres=gpu:8              # Number of GPUs per node
#SBATCH --output=%x_%j.out        # Set this to the dir where you want slurm outs to go
#SBATCH --error=%x_%j.out         # Set this to the dir where you want slurm outs to go
#SBATCH --exclusive               # Turn off node sharing
#SBATCH --comment=eleuther

# Set up the environment using the script we created before
source /fsx/hailey/conda_setup.sh

ds_report

export NCCL_DEBUG=WARN
export NCCL_TREE_THRESHOLD=0
export NCCL_PROTO=simple
# Network issues without the following two NCCL vars set; see https://github.com/NVIDIA/nccl/issues/676
export NCCL_IBEXT_DISABLE=1
export NCCL_SOCKET_IFNAME=^docker0,lo

export FI_EFA_FORK_SAFE=1
export FI_EFA_USE_DEVICE_RDMA=1   # use for p4dn
export FI_EFA_ENABLE_SHM_TRANSFER=0
export FI_PROVIDER=efa
export FI_EFA_TX_MIN_CREDITS=64

export PYTHONFAULTHANDLER=1

export OMPI_MCA_mtl_base_verbose=1
export OMPI_MCA_btl="^openib"

export HOSTNAMES=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_PORT=12802
export COUNT_NODE=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | wc -l)

# Move to the gpt-neox install (set TRAIN_PATH before it is used below)
TRAIN_PATH=/fsx/hailey/gpt-neox
cd $TRAIN_PATH

# Hide duplicated errors using this hack - will be properly fixed in pt-1.12
export TORCHELASTIC_ERROR_FILE=$TRAIN_PATH/tmp/torch-elastic-error.json

# Write the hostfile for this job
#/fsx/shiv/zphang/scripts/write_hostfile.sh
#export DLTS_HOSTFILE=/fsx/shiv/zphang/hostfiles/hosts_$SLURM_JOBID
bash /fsx/quentin/write_hostfile.sh
export DLTS_HOSTFILE=/fsx/hailey/hostfiles/hosts_$SLURM_JOBID

#sudo mkdir -p /home/quentin/.cache/torch_extensions
#sudo chmod -R 777 /home/quentin

python $TRAIN_PATH/deepy.py $TRAIN_PATH/train.py \
  --conf_dir configs pythia-13B.yml #1-3B_quentin.yml local_setup.yml
```
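The write_hostfile.sh helper is not included in the gist, but DeepSpeed's multi-node launcher (invoked here through deepy.py via DLTS_HOSTFILE) reads a plain-text hostfile with one `hostname slots=N` line per node. A hedged sketch of an equivalent script, assuming 8 GPUs per node as in the #SBATCH header:

```bash
#!/bin/bash
# Hypothetical stand-in for /fsx/quentin/write_hostfile.sh (not in the gist):
# emit one "hostname slots=8" line per allocated node, the hostfile
# format DeepSpeed's launcher expects.
mkdir -p /fsx/hailey/hostfiles
scontrol show hostnames "$SLURM_JOB_NODELIST" | \
  awk '{print $1 " slots=8"}' > /fsx/hailey/hostfiles/hosts_$SLURM_JOBID
```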
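To use the two files together, save the first script where the job can source it (the batch script expects /fsx/hailey/conda_setup.sh) and submit the second with sbatch. A minimal usage sketch; "train_neox.sbatch" is an assumed filename, not from the gist:

```bash
# Submit the job and follow its combined stdout/stderr log.
sbatch train_neox.sbatch

# --output/--error use %x_%j.out, i.e. <job-name>_<job-id>.out
squeue -u $USER              # find the job id
tail -f neox_<jobid>.out     # substitute the id; watch ds_report output and NCCL warnings
```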