The way I test things quickly with srun:
(1) on devfair:
srun --gres=gpu:8 --partition=devaccel --nodes=1 --cpus-per-task 64 \
--ntasks-per-node 1 --mem=400G --constraint volta32gb \
--time="2-00:00:00" --pty /bin/zsh -l
(2) on the resultant shell:
source deactivate
source /etc/profile
module load anaconda3/2020.11 cudnn/v8.0.3.33-cuda.11.0 cuda/11.0 fairusers_aws nvtop/1.0.0/gcc.7.3.0 openmpi/4.1.0/cuda.11.0-gcc.9.3.0 NCCL/2.8.3-1-cuda.11.0
source activate fairseq-20210318-v4
Then I run some command with --local, for example:
./fb_sweep/benchmark_lm.py -g 8 -t 1 -n 1 --dl 12 --embed-dim 768 \
--bs 8 --li 50 --epg 0 --mu 10000 --uf 2 \
--checkpoints-dir . -p tmp \
--ddp no_c10d --opt adam8bit --resume-failed --local
To launch a job in venv from devfair, I just do step 2 (well actually my .zshrc does it for me):
source deactivate
source /etc/profile
module load anaconda3/2020.11 cudnn/v8.0.3.33-cuda.11.0 cuda/11.0 fairusers_aws nvtop/1.0.0/gcc.7.3.0 openmpi/4.1.0/cuda.11.0-gcc.9.3.0 NCCL/2.8.3-1-cuda.11.0
source activate fairseq-20210318-v4
# sd: shorthand for leaving the currently active conda environment.
# Arguments: none (any that are passed are ignored).
# Returns:   the exit status of `conda deactivate`.
sd() { conda deactivate; }
# sa: shorthand for `conda activate`.
# Arguments: forwarded unchanged to `conda activate` (e.g. an environment
#            name or path).
# Returns:   the exit status of `conda activate`.
sa() {
  # "$@" must be quoted: unquoted, an argument containing whitespace or glob
  # characters is split/expanded before conda sees it, and empty arguments
  # are dropped.
  conda activate "$@"
}
# cc: reset the shell environment and reload the toolchain modules.
# Steps, in order:
#   1. re-read /etc/profile and ~/.zshrc;
#   2. purge all loaded modules and, only if the purge succeeded, load the
#      main toolchain set;
#   3. load ripgrep and NCCL (attempted regardless of the purge result).
# Arguments: none.
# Returns:   the exit status of the final `module load`.
# NOTE(review): a function named `cc` takes precedence over any external
# `cc` command in shells where this is defined - presumably intentional;
# confirm.
cc() {
  source /etc/profile
  source ~/.zshrc

  if module purge; then
    module load \
      anaconda3/2020.11 \
      cudnn/v8.0.3.33-cuda.11.0 \
      cuda/11.0 \
      fairusers_aws \
      nvtop/1.0.0/gcc.7.3.0 \
      openmpi/4.1.0/cuda.11.0-gcc.9.3.0
  fi

  module load ripgrep/11.0.2
  module load NCCL/2.8.3-1-cuda.11.0
}
# f: jump to the fairseq-py checkout under the home directory.
# Single quotes defer the $HOME expansion until the alias is used, and the
# inner double quotes keep the path a single word even when $HOME contains
# whitespace.
alias f='cd "$HOME/fairseq-py"'