working example for Noel
The Slurm batch script:
```bash
#!/bin/bash
#SBATCH -C gpu
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=4

export IMAGE=nvcr.io/nvidia/pytorch:24.06-py3

# Rendezvous endpoint for torchrun: the batch script runs on the first
# node of the allocation, so its hostname works as the master address.
export MASTER_ADDR=$(hostname)
export MASTER_PORT=29507
export OMP_NUM_THREADS=2

# One srun task per node; each task launches torchrun for its local GPUs.
srun -u ./requeueable-train.sh
```
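Note that `$(hostname)` works as the rendezvous address because the batch script itself executes on the first node of the allocation. An equivalent, more explicit way to pick that node is `scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n1`.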
The per-node launch script, requeueable-train.sh:
```bash
#!/bin/bash
# The -v mount here is just to get my train.py script into the container.
# An array keeps the arguments intact when expanded (in particular, it
# prevents the shell from glob-expanding OMP* against files in $PWD).
PODMAN_ARGS=(
    --rm --gpu --ipc=host --network=host
    --env 'OMP*'                      # forward OMP_* variables from the host
    -v "$PWD:/workspace" -w /workspace
)
set -x
podman-hpc run "${PODMAN_ARGS[@]}" "$IMAGE" \
    torchrun \
        --nnodes "$SLURM_JOB_NUM_NODES" \
        --nproc_per_node "$SLURM_GPUS_PER_NODE" \
        --rdzv-backend=c10d \
        --rdzv-endpoint="$MASTER_ADDR:$MASTER_PORT" \
        train.py
```
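The train.py script itself is not included in the gist, but its structure can be read off the log below: each rank reports its host, global rank, local rank, and world size, then picks its GPU, generates a batch of data, constructs a model, and performs a single training step. A minimal sketch that would produce that output is shown here; the NCCL backend matches the GPU setup, but the model, batch shapes, and optimizer are illustrative guesses, not the author's actual code:

```python
import os
import socket

import torch
import torch.distributed as dist
import torch.nn.functional as F


def main():
    # torchrun sets RANK, LOCAL_RANK, and WORLD_SIZE for every worker,
    # plus MASTER_ADDR/MASTER_PORT from the c10d rendezvous.
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    dist.init_process_group(backend="nccl")
    print(f"Initialized host {socket.gethostname()} rank {rank} "
          f"local-rank {local_rank} size {world_size}", flush=True)

    torch.cuda.set_device(local_rank)
    device = torch.device(f"cuda:{local_rank}")
    print(f"Using device: {device}", flush=True)

    print("Generating a batch of data", flush=True)
    x = torch.randn(32, 128, device=device)   # shapes are illustrative
    y = torch.randn(32, 1, device=device)

    print("Constructing model", flush=True)
    model = torch.nn.Linear(128, 1).to(device)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank])
    opt = torch.optim.SGD(model.parameters(), lr=1e-3)

    print("Performing one training step", flush=True)
    loss = F.mse_loss(model(x), y)
    opt.zero_grad()
    loss.backward()
    opt.step()

    print("Finished", flush=True)
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```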
Job output:

```
+ podman-hpc run --rm --gpu --ipc=host --network=host -v /global/homes/s/sfarrell/WorkAreas/helping-noel:/workspace -w /workspace nvcr.io/nvidia/pytorch:24.06-py3 torchrun --nnodes 2 --nproc_per_node 4 --rdzv-backend=c10d --rdzv-endpoint=nid001305:29507 train.py
+ podman-hpc run --rm --gpu --ipc=host --network=host -v /global/homes/s/sfarrell/WorkAreas/helping-noel:/workspace -w /workspace nvcr.io/nvidia/pytorch:24.06-py3 torchrun --nnodes 2 --nproc_per_node 4 --rdzv-backend=c10d --rdzv-endpoint=nid001305:29507 train.py

=============
== PyTorch ==
=============

NVIDIA Release 24.06 (build 96418707)
PyTorch Version 2.4.0a0+f70bd71

Container image Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
Copyright (c) 2014-2024 Facebook Inc.
Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
Copyright (c) 2011-2013 NYU (Clement Farabet)
Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
Copyright (c) 2015 Google Inc.
Copyright (c) 2015 Yangqing Jia
Copyright (c) 2013-2016 The Caffe contributors
All rights reserved.

Various files include modifications (c) NVIDIA CORPORATION & AFFILIATES. All rights reserved.

This container image and its contents are governed by the NVIDIA Deep Learning Container License.
By pulling and using the container, you accept the terms and conditions of this license:
https://developer.nvidia.com/ngc/nvidia-deep-learning-container-license
[identical PyTorch/NGC banner printed by the container on the second node]
NOTE: CUDA Forward Compatibility mode ENABLED.
  Using CUDA 12.5 driver version 555.42.02 with kernel driver version 550.127.08.
  See https://docs.nvidia.com/deploy/cuda-compatibility/ for details.
NOTE: CUDA Forward Compatibility mode ENABLED.
  Using CUDA 12.5 driver version 555.42.02 with kernel driver version 550.127.08.
  See https://docs.nvidia.com/deploy/cuda-compatibility/ for details.

W0515 03:15:07.651000 139989962904192 torch/distributed/run.py:778]
W0515 03:15:07.651000 139989962904192 torch/distributed/run.py:778] *****************************************
W0515 03:15:07.651000 139989962904192 torch/distributed/run.py:778] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0515 03:15:07.651000 139989962904192 torch/distributed/run.py:778] *****************************************
W0515 03:15:07.883000 139758139224704 torch/distributed/run.py:778]
W0515 03:15:07.883000 139758139224704 torch/distributed/run.py:778] *****************************************
W0515 03:15:07.883000 139758139224704 torch/distributed/run.py:778] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0515 03:15:07.883000 139758139224704 torch/distributed/run.py:778] *****************************************

Initialized host nid001305 rank 3 local-rank 3 size 8
Initialized host nid001305 rank 0 local-rank 0 size 8
Initialized host nid001305 rank 1 local-rank 1 size 8
Initialized host nid001305 rank 2 local-rank 2 size 8
Using device: cuda:0
Generating a batch of data
Initialized host nid001308 rank 6 local-rank 2 size 8
Initialized host nid001308 rank 5 local-rank 1 size 8
Initialized host nid001308 rank 7 local-rank 3 size 8
Initialized host nid001308 rank 4 local-rank 0 size 8
Using device: cuda:0
Generating a batch of data
Using device: cuda:1
Generating a batch of data
Constructing model
Using device: cuda:2
Generating a batch of data
Constructing model
Using device: cuda:3
Generating a batch of data
Constructing model
Constructing model
Using device: cuda:1
Generating a batch of data
Constructing model
Constructing model
Using device: cuda:2
Generating a batch of data
Using device: cuda:3
Generating a batch of data
Constructing model
Constructing model
Performing one training step
Performing one training step
Performing one training step
Performing one training step
Performing one training step
Performing one training step
Performing one training step
Performing one training step
Finished
Finished
Finished
Finished
Finished
Finished
Finished
Finished
```
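One thing worth noticing in the trace: the expanded podman-hpc command does not actually include the --env OMP* flag from the script above, and torchrun correspondingly warns that OMP_NUM_THREADS is unset and defaults it to 1 per process. If the host's OMP_NUM_THREADS=2 is meant to reach the workers, it is worth verifying that the --env forwarding really takes effect inside the container.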