# Rapid test
srun --nodes 2 --ntasks-per-node 1 /usr/bin/hostname
# Move job to another partition
scontrol update job <jobid> Partition=<partition_name>
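# Verify the move took effect (plain scontrol output; reuses the <jobid> placeholder above):
scontrol show job <jobid> | grep -i partition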
# Run remote bash
srun -N1 --job-name=<JOBNAME> --pty /bin/bash
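# Variant (not in the original notes): grab a GPU for the interactive shell; adjust the
# GRES count/type to your cluster.
srun -N1 --gres=gpu:1 --job-name=<JOBNAME> --pty /bin/bash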
# For enroot+pyxis. These are verbatim copies from
# https://github.com/aws-samples/aws-parallelcluster-post-install-scripts/blob/main/pyxis/test.sh
srun grep PRETTY /etc/os-release
srun --container-image=alpine grep PRETTY /etc/os-release
sbatch --container-image=alpine --wrap "grep PRETTY /etc/os-release"
sbatch --wrap "srun --container-image=alpine grep PRETTY /etc/os-release"
srun --container-image=nvidia/cuda:11.6.2-base-ubuntu20.04 nvidia-smi
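# Extra pyxis example (not from test.sh): bind-mount a host path into the container via
# --container-mounts; /fsx here is just an example path.
srun --container-image=alpine --container-mounts=/fsx:/fsx ls /fsx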
# Enroot
enroot import -o /tmp/alpine.sqsh docker://alpine:latest
enroot create /tmp/alpine.sqsh
enroot list # Show container name. By default, xxx.sqsh => xxx
enroot start --env NVIDIA_VISIBLE_DEVICES=void alpine grep PRETTY /etc/os-release
enroot remove alpine
######## [Deprecated] ########
# Lately I've noticed that `enroot start xxx.sqsh` hangs on many Ubuntu systems; I'm not sure why.
#
# The hang looks like https://github.com/NVIDIA/enroot/issues/130, and the workaround is the
# create/start/remove sequence shown above: `enroot create xxx.sqsh ; enroot start xxx ; enroot remove xxx`.
apt-get -y -o DPkg::Lock::Timeout=120 install fuse-overlayfs squashfuse # Required by `enroot start xxx.sqsh`
enroot start --env NVIDIA_VISIBLE_DEVICES=void /tmp/alpine.sqsh grep PRETTY /etc/os-release
##############################
# Send a job to a specific node to make PCluster start the instance backing that node.
sbatch -N1 --nodelist=p4d-24xlarge-dy-p4d24xlarge-1 --job-name=power-up-node-1 --wrap "srun --container-image=alpine grep PRETTY /etc/os-release"
sbatch -N1 --nodelist=p4d-24xlarge-dy-p4d24xlarge-1 --job-name=power-up-node-1 --wrap "srun /usr/bin/hostname"
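# Optionally watch the target node leave its power-saved state (e.g., idle~ -> mixed/alloc)
# while the job above pends; reuses the node name from the sbatch commands above.
watch -n30 "sinfo -n p4d-24xlarge-dy-p4d24xlarge-1 -o '%n %t'"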
Checking test results:
# When a job runs a for-loop for all_reduce
grep '^# Out of bounds values : ' job-xxxx-all-reduce-perf.txt | nl
watch -n30 "grep '^# Out of bounds values : ' job-xxxx-all-reduce-perf.txt | nl"
# When the job runs a for-loop for `run.py ddp`
grep 'rank: 0, step: 900$' slurm-xxxx.out | nl
watch -n30 "grep 'rank: 0, step: 900$' slurm-xxxx.out | nl"
# See what's in /dev/shm
srun -N4 --partition=marcverd-container-04 bash -c 'hostname && ls -al /dev/shm | head'
GPU env vars:
# The sample below comes from grepping a batch job's output file (slurm-60.out), hence the
# `slurm-60.out:` prefixes.
$ srun -l bash -c 'env | egrep "ORDINAL|GPU|CUDA_VISIBLE_DEVICES"'
slurm-60.out:CUDA_VISIBLE_DEVICES=4,5,6,7
slurm-60.out:GPU_DEVICE_ORDINAL=4,5,6,7
slurm-60.out:+ srun -l bash -c 'env | egrep "GPU|CUDA_VISIBLE_DEVICES"'
slurm-60.out:0: CUDA_VISIBLE_DEVICES=4,5,6,7
slurm-60.out:0: GPU_DEVICE_ORDINAL=4,5,6,7
slurm-60.out:1: CUDA_VISIBLE_DEVICES=4,5,6,7
slurm-60.out:1: GPU_DEVICE_ORDINAL=4,5,6,7
$ srun --gres=gpu:a100:4 bash -c 'env | grep GPU' ; echo
SLURM_STEP_GPUS=0,1,2,3
SLURM_GPUS_ON_NODE=4
GPU_DEVICE_ORDINAL=0,1,2,3
$ srun --gpus-per-node=a100:4 bash -c 'env | grep GPU' ; echo
SLURM_GPUS_PER_NODE=a100:4
SLURM_STEP_GPUS=0,1,2,3
SLURM_GPUS_ON_NODE=4
GPU_DEVICE_ORDINAL=0,1,2,3
$ srun --gpus-per-node=4 bash -c 'env | grep GPU' ; echo
SLURM_GPUS_PER_NODE=4
SLURM_STEP_GPUS=0,1,2,3
SLURM_GPUS_ON_NODE=4
GPU_DEVICE_ORDINAL=0,1,2,3
# Restart slurmd on compute nodes
ssh ip-26-0-156-59.us-west-2.compute.internal 'sudo systemctl restart slurmd; sudo systemctl status slurmd'
ssh ip-26-0-148-28.us-west-2.compute.internal 'sudo systemctl restart slurmd; sudo systemctl status slurmd'
# On the head node, restart slurmctld as well.
sudo systemctl restart slurmctld; sleep 1; sudo systemctl status slurmctld
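# Sketch: restart slurmd on every node of a partition in one go, reusing the sinfo + ssh
# pattern from the RemoveIPC check further below (partition name is just an example).
sinfo --partition marcverd-container-04 --Node -o '%n' --noheader \
    | xargs -n1 -I{} ssh {} 'sudo systemctl restart slurmd; sudo systemctl is-active slurmd'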
# Partition/node state management: deactivate a partition, power nodes down, or set a
# node state (e.g., from drain) back to idle.
sudo /opt/slurm/bin/scontrol update PartitionName=compute-gpu State=INACTIVE
sudo /opt/slurm/bin/scontrol update NodeName=compute-gpu-st-distributed-ml-[1-2] State=POWER_DOWN Reason=TERMINATE
sudo /opt/slurm/bin/scontrol update NodeName=ip-26-0-148-28 State=IDLE
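# Related: list which nodes are down/drained and why:
/opt/slurm/bin/sinfo -R
/opt/slurm/bin/scontrol show node ip-26-0-148-28 | grep -E 'State|Reason'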
# Reload slurm.conf
sudo /opt/slurm/bin/scontrol reconfigure
# Reboot compute nodes
sudo /opt/slurm/bin/scontrol reboot host-01,host-02
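# Hedged variant: reboot each node as soon as it drains of running jobs, then return it to
# service afterwards (see `man scontrol` for the reboot options).
sudo /opt/slurm/bin/scontrol reboot ASAP nextstate=RESUME reason="reboot for maintenance" host-01,host-02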
# Patch compute nodes with the enroot hook -- e.g., when the cluster was created without
# the hook configured.
# Assumes /fsx/50-slurm-pmi.sh exists with mode 755.
srun -N16 sudo cp /fsx/50-slurm-pmi.sh /etc/enroot/hooks.d/
srun -N16 ls -al /etc/enroot/hooks.d/50-slurm-pmi.sh
# Clear orphaned /dev/shm/* files -- typically needed when RemoveIPC=no and a job has crashed.
srun -l -N16 --partition=marcverd-container-04 bash -c 'hostname && ls -al /dev/shm | head' | sort
srun -l -N16 bash -c 'hostname && rm /dev/shm/fe80*'
srun -l -N16 bash -c 'hostname && rm /dev/shm/nccl*'
# PCluster: keep down/drain nodes around. @lukeseawalker
#
# Edit the config file and set (or add) the two lines below; see
# https://github.com/aws/aws-parallelcluster-node/blob/4f5fdc6cecbfe3d19c37012d4b1900564d1e7589/src/slurm_plugin/clustermgtd.py#L148-L149
#
# No need to restart any daemon: clustermgtd picks up the change on its next loop, which
# you can confirm in the clustermgtd log (command below).
sudo vi /etc/parallelcluster/slurm_plugin/parallelcluster_clustermgtd.conf
terminate_drain_nodes = False
terminate_down_nodes = False
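# Watch the change being picked up; on a typical PCluster head node the log lives at:
sudo tail -f /var/log/parallelcluster/clustermgtd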
How a job can probe its ec2 topology:
$ cat job.sbatch
#!/bin/bash
set -exuo pipefail
lstopo_ec2() {
    local INSTANCE_IDS=( $(srun cat /sys/devices/virtual/dmi/id/board_asset_tag) )
    aws ec2 describe-instance-topology --instance-ids "${INSTANCE_IDS[@]}"
}
validate_ec2_same_spine() {
    local TOPO_JSON="$(lstopo_ec2)"
    echo "${TOPO_JSON}"
    local UNIQ_NN=$(echo "$TOPO_JSON" | grep '^ *"nn\-.................\"' | sort -n | uniq -c | wc -l)
    echo "Expected 3 nn ids, got $UNIQ_NN nn ids"
    [[ $UNIQ_NN -eq 3 ]] || echo "WARNING: ec2 instances on different network spine..."
}
validate_ec2_same_spine
$ sbatch -N2 job.sbatch
{
"Instances": [
{
"InstanceId": "i-01e2c3b3fc9d809a1",
"InstanceType": "p4de.24xlarge",
"NetworkNodes": [
"nn-b59b170155f6801f8",
"nn-a42b0750f49b48c69",
"nn-38464347ecf47e324"
],
"AvailabilityZone": "us-west-2b",
"ZoneId": "usw2-az2"
},
{
"InstanceId": "i-0bd4381e8c71587ae",
"InstanceType": "p4de.24xlarge",
"NetworkNodes": [
"nn-b59b170155f6801f8",
"nn-a42b0750f49b48c69",
"nn-38464347ecf47e324"
],
"AvailabilityZone": "us-west-2b",
"ZoneId": "usw2-az2"
}
]
}
Expected 3 nn ids, got 3 nn ids
# If expected != actual, the script additionally prints:
WARNING: ec2 instances on different network spine...
Slurm queries:
# Show gres
$ sinfo -o "%20N %10c %10m %25f %10G"
NODELIST CPUS MEMORY AVAIL_FEATURES GRES
compute-gpu-st-gpu-[ 96 1120665 static,p4de.24xlarge,gpu, gpu:a100:8
# Show cpu configurations
$ sinfo -o '%9P %4c %8z %8X %8Y %8Z'
PARTITION CPUS S:C:T SOCKETS CORES THREADS
p5* 192 192:1:1 192 1 1
# Tabular view of nodes
$ sinfo --Node --long
...
# Tabular view of nodes, with custom columns
$ sinfo --Format nodehost,available,cpu,cpusload,cpusstate,features_act,freemem,gres,gresused,user -N -n <nodelist>
...
# Tabular view of jobs, when Slurm accounting (hence sacct) is enabled.
$ sacct -o jobid,start,end,elapsed -j 6162
JobID Start End Elapsed
----- ----- --- -------
... ... ... ...
# Expand jobname
$ squeue --format="%.18i %.9P %.30j %.8u %.8t %.10M %.9l %.6D %R" --me
JOBID PARTITION NAME USER ST TIME TIME_LIMI NODES NODELIST(REASON)
31 compute-g gpt3-40b2-nccl_stock ec2-user R 7:58 UNLIMITED 2 compute-gpu-st-gpu-[3-4]
32 compute-g gpt3-40b8-nccl_stock ec2-user R 5:56 UNLIMITED 8 compute-gpu-st-gpu-[5-12]
# Expand state (use %T instead of %t to get the full state name)
$ squeue --format="%.18i %.9P %.30j %.8u %.8T %.10M %.9l %.6D %R" --me
JOBID PARTITION NAME USER STATE TIME TIME_LIMI NODES NODELIST(REASON)
31 compute-g gpt3-40b2-nccl_stock ec2-user RUNNING 6:33 UNLIMITED 2 compute-gpu-st-gpu-[3-4]
32 compute-g gpt3-40b8-nccl_stock ec2-user RUNNING 4:31 UNLIMITED 8 compute-gpu-st-gpu-[5-12]
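# Same query with long-form --Format field names instead of % codes (field list is illustrative):
$ squeue --Format=jobid,partition,name:30,username,state,timeused,timelimit,numnodes,reasonlist --me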
# Show the HOSTNAMES column without printing the header
$ sinfo --partition marcverd-container-04 --Node -o '%n' --noheader
ip-26-0-161-41
ip-26-0-169-33
ip-26-0-169-123
ip-26-0-171-195
# Check logind's RemoveIPC setting across a partition. This is the fancy way, mainly to
# have an example of driving ssh from sinfo output; you may prefer to just srun or sbatch
# the check command.
$ sinfo --partition marcverd-container-04 --Node -o '%n' --noheader \
| xargs -n1 -I{} ssh {} "
echo '########'
hostname
echo -n 'Loaded config is '
loginctl show-session | grep RemoveIPC
echo
echo Config file:
systemd-analyze cat-config systemd/logind.conf | grep 'RemoveIPC'
" | egrep 'RemoveIPC=.*|$'
# NOTE: the trailing egrep just colorizes the RemoveIPC=... matches in the output.
# Sample output:
Warning: Permanently added 'ip-26-0-161-41,26.0.161.41' (ECDSA) to the list of known hosts.
########
ip-26-0-161-41
Loaded config is RemoveIPC=no
Config file:
#RemoveIPC=yes
RemoveIPC=no
Warning: Permanently added 'ip-26-0-169-33,26.0.169.33' (ECDSA) to the list of known hosts.
########
ip-26-0-169-33
Loaded config is RemoveIPC=no
Config file:
#RemoveIPC=yes
RemoveIPC=no
Warning: Permanently added 'ip-26-0-169-123,26.0.169.123' (ECDSA) to the list of known hosts.
########
...