Slurm stuff

1. Quickrun

# Rapid test
srun --nodes 2 --ntasks-per-node 1 /usr/bin/hostname

# Move job to another partition
scontrol update job <jobid> Partition=<partition_name>

# Run remote bash
srun -N1 --job-name=<JOBNAME> --pty /bin/bash

# For enroot+pyxis. These are verbatim copies from
# https://github.com/aws-samples/aws-parallelcluster-post-install-scripts/blob/main/pyxis/test.sh
srun grep PRETTY /etc/os-release
srun --container-image=alpine grep PRETTY /etc/os-release
sbatch --container-image=alpine --wrap "grep PRETTY /etc/os-release"
sbatch --wrap "srun --container-image=alpine grep PRETTY /etc/os-release"
srun --container-image=nvidia/cuda:11.6.2-base-ubuntu20.04 nvidia-smi
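
Pyxis can also bind-mount host paths into the container; a minimal sketch, assuming a shared /fsx filesystem exists on the compute nodes (adjust the path to your cluster):

# Mount a host directory into the container and list it
srun --container-image=alpine --container-mounts=/fsx:/fsx ls /fsx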

# Enroot
enroot import -o /tmp/alpine.sqsh docker://alpine:latest
enroot create /tmp/alpine.sqsh
enroot list  # Show container name. By default, xxx.sqsh => xxx
enroot start --env NVIDIA_VISIBLE_DEVICES=void alpine grep PRETTY /etc/os-release
enroot remove alpine

######## [Deprecated] ########
# Lately, I've noticed `enroot start xxx.sqsh` hangs on many Ubuntu systems; I'm not sure why.
#
# The hang behavior may be the same as https://github.com/NVIDIA/enroot/issues/130 and the solution
# is to `enroot create xxx.sqsh ; enroot start xxx ; enroot remove xxx`.
apt-get -y -o DPkg::Lock::Timeout=120 install fuse-overlayfs squashfuse   # Required by `enroot start xxx.sqsh`
enroot start --env NVIDIA_VISIBLE_DEVICES=void /tmp/alpine.sqsh grep PRETTY /etc/os-release
##############################

# Send a job to a specific node, to get PCluster to start the instance backing that node.
sbatch -N1 --nodelist=p4d-24xlarge-dy-p4d24xlarge-1 --job-name=power-up-node-1 --wrap "srun --container-image=alpine grep PRETTY /etc/os-release"
sbatch -N1 --nodelist=p4d-24xlarge-dy-p4d24xlarge-1 --job-name=power-up-node-1 --wrap "srun /usr/bin/hostname"

Checking test results.

# When a job runs a for-loop for all_reduce
grep '^# Out of bounds values : ' job-xxxx-all-reduce-perf.txt | nl
watch -n30 "grep '^# Out of bounds values : ' job-xxxx-all-reduce-perf.txt | nl"

# When a job runs a for-loop for `run.py ddp`
grep 'rank: 0, step: 900$' slurm-xxxx.out | nl
watch -n30 "grep 'rank: 0, step: 900$' slurm-xxxx.out | nl"

# See what's in /dev/shm
srun -N4 --partition=marcverd-container-04 bash -c 'hostname && ls -al /dev/shm | head'

GPU env vars:

$ srun -l bash -c 'env | egrep "ORDINAL|GPU|CUDA_VISIBLE_DEVICES"'
slurm-60.out:CUDA_VISIBLE_DEVICES=4,5,6,7
slurm-60.out:GPU_DEVICE_ORDINAL=4,5,6,7
slurm-60.out:+ srun -l bash -c 'env | egrep "GPU|CUDA_VISIBLE_DEVICES"'
slurm-60.out:0: CUDA_VISIBLE_DEVICES=4,5,6,7
slurm-60.out:0: GPU_DEVICE_ORDINAL=4,5,6,7
slurm-60.out:1: CUDA_VISIBLE_DEVICES=4,5,6,7
slurm-60.out:1: GPU_DEVICE_ORDINAL=4,5,6,7

$ srun --gres=gpu:a100:4 bash -c 'env | grep GPU' ; echo
SLURM_STEP_GPUS=0,1,2,3
SLURM_GPUS_ON_NODE=4
GPU_DEVICE_ORDINAL=0,1,2,3

$ srun --gpus-per-node=a100:4 bash -c 'env | grep GPU' ; echo
SLURM_GPUS_PER_NODE=a100:4
SLURM_STEP_GPUS=0,1,2,3
SLURM_GPUS_ON_NODE=4
GPU_DEVICE_ORDINAL=0,1,2,3

$ srun --gpus-per-node=4 bash -c 'env | grep GPU' ; echo
SLURM_GPUS_PER_NODE=4
SLURM_STEP_GPUS=0,1,2,3
SLURM_GPUS_ON_NODE=4
GPU_DEVICE_ORDINAL=0,1,2,3
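
A sketch of how a job script might consume these variables to size a per-node launcher. Only SLURM_NNODES and SLURM_GPUS_ON_NODE come from Slurm itself; torchrun, train.py, and the resource sizes are assumptions (rendezvous flags omitted for brevity):

#!/bin/bash
#SBATCH --nodes=2
#SBATCH --gpus-per-node=8
# One launcher per node; single quotes so the variables expand on the compute node.
srun --ntasks-per-node=1 bash -c \
    'torchrun --nnodes=$SLURM_NNODES --nproc_per_node=$SLURM_GPUS_ON_NODE train.py'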

2. Management

# Restart slurmd on compute nodes
ssh ip-26-0-156-59.us-west-2.compute.internal 'sudo systemctl restart slurmd; sudo systemctl status slurmd'
ssh ip-26-0-148-28.us-west-2.compute.internal 'sudo systemctl restart slurmd; sudo systemctl status slurmd'
# Restart slurmctld on the head node
sudo systemctl restart slurmctld; sleep 1; sudo systemctl status slurmctld
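
To restart slurmd on every node of a partition in one go, a sketch reusing the sinfo-to-ssh pattern from the Query section (partition name is an example):

sinfo --partition marcverd-container-04 --Node -o '%n' --noheader \
    | xargs -n1 -I{} ssh {} 'sudo systemctl restart slurmd && systemctl is-active slurmd'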

# Change partition and node states (e.g., set a drained node back to idle).
sudo /opt/slurm/bin/scontrol update PartitionName=compute-gpu state=INACTIVE
sudo /opt/slurm/bin/scontrol update NodeName=compute-gpu-st-distributed-ml-[1-2] state=power_down reason=TERMINATE
sudo /opt/slurm/bin/scontrol update nodename=ip-26-0-148-28 state=idle
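
Before resetting a drained node, it's usually worth checking why Slurm drained it in the first place; both are standard Slurm queries (node name reuses the example above):

# List down/drained nodes together with the recorded reason
sinfo -R
scontrol show node ip-26-0-148-28 | grep -i reason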

# Reload slurm.conf
sudo /opt/slurm/bin/scontrol reconfigure

# Reboot compute nodes
sudo /opt/slurm/bin/scontrol reboot host-01,host-02

# Patch compute nodes with enroot hooks -- when the cluster was created without them configured...
# Assumes /fsx/50-slurm-pmi.sh exists with mode 755.
srun -N16 sudo cp /fsx/50-slurm-pmi.sh /etc/enroot/hooks.d/
srun -N16 ls -al /etc/enroot/hooks.d/50-slurm-pmi.sh
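
A quick sanity check that every node ended up with the same hook file (a sketch; assumes md5sum is available on the nodes):

# A single checksum line with count 16 means the copy succeeded everywhere
srun -N16 md5sum /etc/enroot/hooks.d/50-slurm-pmi.sh | sort | uniq -c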

# Clear orphaned /dev/shm/* files -> usually needed when RemoveIPC=no and a job crashes...
srun -l -N16 --partition=marcverd-container-04 bash -c 'hostname && ls -al /dev/shm | head' | sort
srun -l -N16 bash -c 'hostname && rm /dev/shm/fe80*'
srun -l -N16 bash -c 'hostname && rm /dev/shm/nccl*'

# PCluster: keep down/drain nodes around. @lukeseawalker
#
# Edit the config file and set (or add) the two lines below
# https://github.com/aws/aws-parallelcluster-node/blob/4f5fdc6cecbfe3d19c37012d4b1900564d1e7589/src/slurm_plugin/clustermgtd.py#L148-L149
#
# No need to restart any daemon: clustermgtd picks up the change on its next loop.
# You can confirm this in the clustermgtd log (see the tail command below).
sudo vi /etc/parallelcluster/slurm_plugin/parallelcluster_clustermgtd.conf
terminate_drain_nodes = False
terminate_down_nodes = False
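
To watch clustermgtd pick the change up, a sketch assuming the default ParallelCluster log location:

# Log path is the ParallelCluster default; adjust if your cluster logs elsewhere
sudo tail -f /var/log/parallelcluster/clustermgtd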

3. Query

How a job can probe its EC2 topology:

$ cat job.sbatch
#!/bin/bash

set -exuo pipefail

lstopo_ec2() {
    local INSTANCE_IDS=( $(srun cat /sys/devices/virtual/dmi/id/board_asset_tag) )
    aws ec2 describe-instance-topology --instance-ids "${INSTANCE_IDS[@]}"
}

validate_ec2_same_spine() {
    local TOPO_JSON="$(lstopo_ec2)"
    echo "${TOPO_JSON}"

    local UNIQ_NN=$(echo "$TOPO_JSON" | grep '^  *"nn\-.................\"' | sort -n | uniq -c | wc -l)
    echo Expected 3 nn ids, got $UNIQ_NN nn ids
    [[ $UNIQ_NN -eq 3 ]] || echo WARNING: ec2 instances on different network spine...
}
validate_ec2_same_spine


$ srun -N2 job.sbatch
{
    "Instances": [
        {
            "InstanceId": "i-01e2c3b3fc9d809a1",
            "InstanceType": "p4de.24xlarge",
            "NetworkNodes": [
                "nn-b59b170155f6801f8",
                "nn-a42b0750f49b48c69",
                "nn-38464347ecf47e324"
            ],
            "AvailabilityZone": "us-west-2b",
            "ZoneId": "usw2-az2"
        },
        {
            "InstanceId": "i-0bd4381e8c71587ae",
            "InstanceType": "p4de.24xlarge",
            "NetworkNodes": [
                "nn-b59b170155f6801f8",
                "nn-a42b0750f49b48c69",
                "nn-38464347ecf47e324"
            ],
            "AvailabilityZone": "us-west-2b",
            "ZoneId": "usw2-az2"
        }
    ]
}
Expected 3 nn ids, got 3 nn ids
# If expected != actual, show:
WARNING: ec2 instances on different network spine...
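
The same check as a throwaway one-liner, without the script (same board_asset_tag trick; assumes the AWS CLI and ec2:DescribeInstanceTopology permissions wherever you run it):

srun -N2 cat /sys/devices/virtual/dmi/id/board_asset_tag \
    | xargs aws ec2 describe-instance-topology --instance-ids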

Slurm queries:

# Show gres
$ sinfo -o "%20N  %10c  %10m  %25f  %10G"
NODELIST              CPUS        MEMORY      AVAIL_FEATURES             GRES      
compute-gpu-st-gpu-[  96          1120665     static,p4de.24xlarge,gpu,  gpu:a100:8

# Show cpu configurations
$ sinfo -o '%9P %4c %8z %8X %8Y %8Z'
PARTITION CPUS S:C:T    SOCKETS  CORES    THREADS 
p5*       192  192:1:1  192      1        1

# Tabular view of nodes
$ sinfo --Node --long
...

# Tabular view of nodes, with custom columns
$ sinfo --Format nodehost,available,cpu,cpusload,cpusstate,features_act,freemem,gres,gresused,user -N -n <nodelist>
...

# Tabular view of jobs, when sacct is enabled.
$ sacct -o jobid,start,end,elapsed -j 6162
JobID Start End Elapsed
----- ----- --- -------
...   ...   ... ...
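
sacct accepts many more output fields; a sketch with ones I find handy (all standard sacct field names, the %N suffix sets the column width, job id reuses the example above):

$ sacct -j 6162 -o jobid,jobname%30,state,exitcode,elapsed,nnodes,nodelist%40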

# Expand jobname
$ squeue --format="%.18i %.9P %.30j %.8u %.8t %.10M %.9l %.6D %R" --me
             JOBID PARTITION                           NAME     USER       ST       TIME TIME_LIMI  NODES NODELIST(REASON)
                31 compute-g           gpt3-40b2-nccl_stock ec2-user        R       7:58 UNLIMITED      2 compute-gpu-st-gpu-[3-4]
                32 compute-g           gpt3-40b8-nccl_stock ec2-user        R       5:56 UNLIMITED      8 compute-gpu-st-gpu-[5-12]

# Expand state
$ squeue --format="%.18i %.9P %.30j %.8u %.8T %.10M %.9l %.6D %R" --me
             JOBID PARTITION                           NAME     USER    STATE       TIME TIME_LIMI  NODES NODELIST(REASON)
                31 compute-g           gpt3-40b2-nccl_stock ec2-user  RUNNING       6:33 UNLIMITED      2 compute-gpu-st-gpu-[3-4]
                32 compute-g           gpt3-40b8-nccl_stock ec2-user  RUNNING       4:31 UNLIMITED      8 compute-gpu-st-gpu-[5-12]

# Show HOSTNAMES column, but don't print the header
$ sinfo --partition marcverd-container-04 --Node -o '%n' --noheader
ip-26-0-161-41
ip-26-0-169-33
ip-26-0-169-123
ip-26-0-171-195

# Check logind's RemoveIPC setting across a partition. This is the fancy way, mainly
# to have an example of piping sinfo into ssh; you may prefer to just srun or sbatch
# the check command.
$ sinfo --partition marcverd-container-04 --Node -o '%n' --noheader \
    | xargs -n1 -I{} ssh {} "
echo '########'
hostname
echo -n 'Loaded config is '
loginctl show-session | grep RemoveIPC
echo
echo Config file:
systemd-analyze cat-config systemd/logind.conf | grep 'RemoveIPC'
" | egrep 'RemoveIPC=.*|$'
# NOTE: the last egrep is to have syntax coloring in the output.
# Below is the sample output:
Warning: Permanently added 'ip-26-0-161-41,26.0.161.41' (ECDSA) to the list of known hosts.
########
ip-26-0-161-41
Loaded config is RemoveIPC=no

Config file:
#RemoveIPC=yes
RemoveIPC=no
Warning: Permanently added 'ip-26-0-169-33,26.0.169.33' (ECDSA) to the list of known hosts.
########
ip-26-0-169-33
Loaded config is RemoveIPC=no

Config file:
#RemoveIPC=yes
RemoveIPC=no
Warning: Permanently added 'ip-26-0-169-123,26.0.169.123' (ECDSA) to the list of known hosts.
########
...