Skip to content

Instantly share code, notes, and snippets.

@fo40225
Last active November 25, 2024 09:30
Show Gist options
  • Save fo40225/d534374aad3dfe8f2191d802a996235b to your computer and use it in GitHub Desktop.
Save fo40225/d534374aad3dfe8f2191d802a996235b to your computer and use it in GitHub Desktop.
setup slurm cluster on ubuntu server
# hostname ip
# master 192.168.1.100
# slave-1 192.168.1.101
# slave-2 192.168.1.102
# Refresh the package index, then install Ubuntu's Slurm package (slurm-wlm).
# NOTE(review): the later steps manage both slurmctld and slurmd on the master
# and on the slaves, so this presumably has to be run on every node — confirm.
sudo apt update
sudo apt install -y slurm-wlm
# Ubuntu 16.04 (Slurm 15.08.10)
# https://slurm.schedmd.com/archive/slurm-15.08.10/configurator.html
# Ubuntu 18.04 (Slurm 17.11.2)
# https://slurm.schedmd.com/archive/slurm-17.11.2/configurator.html
# Ubuntu 20.04 (Slurm 19.05.5)
# https://slurm.schedmd.com/archive/slurm-19.05.5/configurator.html
# Ubuntu 22.04 (Slurm 21.08.5)
# https://slurm.schedmd.com/archive/slurm-21.08.5/configurator.html
# Ubuntu 24.04 (Slurm 23.11.4)
# https://slurm.schedmd.com/archive/slurm-23.11.4/configurator.html
# slurm.conf location on Ubuntu 16.04 / 18.04 / 20.04:
# /etc/slurm-llnl/slurm.conf
# slurm.conf location on Ubuntu 22.04 / 24.04:
# /etc/slurm/slurm.conf
#====slurm.conf 2404 example====
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
# https://slurm.schedmd.com/archive/slurm-23.11.4/slurm.conf.html
# Cluster identity and controller: SlurmctldHost is the master host and its
# address from the host table at the top of these notes.
ClusterName=cluster
SlurmctldHost=master(192.168.1.100)
# Process tracking and automatic return of nodes to service
ProctrackType=proctrack/cgroup
ReturnToService=1
# Daemon pid files, ports, spool/state directories and the Slurm user.
# StateSaveLocation is the directory created and chowned to slurm in the
# master-node steps.
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
StateSaveLocation=/var/spool/slurmctld
# Task launch plugins
TaskPlugin=task/affinity,task/cgroup
# Timers
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0
# Scheduling
SchedulerType=sched/backfill
SelectType=select/cons_tres
# Job completion logging and accounting-gather interval
JobCompType=jobcomp/none
JobAcctGatherFrequency=30
# Daemon log levels and log files (kept under /var/log/slurm/, see the inline notes)
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log #### not /var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log #### not /var/log/slurmd.log
# Compute nodes: one entry per slave, each declared with a single CPU
NodeName=slave-1 NodeAddr=192.168.1.101 CPUs=1 State=UNKNOWN
NodeName=slave-2 NodeAddr=192.168.1.102 CPUs=1 State=UNKNOWN
# One default partition containing every node, with no time limit
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
#====slurm.conf 2404 example====
# master node
# Create the controller state directory named by StateSaveLocation in
# slurm.conf and hand it to SlurmUser (slurm). -p keeps the step re-runnable:
# plain mkdir fails when the directory already exists.
sudo mkdir -p /var/spool/slurmctld
sudo chown slurm:slurm /var/spool/slurmctld
# The master runs the controller daemon; restart it so a changed slurm.conf
# is picked up.
sudo systemctl enable slurmctld
sudo systemctl restart slurmctld
# The master is not listed as a NodeName in slurm.conf, so keep slurmd off here.
sudo systemctl disable slurmd
sudo systemctl stop slurmd
# copy the master's munge key to every slave node
# (slave-1 = 192.168.1.101, slave-2 = 192.168.1.102; "user" is the login
# account on the slaves — replace it with your own).
# The key is dropped in the remote home directory and moved into /etc/munge
# on the slave afterwards. scp runs under sudo presumably because the key is
# only readable by munge/root on the master.
# Reminder: slurm.conf itself must also be present on all nodes.
sudo scp /etc/munge/munge.key user@192.168.1.101:/home/user
sudo scp /etc/munge/munge.key user@192.168.1.102:/home/user
# slave node
# First install the munge key that was copied over from the master: it must
# be owned by munge and unreadable to everyone else, and munge has to be
# running with the shared key before the Slurm daemons are started.
sudo mv /home/user/munge.key /etc/munge/munge.key
sudo chown munge:munge /etc/munge/munge.key
sudo chmod 600 /etc/munge/munge.key
sudo systemctl restart munge
# Slaves run only the compute daemon: turn the controller off and bring
# slurmd up now that the shared key is in place.
sudo systemctl disable slurmctld
sudo systemctl stop slurmctld
sudo systemctl enable slurmd
sudo systemctl restart slurmd
# Smoke test — NOTE(review): presumably run from the master once all daemons are up.
# Run a trivial job; it prints the hostname of the node it executed on.
srun hostname
# Start a 60-second job in the background to keep one allocation busy ...
srun sleep 60 &
# ... so that this job presumably lands on the other node (each node is
# declared with CPUs=1 in slurm.conf) — confirm by comparing the hostnames.
srun hostname
# other useful settings
# https://slurm.schedmd.com/archive/slurm-23.11.4/cgroup.conf.html
# https://slurm.schedmd.com/archive/slurm-23.11.4/job_container_tmpfs.html
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment