Last active
November 25, 2024 09:30
-
-
Save fo40225/d534374aad3dfe8f2191d802a996235b to your computer and use it in GitHub Desktop.
Set up a Slurm cluster on Ubuntu Server
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# hostname  ip
# master    192.168.1.100
# slave-1   192.168.1.101
# slave-2   192.168.1.102
# Install the Slurm workload manager package.
# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed to be stable for scripted use.
sudo apt-get update
sudo apt-get install -y slurm-wlm
# Slurm configurator (slurm.conf generator) for each Ubuntu release:
# Ubuntu 16.04
# https://slurm.schedmd.com/archive/slurm-15.08.10/configurator.html
# Ubuntu 18.04
# https://slurm.schedmd.com/archive/slurm-17.11.2/configurator.html
# Ubuntu 20.04
# https://slurm.schedmd.com/archive/slurm-19.05.5/configurator.html
# Ubuntu 22.04
# https://slurm.schedmd.com/archive/slurm-21.08.5/configurator.html
# Ubuntu 24.04
# https://slurm.schedmd.com/archive/slurm-23.11.4/configurator.html
# Location of slurm.conf:
# Ubuntu 16.04 / 18.04 / 20.04
# /etc/slurm-llnl/slurm.conf
# Ubuntu 22.04 / 24.04
# /etc/slurm/slurm.conf
#====slurm.conf 2404 example====
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
# https://slurm.schedmd.com/archive/slurm-23.11.4/slurm.conf.html

# --- Cluster identity and controller: hostname(address) ---
ClusterName=cluster
SlurmctldHost=master(192.168.1.100)

# --- Process tracking, daemon ports, PID files and state directories ---
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
# This directory is created and chowned to slurm:slurm in the master node steps.
StateSaveLocation=/var/spool/slurmctld
TaskPlugin=task/affinity,task/cgroup

# --- Timers (seconds) ---
InactiveLimit=0
KillWait=30
MinJobAge=300
SlurmctldTimeout=120
SlurmdTimeout=300
Waittime=0

# --- Scheduling ---
SchedulerType=sched/backfill
SelectType=select/cons_tres

# --- Job completion logging and accounting ---
JobCompType=jobcomp/none
JobAcctGatherFrequency=30

# --- Daemon logging ---
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurm/slurmctld.log #### not /var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurm/slurmd.log #### not /var/log/slurmd.log

# --- Compute nodes and partitions ---
NodeName=slave-1 NodeAddr=192.168.1.101 CPUs=1 State=UNKNOWN
NodeName=slave-2 NodeAddr=192.168.1.102 CPUs=1 State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
#====slurm.conf 2404 example====
# master node
# Create the controller state directory (StateSaveLocation in slurm.conf) and
# hand it to the slurm user. -p keeps the step safe to re-run when the
# directory already exists.
sudo mkdir -p /var/spool/slurmctld
sudo chown slurm:slurm /var/spool/slurmctld
# The master runs only the controller daemon; the compute daemon is turned off.
sudo systemctl enable slurmctld
sudo systemctl restart slurmctld
sudo systemctl disable slurmd
sudo systemctl stop slurmd
# Copy the master's munge key to every slave node (run on the master).
# The targets are the slave addresses from the host table; the login name
# "user" is taken from the /home/user destination -- replace it with the real
# account on the slaves if it differs.
sudo scp /etc/munge/munge.key user@192.168.1.101:/home/user
sudo scp /etc/munge/munge.key user@192.168.1.102:/home/user
# slave node (run on every slave)
# First install the munge key that was copied from the master into /home/user,
# so this node shares the master's key before any Slurm daemon is (re)started.
sudo mv /home/user/munge.key /etc/munge/munge.key
sudo chown munge:munge /etc/munge/munge.key
sudo chmod 600 /etc/munge/munge.key
sudo systemctl restart munge
# A slave runs only the compute daemon; the controller daemon is turned off.
sudo systemctl disable slurmctld
sudo systemctl stop slurmctld
sudo systemctl enable slurmd
sudo systemctl restart slurmd
# Smoke test: run a trivial job through the scheduler.
srun hostname
# Keep one job running in the background, then submit a second job while the
# first one is still active.
srun sleep 60 &
srun hostname
# Other useful settings
# https://slurm.schedmd.com/archive/slurm-23.11.4/cgroup.conf.html
# https://slurm.schedmd.com/archive/slurm-23.11.4/job_container_tmpfs.html
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment