Skip to content

Instantly share code, notes, and snippets.

@kanaka
Created May 21, 2012 18:35
Show Gist options
  • Select an option

  • Save kanaka/2763820 to your computer and use it in GitHub Desktop.

Select an option

Save kanaka/2763820 to your computer and use it in GitHub Desktop.
scboot
#!/bin/bash
# $Id: scboot 64763 2008-11-15 14:29:58Z rwoodscorwin $
# Print the command-line help text on stdout and exit with status 2.
# Takes no arguments and never returns to the caller.
usage() {
  # ${0##*/} yields the script's base name without spawning basename and
  # without breaking when the invocation path contains whitespace.
  echo "
Usage: ${0##*/} [options]
Boot the nodes of a SiCortex system.
ARG ENV VARIABLE DESCRIPTION
-a xxx append xxx to kernel command line
-f xxx SCBOOT_FABRICD override fabricd
-h show this message
-i xxx tgz, cpio, or directory of initramfs files
can be specified multiple times
-k xxx SCBOOT_KERNEL_FILES override kernel files directory
(containing vmlinux and System.map)
-K xxx SCBOOT_KERNEL_MODULES override kernel module directory
(default: <kernel_files>/modules)
-l xxx SCBOOT_KERNEL override vmlinux
(default: <kernel_files>/vmlinux)
-L xxx SCBOOT_LUSTRE_MODULES override lustre modules tree
SCBOOT_MYRINET_MODULES override myrinet modules tree
SCBOOT_MYRIMX_MODULES override myrinet-mx modules tree
SCBOOT_OPENIB_MODULES override openib modules tree
SCBOOT_QLOGIC_MODULES override qlogic modules tree
SCBOOT_SK98LIN_MODULES override sk98lin modules tree
-M xxx SCBOOT_OTHER_MODULES additional kernel modules trees
can be specified multiple times
-p xxx SLURM_PARTITION override partition to boot
-r xxx SCBOOT_ROOT override path to node rootfs directory
-R xxx SCBOOT_ROOT_MODE node rootfs mode (nfs, nbd)
(default: nfs, nbd for 200+ node systems)
-s xxx SCBOOT_SCAND_DIR override directory with attnd and mspscand
-u xxx SCBOOT_UCLINUX override uclinux binary image
SCBOOT_BOOTK_GEN override bootk generator directory
SCBOOT_INITRAMFS_GEN override initramfs generator directory
SCBOOT_BAMF override bamf executable
SCBOOT_BOOT1_DIR override the boot program directory
SCBOOT_BOOT_SCRIPTS override boot scripts directory
SCBOOT_CHECK_ROUTE_INFO override check_route_info
SCBOOT_INITRAMFS override the base initramfs directory
SCBOOT_MFD override mfd (default: <mfd_dir>/mfd.py)
SCBOOT_MFD_DIR override mfd directory
SCBOOT_MFD_WATCHER override mfd_watcher
SCBOOT_GLOBAL_CLOCK override global clock master agent
SCBOOT_MKEXT2IMG override mkext2img executable
SCBOOT_MONITOR override scbootmon executable
SCBOOT_ROMFSTOOL override romfstool executable
-v,--verbose be verbose
--show_settings show internal settings based on defaults,
environment, and cmd line and then exit
--loglevel='xxx' override the default kernel loglevel
SCBOOT_LOGLEVEL '' '' '' '' ''
--noclean do not clean up tftp working directory
--scand_options='xxx' command line options for starting scand
--nodes=mXnY reboot a single node (warmboot)
--msp_setup_only exit after doing MSP setup
The following options take a value of 'force', 'skip' or 'auto'.
'auto' means that scboot will try to determine the most logical
action. 'force' means the action should always be taken and 'skip'
means the check should be skipped entirely.
--start_mfd=[MODE] restart MFD (default: force)
--start_msp=[MODE] restart MSP (default: auto)
--create_node_img=[MODE] update node rootfs image (default: auto)
"
  exit 2
}
# Prevent pollution from non SCBOOT_ prefix environment variables.
# Every unprefixed name that is read before the script assigns it must be
# cleared here, otherwise a value inherited from the caller's environment
# is silently used.
verbose= pretend= noclean= cluster= track_boot=
msp_setup_only= show_settings= start_mfd= start_msp= create_node_img=
partition= root= root_n32= root_mode= loglevel= fabricd= scand_dir= uclinux=
initramfs_base= boot1_dir=
kernel_files= kernel_modules= vmlinux= lustre_modules=
sk98lin_modules= qlogic_modules= myrinet_modules= myrimx_modules= openib_modules= mfd= other_modules=
# The argument parser appends to / tests these, and socat_pids is handed
# to kill during unlock, so they must not be inherited either.
append_args= initramfs_sources= scand_options= warmboot= socat_pids=
# Defaults for command line arguments
start_mfd=force # mfd mode
start_msp=auto # MSP mode
create_node_img=auto # node rootfs image mode
# Defaults with only environment override
scdir="${SCBOOT_PREFIX}/opt/sicortex"
bootk_generator="${SCBOOT_BOOTK_GEN:-${scdir}/boot/bootk/default}"
initramfs_generator="${SCBOOT_INITRAMFS_GEN:-${scdir}/boot/initramfs/default}"
cluster_conf="${SCBOOT_CLUSTER_CONF:-/etc/sicortex.conf}"
mfd_dir="${SCBOOT_MFD_DIR:-${scdir}/mfd/default}"
scbootdir_base="${SCBOOT_DIR_BASE:-/tftproot/scboot_tmp}"
bamf="${SCBOOT_BAMF:-${SCBOOT_PREFIX}/usr/bin/bamf}"
master_clock_agent="${SCBOOT_GLOBAL_CLOCK:-${SCBOOT_PREFIX}/usr/sbin/master_clock_agent}"
mfd_watcher="${SCBOOT_MFD_WATCHER:-${SCBOOT_PREFIX}/usr/bin/mfd_watcher}"
check_route_info="${SCBOOT_CHECK_ROUTE_INFO:-${SCBOOT_PREFIX}/usr/bin/check_route_info}"
romfstool="${SCBOOT_ROMFSTOOL:-${SCBOOT_PREFIX}/sbin/romfstool}"
mkext2img="${SCBOOT_MKEXT2IMG:-${SCBOOT_PREFIX}/sbin/mkext2img}"
scbootmon="${SCBOOT_MONITOR:-${SCBOOT_PREFIX}/sbin/scbootmon_dstat}"
# The default value is the literal option string "--oneline".
scbootmon_options="${SCBOOT_MONITOR_OPTIONS:---oneline}"
dbg_points="${SCBOOT_DBG_POINTS}"
# Bootscripts coupled to other SSP components so default to SSP path
boot_scripts="${SCBOOT_BOOT_SCRIPTS:-/opt/sicortex/bootscripts/default}"
# Misc settings
CTHLIB=/opt/sicortex/config/cthlib
msp_state_dir=/var/log/mspstate
scboot_log=/var/log/scboot.log
msp_timeout=240
mypid=$$
# Shell-quoted copy of the invocation, shown by show_settings.
scboot_cmdline="${0}$(printf " %q" "$@")"
scboot_inst= # set after argument processing
pidlist= # for tracking backgrounded subprocesses
scboot_traps="TERM QUIT INT EXIT"
dirty_flags= # for tracking flag usage inconsistent with warmboot
#
# Utility functions
#
# die <status> [message...]
# Emit the message on stderr (backslash escapes are interpreted, as with
# echo -e) and terminate the script with the given exit status.
die() {
  local -r status="$1"
  shift
  >&2 echo -e "$@"
  exit "${status}"
}
# vecho [args...]
# Echo the arguments only when the global ${verbose} is non-empty.
# Returns 1 (and prints nothing) when verbose mode is off.
vecho() {
  if [[ -z "${verbose}" ]]; then
    return 1
  fi
  echo "$@"
}
# DF
# Record the option currently being parsed (global ${param}) in
# ${dirty_flags}, the list of flags inconsistent with a warmboot.
DF() {
  dirty_flags+=" ${param}"
}
#
# Process arguments
#
# Keep the raw invocation: script name followed by the arguments joined
# with single spaces.
orig_cmdline="$0 $*"
# Hand-rolled option parser. Each pass takes one option word into ${param}
# and works out its value (${OPTARG}) before dispatching on the option.
# NOTE(review): the loop tests the joined argument string, so it ends as
# soon as every remaining argument is empty.
while [ "$*" ]; do
param=$1; shift
# Options with no leading dash must be an argument to
# a preceding option
# NOTE(review): this also means a value can never begin with "-", and a
# bare word after a value-less flag (e.g. -v) is consumed silently.
if [ "${1}" ] && [ "${1:0:1}" != "-" ]; then
OPTARG="$1"; shift
elif [ "${param/=/}" != "${param}" ]; then
# --name=value form: the value is everything after the first "=".
OPTARG="${param#*=}"
else
OPTARG=
fi
# Arms that end in DF record the flag in dirty_flags (flag usage
# inconsistent with warmboot).
case $param in
-a) append_args="${append_args} $OPTARG"; DF ;;
-f) fabricd=$OPTARG; DF ;;
-h) usage ;;
-i) initramfs_sources="${initramfs_sources} $OPTARG"; DF ;;
-k) kernel_files=$OPTARG; DF ;;
-K) kernel_modules=$OPTARG; DF ;;
-l) vmlinux=$OPTARG; DF ;;
-L) lustre_modules=$OPTARG; DF ;;
-M) other_modules="${other_modules} $OPTARG"; DF ;;
-p) partition=$OPTARG ;;
-r) root=$OPTARG; DF ;;
-R) root_mode=$OPTARG; DF ;;
-u) uclinux=$OPTARG; DF ;;
-s) scand_dir=$OPTARG; DF ;;
-v|--verbose) verbose=1 ;;
--show_settings) show_settings=1 ;;
--loglevel*) loglevel="$OPTARG"; DF ;;
--noclean) noclean=1 ;;
--pretend) pretend=1 ;;
--scand_options*) scand_options="$OPTARG"; DF ;;
--start_mfd*) start_mfd="$OPTARG"; DF ;;
--start_msp*) start_msp="$OPTARG"; DF ;;
--create_node_img*) create_node_img="$OPTARG"; DF ;;
--msp_setup_only*) msp_setup_only=1; DF ;;
--nodes*) warmboot="$OPTARG" ;;
--trackboot) track_boot=1;; # ignored now - monitoring always on
# Anything unrecognised prints the help text and exits.
*) usage ;;
esac
done
# Defaults defined in cthlib
# The names listed after the file are passed to cthlib as arguments; the
# script aborts if sourcing it returns non-zero.
source $CTHLIB \
SCv_system_profile \
SCv_pclk_mhz \
SCv_kernel_bigphysarea \
|| die 1 "cthlib call failed"
# Find user specified partition
# Precedence: -p option, then SCBOOT_PARTITION, then SLURM_PARTITION.
partition="${partition:-${SCBOOT_PARTITION:-${SLURM_PARTITION}}}"
# If partition is still not specified, identify a default boot partition
if [ -z "${partition}" ]; then
case ${SCv_system_profile} in
sfx)
# Use the sx/sf MSP entry in /etc/ethers only when exactly one line
# matches; the partition is the host name up to the first "-".
sfx_ethers=$(egrep -o '^[^#]+s[xf][0-9]-msp[0-9]+' /etc/ethers)
sfx_count=$(wc -l <<<"${sfx_ethers}")
if [ "${sfx_count}" = "1" ]; then
sfx_host="${sfx_ethers##* }"
partition="${sfx_host%%-*}"
fi
;;
sc) partition="sc1" ;;
*) partition="${SCv_system_profile}" ;;
esac
fi
if [ -z "${partition}" ]; then
echo >&2 "Partition not set. Not sure which system to boot."
echo >&2 "The partition is set with the -p argument"
echo >&2 "or with the SLURM_PARTITION environment variable."
die 2
fi
# Defaults that come from global config via cthlib
# These are partition specific so partition must be set first
source $CTHLIB \
SCv_partition=${partition} \
SCv_rootfs \
SCv_rootfs_n32 \
SCv_rootfs_mode \
SCv_kernel_files \
SCv_kernel_boot_loglevel \
SCv_append_kargs \
|| die 1 "cthlib call failed"
# Each setting below resolves in the order: command line value,
# SCBOOT_* environment variable, cthlib value (where one exists),
# built-in default.
rootfs_base="${scdir}/rootfs"
root="${root:-${SCBOOT_ROOT:-${SCv_rootfs:-${rootfs_base}/default}}}"
root_n32="${root_n32:-${SCBOOT_ROOT_N32:-${SCv_rootfs_n32:-${rootfs_base}/build.n32}}}"
root_mode="${root_mode:-${SCBOOT_ROOT_MODE:-${SCv_rootfs_mode}}}"
kernel_files="${kernel_files:-${SCBOOT_KERNEL_FILES:-${SCv_kernel_files:-${scdir}/kernel/linux/default}}}"
loglevel="${loglevel:-${SCBOOT_LOGLEVEL:-${SCv_kernel_boot_loglevel}}}"
# -a values replace (rather than extend) the cthlib kernel arguments; the
# second assignment drops the single leading space left by the parser.
append_args="${append_args:-${SCv_append_kargs}}"
append_args="${append_args# }"
# Defaults with environment and command line overrides
fabricd="${fabricd:-${SCBOOT_FABRICD:-${scdir}/fabricd/default}}"
scand_dir="${scand_dir:-${SCBOOT_SCAND_DIR:-${scdir}/msp/default/usr/bin/}}"
uclinux="${uclinux:-${SCBOOT_UCLINUX:-${scdir}/msp/uclinux/default/msp_linux.bin}}"
# Defaults with environment and command line overrides that are based
# on other arguments
# Image files live next to the resolved (symlink-free) rootfs directories.
root_img="$(readlink -f ${root}).img"
root_n32_img="$(readlink -f ${root_n32}).img"
# ${root##*:} strips everything up to the last ":" in the root value.
initramfs_base="${initramfs_base:-${SCBOOT_INITRAMFS:-${root##*:}/boot/initramfs}}"
boot1_dir="${boot1_dir:-${SCBOOT_BOOT1_DIR:-${root##*:}/boot/bin}}"
kernel_modules="${kernel_modules:-${SCBOOT_KERNEL_MODULES:-${kernel_files}/modules}}"
vmlinux="${vmlinux:-${SCBOOT_KERNEL:-${kernel_files}/vmlinux}}"
lustre_modules="${lustre_modules:-${SCBOOT_LUSTRE_MODULES:-${scdir}/kernel/lustre/default/modules}}"
myrinet_modules="${myrinet_modules:-${SCBOOT_MYRINET_MODULES:-${scdir}/kernel/myrinet/default/modules}}"
myrimx_modules="${myrimx_modules:-${SCBOOT_MYRIMX_MODULES:-${scdir}/kernel/myrinet-mx/default/modules}}"
openib_modules="${openib_modules:-${SCBOOT_OPENIB_MODULES:-${scdir}/kernel/openib/default/modules}}"
qlogic_modules="${qlogic_modules:-${SCBOOT_QLOGIC_MODULES:-${scdir}/kernel/qlogic/default/modules}}"
sk98lin_modules="${sk98lin_modules:-${SCBOOT_SK98LIN_MODULES:-${scdir}/kernel/marvell/default/modules}}"
mfd="${mfd:-${SCBOOT_MFD:-${mfd_dir}/mfd.py}}"
# Accumulate both environment and command line
other_modules="${SCBOOT_OTHER_MODULES} ${other_modules}"
#
# Operational functions
#
# show_settings
# Dump every resolved setting on stdout, one per line. Path-like settings
# go through secho; everything else is printed verbatim. The final
# "msp hash" line is computed by get_msp_hash.
show_settings() {
  local _setting

  printf '%s\n' \
    "" \
    "Arguments / environment settings:" \
    " full cmdline: ${scboot_cmdline}" \
    " boot instance: ${scboot_inst} (${scboot_time})" \
    " append_args: ${append_args}" \
    " cluster_conf: $cluster_conf"
  for _setting in boot1_dir bootk_generator boot_scripts; do
    secho "${_setting}"
  done
  printf '%s\n' " cluster: ${cluster}"
  secho initramfs_base
  printf '%s\n' " initramfs_sources: ${initramfs_sources}"
  for _setting in initramfs_generator kernel_files kernel_modules vmlinux \
    lustre_modules myrinet_modules myrimx_modules openib_modules \
    qlogic_modules sk98lin_modules; do
    secho "${_setting}"
  done
  printf '%s\n' " other_modules: ${other_modules}"
  for _setting in mfd_dir mfd mfd_watcher fabricd; do
    secho "${_setting}"
  done
  printf '%s\n' \
    " bamf: ${bamf}" \
    " master_clock_agent: ${master_clock_agent}" \
    " romfstool: ${romfstool}" \
    " mkext2img: ${mkext2img}" \
    " update_modules: ${update_modules}" \
    " scbootmon: ${scbootmon}" \
    " bad_mod_list: ${bad_mod_list}" \
    " bad_node_list: ${bad_node_list}" \
    " partition: ${partition}"
  secho root
  secho root_n32
  printf '%s\n' " root_mode: ${root_mode}"
  for _setting in scand_dir uclinux check_route_info; do
    secho "${_setting}"
  done
  printf '%s\n' \
    " verbose: ${verbose}" \
    " show_settings: ${show_settings}" \
    " noclean: ${noclean}" \
    " pretend: ${pretend}" \
    " scand_options: ${scand_options}" \
    " msp_setup_only: ${msp_setup_only}" \
    "" \
    " start_mfd: ${start_mfd}" \
    " start_msp: ${start_msp}" \
    " create_node_img: ${create_node_img}" \
    "" \
    " scbootdir: ${scbootdir}" \
    " modules_dir: ${modules_dir}" \
    " mfd_port: ${mfd_port}" \
    " root_port: ${root_port}" \
    " kmod_port: ${kmod_port}" \
    " nodes_per_module: ${nodes_per_module}" \
    " module_count: ${module_count}" \
    " node population: $population"
  # Kept as its own statement so the hash is computed after the lines
  # above have been written.
  printf '%s\n' " msp hash: $(get_msp_hash)"
  echo
}
# lock_tree <path>
# Take the lock at <path>, implemented as a symlink whose target is the
# pid of the owning scboot (${mypid}). A lock whose owner is no longer a
# running scboot process is treated as stale and replaced. On success the
# cleanup trap is installed for ${scboot_traps}; on failure the script
# dies with status 1.
lock_tree() {
  local path=$1 owner=
  vecho >&2 "Locking ${path}"
  if [ -L "${path}" ]; then
    # Lock exists, if locker is gone, remove the lock
    owner=$(readlink "${path}")
    # The pid must be followed by whitespace: without that anchor owner
    # 123 would also match a live scboot with pid 1234 and a stale lock
    # would never be cleared.
    if ps ax | grep -qs "^ *${owner:-NONE}[[:space:]].*scboot"; then
      die 1 "Lock exists on ${path} (pid: ${owner})"
    else
      echo "Removing stale lock on ${path} (pid: ${owner})"
      rm "${path}"
    fi
  fi
  if ln -s "${mypid}" "${path}" 2>/dev/null; then
    # Don't quote scboot_traps, they're multiple args
    trap "unlock_tree_trap ${path}" ${scboot_traps}
  else
    die 1 "Failed to obtain lock on ${path}"
  fi
}
# unlock_tree <path>
# Release the lock taken by lock_tree: clear the cleanup traps, stop any
# socat image servers listed in ${socat_pids}, and remove the symlink at
# <path> if it still points at our pid. Dies with status 1 when the lock
# is owned by someone else.
unlock_tree() {
  local path=$1 owner=
  # Don't quote scboot_traps, they're multiple args
  trap - ${scboot_traps}
  # Terminate any running socat image servers
  [ "$socat_pids" ] && kill ${socat_pids}
  owner=$(readlink "${path}")
  vecho >&2 "Unlocking ${path}"
  if [ "${owner}" = "${mypid}" ]; then
    rm -f "${path}"
  else
    die 1 "Lock corruption on ${path}!! (us: ${mypid}, owner ${owner})"
  fi
}
# unlock_tree_trap <path>
# Trap handler installed by lock_tree: kills the background processes
# recorded in ${pidlist}, releases the lock at <path> and exits with
# status 1.
unlock_tree_trap() {
  local lock_path=$1
  echo "Caught signal, cleaning up." >&2
  if [[ -n "${pidlist}" ]]; then
    vecho "Cleaning up background processes: ${pidlist}" >&2
    # pidlist is a space separated list, so it is expanded unquoted.
    kill ${pidlist} 2>/dev/null
    pidlist=
  fi
  unlock_tree ${lock_path}
  die 1
}
# get_hostip <host>
# Print the address of <host> as reported by `getent hosts`. The lookup is
# attempted up to six times, one second apart, before dying with status 1.
# Only the first address is printed when the lookup returns several lines.
# Note: when called inside $(...), die only ends that subshell, so callers
# see an empty result rather than a script exit.
get_hostip() {
  local host="$1"
  local ip=""
  # Local so a caller's own "count" variable is not clobbered.
  local count=0
  while [ -z "$ip" ]; do
    ip="$(getent hosts "$host" | awk 'NR == 1 {print $1}')"
    [ -z "$ip" ] && sleep 1
    count=$(( count + 1 ))
    if [ $count -gt 5 ]; then
      die 1 "Could not resolve $host"
    fi
  done
  echo "$ip"
}
# get_netblock
# Print the value of the INTERNAL_NETBLOCK= parameter (digits and dots)
# found on the kernel command line in /proc/cmdline; prints an empty line
# when the parameter is absent.
get_netblock() {
  local block=
  # A single sed both selects the matching line and extracts the value.
  block=$(cat /proc/cmdline \
    | sed -n '/INTERNAL_NETBLOCK/{s:^.*INTERNAL_NETBLOCK=\([0-9\.]*\).*$:\1:;p;}')
  echo ${block}
}
# get_msp_hash
# Print one md5 digest covering the MSP image (${uclinux}), the attnd,
# mspscand and mspledd binaries in ${scand_dir}, the scand options and the
# internal netblock.
get_msp_hash() {
  local file_sums= digest= _dash=
  file_sums="$(md5sum ${uclinux} \
    ${scand_dir}/attnd ${scand_dir}/mspscand ${scand_dir}/mspledd)"
  # md5sum prints "<digest>  -" for stdin; keep only the digest field.
  read -r digest _dash \
    < <(echo ${file_sums} ${scand_options} $(get_netblock) | md5sum)
  echo ${digest}
}
# secho <variable-name>
# Utility function for show_settings: prints " <name>: <value>". When the
# value contains "default", the path up to and including the first
# "default" is resolved with readlink -f and the final component of the
# result is appended in parentheses (the version the default points at).
secho() {
  # ${!1} reads the named variable without eval.
  local var=$1 val=${!1} real=
  # Resolve only when there is something to resolve; an empty value would
  # otherwise make readlink complain about a missing operand.
  if [[ "${val}" == *default* ]]; then
    real=$(readlink -f "${val/default*/default}")
    real=" (${real##*/})"
  fi
  echo " $var: $val$real"
}
# start_ev1d
# Restart ev1d through its init script to work around a socket descriptor
# leak. Nothing is done for the "sfx" system profile, where several
# partitions share one ev1d that is not prone to the leak.
start_ev1d() {
  [ "${SCv_system_profile}" = "sfx" ] && return 0
  echo "Restarting ev1d"
  /etc/init.d/ev1d restart
}
# start_master_clock_agent
# Start master_clock_agent for this Cortex (${partition}). A previously
# started agent recorded in /var/run/master_clock_agent/<partition>/pid is
# killed first if it is still running. After a successful launch the md5sum
# of the agent binary is stored next to the pid file. Dies with status 1
# when the launch fails.
start_master_clock_agent() {
  # curmd5/curpath are read only to consume the md5sum file; keeping them
  # local stops them leaking into the global namespace.
  local curpid= curmd5= curpath= sys=${partition}
  local rundir=/var/run/master_clock_agent/${sys}
  # Find already running master_clock_agent
  if [ -e ${rundir}/pid ]; then
    read -u 0 curpid < ${rundir}/pid
    read -u 0 curmd5 curpath < ${rundir}/md5sum
    # Require whitespace after the pid so that e.g. 12 does not match a
    # running process 123.
    if ! ps ax | grep -qs "^ *${curpid:-NONE}[[:space:]]"; then
      vecho " master_clock_agent ($curpid) went away."
      curpid=
    fi
  fi
  if [ "${curpid}" ]; then
    vecho " Killing master_clock_agent pid ${curpid}"
    kill ${curpid}
    sleep 2
  fi
  echo -e "\nLaunching Master Clock Agent"
  mkdir -p ${rundir} /var/log/${sys}/
  # ${verbose:+...} / ${warmboot:+...} add their options only when the
  # corresponding setting is non-empty.
  ${master_clock_agent} \
    --port ${clock_port} \
    -o ${full_population} \
    --partition ${partition} \
    ${verbose:+--loglevel DEBUG} \
    --route_info_file "/var/state/route_info.${partition}" \
    ${warmboot:+--warmboot ${warmboot}} \
    ${warmboot:+--loglevel DEBUG} \
    --pid_file ${rundir}/pid \
    --log_file /var/log/${sys}/master_clock_agent.log \
    || die 1 "master_clock_agent launch failed"
  md5sum ${master_clock_agent} > ${rundir}/md5sum
}
# start_mfd
# Make sure the right Master Fabric Daemon is running for ${partition}.
# The running daemon is identified by /var/run/mfd/<partition>/pid and the
# md5sum file written at its launch. Behaviour follows ${start_mfd}:
#   force - always (re)start
#   auto  - restart when no daemon is running or its md5sum/path differs
#   skip  - keep a running daemon untouched (start one only if none runs)
# Dies with status 1 when the launch fails.
start_mfd() {
  local start= sys=${partition}
  local rundir=/var/run/mfd/${sys}
  local curpid= curmd5= curpath= md5= path=
  echo -e "\nChecking Master Fabric Daemon"
  # Find already running mfd
  if [ -e ${rundir}/pid ]; then
    read -u 0 curpid < ${rundir}/pid
    read -u 0 curmd5 curpath < ${rundir}/md5sum
    # Require whitespace after the pid: a bare prefix match would take a
    # dead mfd 123 for alive whenever some process 1234 exists.
    if ! ps ax | grep -qs "^ *${curpid:-NONE}[[:space:]]"; then
      vecho " mfd ($curpid) went away."
      curpid=
      start=1
    fi
  fi
  # auto means restart if running mfd is different than one requested
  read -u 0 md5 path < <(md5sum ${mfd})
  # XXX stands in for a failed md5sum so the comparison cannot match.
  if [ "${md5:-XXX}${path}" != "${curmd5}${curpath}" ]; then
    vecho " Running mfd is different."
    start=1
  fi
  # Only honor skip request if an mfd is currently running
  if [ "${start_mfd}" = "skip" ] && [ "${curpid}" ]; then
    start=
  fi
  if [ "${start}" ] || [ "${start_mfd}" = "force" ]; then
    if [ "${curpid}" ]; then
      vecho " Killing mfd pid ${curpid}"
      kill ${curpid}
      sleep 2
    fi
    vecho " Starting new Master Fabric Daemon (port ${mfd_port})"
    mkdir -p ${rundir} /var/log/${sys}/
    ${mfd} -d "${mfd_dir}" -p ${mfd_port} ${verbose:+--loglevel DEBUG} \
      --pid_file ${rundir}/pid --log_file /var/log/${sys}/mfd.log \
      || die 1 "mfd launch failed"
    md5sum ${mfd} > ${rundir}/md5sum
  else
    vecho " Using existing Master Fabric Daemon (pid ${curpid})"
  fi
}
# port_in_use <port>
# Succeeds when a TCP connection to localhost:<port> can be opened.
# Bash's /dev/tcp pseudo-device is used because, with the number of ports
# open on the SSP, processing `netstat -n -A inet` or /proc/net/tcp takes
# way too long (~7-10 seconds).
port_in_use() {
  local port=$1
  # The subshell keeps the probe's descriptor from outliving the check.
  if ( : < /dev/tcp/localhost/${port} ) 2>/dev/null; then
    return 0
  fi
  return 1
}
# get_free_ports [portbase] [num]
# Print <num> (default 1) TCP ports, separated by spaces, that currently
# have no listener on localhost, scanning upwards from <portbase>
# (default 49152, the start of the dynamic port range 49152-65535).
# Returns 1 and prints nothing on stdout when the scan passes port 65535
# before enough ports were found; previously such out-of-range numbers
# were reported as free because connecting to them can never succeed.
get_free_ports() {
  local portbase="${1:-49152}"
  local num="${2:-1}"
  local -r max_port=65535
  local -a ports=()
  # Find the requested number of ports starting at portbase
  while (( ${#ports[@]} < num )); do
    if (( portbase > max_port )); then
      echo "get_free_ports: no free port left below $(( max_port + 1 ))" >&2
      return 1
    fi
    if ! port_in_use "${portbase}"; then
      ports+=("${portbase}")
    fi
    portbase=$(( portbase + 1 ))
  done
  echo "${ports[*]}"
}
# socat_serve <image> <port>
# Serve <image> read-only on TCP <port> with a forking socat listener
# running in the background. The listener's pid is appended to both
# ${socat_pids} and ${pidlist}. Dies with status 1 if the listener is gone
# one second after launch.
socat_serve() {
  local img="$1" port="$2"
  local try= pid=
  socat -U TCP4-LISTEN:$port,fork,reuseaddr OPEN:"$img",rdonly &
  pid=$!
  socat_pids+="${pid} "
  pidlist+="${pid} "
  # Give socat a moment to fail (e.g. port already bound) before checking.
  sleep 1
  if [ ! -e /proc/${pid} ]; then
    die 1 "Could not start NBD socat server"
  fi
}
# calc_mod_info
# Given the globals ${cluster} and ${mnum}, set the globals:
#   msp    - host name of the module's MSP ("<cluster>-msp<mnum>")
#   modnum - last octet of the MSP's address minus 100
calc_mod_info() {
  local mspip= last_octet=
  msp="${cluster}-msp${mnum}"
  mspip="$(get_hostip $msp)"
  # Calculate this from MSP IP so it works for frost and sleet
  last_octet=${mspip##*.}
  modnum="$(( ${last_octet} - 100 ))"
}
# setup_node_rootfs
# Setup the node rootfs (and image if needed):
#  1. make sure root's DSA public key from this host is listed in the node
#     rootfs' authorized_keys;
#  2. with create_node_img=skip, accept an existing ext2 image as is (dies
#     with status 1 if there is none);
#  3. for every *nbd root mode, bring the rootfs image up to date;
#  4. for root mode "socatnbd", serve the rootfs and kernel module images.
setup_node_rootfs() {
  echo -e "\nSetting up node rootfs image"
  if [ -e /root/.ssh/id_dsa.pub ]; then
    # Update Node ssh authorized keys if needed
    ssp_dsa_key=$(cat /root/.ssh/id_dsa.pub)
    # -F: the key is data, not a regular expression.
    if ! grep -qsF -- "${ssp_dsa_key}" "${root}/root/.ssh/authorized_keys"; then
      vecho "Updating root user authorized_keys in the node rootfs"
      mkdir -p "${root}/root/.ssh/"
      echo "${ssp_dsa_key}" >> "${root}/root/.ssh/authorized_keys"
      chmod 755 "${root}/root/.ssh/"
      chmod 600 "${root}/root/.ssh/authorized_keys"
    fi
  fi
  # Only honor skip request if an image already exists
  if [ "${create_node_img}" = "skip" ]; then
    # -q: this is only a test, the matching line must not be printed.
    if file "${root_img}" | grep -qs "ext2 filesystem"; then
      vecho "Skipping checks, using existing image: ${root_img}"
      return 0
    else
      die 1 "No valid root image found at: ${root_img}"
    fi
  fi
  # All NBD modes require the rootfs image to be up to date
  case ${root_mode} in
    *nbd)
      ${mkext2img} --indent 2 ${verbose:+-v} \
        --img ${root_img} --dir ${root} \
        || die 1 "Failed node root image creation"
      ;;
  esac
  # Only socatnbd requires us to serve the image
  if [ "${root_mode}" = "socatnbd" ]; then
    ### Serve node rootfs image using socat ###
    socat_serve ${root_img} ${root_port} \
      || die 1 "Failed to serve rootfs image via socat"
    socat_serve ${modules_img} ${kmod_port} \
      || die 1 "Failed to serve kernel module image via socat"
  fi
}
# setup_node_kmodules <initramfs-dir>
# Setup the runtime kernel modules directory (and image if needed).
#  - copies the boot-time modules (sc*.ko, plus lustre when it is the root
#    mode) into <initramfs-dir>, runs depmod there and strips debug info;
#  - copies every configured module tree into ${modules_dir}, runs depmod,
#    restores the modules.* timestamps and strips debug info;
#  - for *nbd root modes, rebuilds the kernel module image.
# The kernel version is taken from the "Linux version" string in
# ${vmlinux}. Dies with status 1 on any failed step.
setup_node_kmodules() {
  local kver= output= mdir= ref_file=
  local idir=$1
  [ "${idir}" ] || die 1 "setup_node_kmodules called without initramfs dir"
  kver=$(strings ${vmlinux} | grep "^Linux version" | awk '{print $3}')
  # ---- Modules needed for boot in initramfs ----
  # Copy in the kernel modules for boot
  vecho " Copying kernel modules for boot"
  # Patterns are quoted so they reach rsync/find untouched even when the
  # current directory happens to contain matching files.
  rsync -aKm --include='sc*.ko' --exclude='**.ko' ${kernel_modules}/ ${idir}/ \
    || die 1 "kernel boot modules copy failed"
  if [ "${root_mode}" = "lustre" ]; then
    vecho " Copying lustre modules for boot"
    rsync -aKm ${lustre_modules}/ ${idir}/ \
      || die 1 "kernel boot modules copy failed"
  fi
  # Recreate module dependencies for boot modules
  vecho " Running depmod on ${idir}/lib/modules/${kver}"
  output=$(depmod -b ${idir} -e -F ${kernel_files}/System.map ${kver} 2>&1) \
    || die 1 "failed depmod on ${idir}/lib/modules/${kver}:\n${output}"
  vecho "${output}"
  # Strip debug info from kernel modules in the initramfs
  find ${idir}/lib/modules -type f -a -name '*.ko' -a -print0 \
    | xargs -0 -I fl scstrip --strip-debug fl \
    || die 1 "strip of kernel boot modules failed"
  # ---- Modules needed runtime in node rootfs ----
  # Copy in the kernel modules to rootfs modules dir
  vecho " Copying kernel modules into node rootfs"
  mkdir -p ${modules_dir}/lib64
  ln -sf lib64 ${modules_dir}/lib
  # The *_modules settings may be empty or hold several trees, so they
  # are expanded unquoted on purpose.
  for mdir in ${kernel_modules} ${lustre_modules} \
    ${myrinet_modules} ${myrimx_modules} ${qlogic_modules} ${sk98lin_modules} \
    ${openib_modules} ${other_modules}; do
    rsync -aKm ${mdir}/ ${modules_dir}/ \
      || die 1 "kernel modules copy failed"
  done
  # Recreate module dependencies for node rootfs
  local kdir=${modules_dir}/lib/modules/${kver}
  vecho " Running depmod on ${kdir}"
  output=$(depmod -b ${modules_dir} -e -F ${kernel_files}/System.map \
    ${kver} 2>&1) || die 1 "failed depmod on ${kdir}:\n${output}"
  vecho "${output}"
  # Keep modules.* timestamps to avoid unneeded rootfs updates
  for ref_file in ${kernel_modules}/lib/modules/${kver}/modules.*; do
    touch -r ${ref_file} ${kdir}/$(basename ${ref_file}) \
      || die 1 "Could not set timestamp on kernel modules.* files"
  done
  # Strip debug info from kernel modules in the node rootfs
  find ${modules_dir}/lib/modules -type f -a -name '*.ko' -a -print0 \
    | xargs -0 -I fl scstrip --strip-debug --preserve-dates fl \
    || die 1 "strip of kernel boot modules failed"
  # All NBD modes require the kernel NBD image to be up to date
  case ${root_mode} in
    *nbd)
      ${mkext2img} --indent 2 ${verbose:+-v} --free 10 \
        --img ${modules_img} --dir ${modules_dir}/lib/modules \
        || die 1 "Failed module image creation"
      ;;
  esac
}
# setup_msp_data
# Build the MSP boot image ${scbootdir}/msp_full.bin: generate a
# replacement rc.msh start script and a hosts file, then use ${romfstool}
# to add them, the attnd/mspscand/mspledd binaries from ${scand_dir} and
# (when present) the SSP's TZ file to a copy of ${uclinux}.
# Dies with status 1 if image generation fails.
#
# In the rc.msh heredoc, unescaped $(...) and ${...} are expanded now, on
# the SSP; everything written with a backslash (\$var, \`cmd\`) is left
# for the MSP's shell to evaluate at boot.
# NOTE(review): the generated script stores "\$?" (an exit status) in
# /var/run/attnd.pid right after starting attnd in the background; a pid
# would be "\$!" - confirm what reads that file before changing it.
setup_msp_data() {
  local scand_hash=
  vecho " Building MSP image"
  # Start script sets up MSPnet networks and runs attnd/scand
  vecho " Creating MSP rc.msh replacement: ${scbootdir}/rc.msh"
  # For legacy reasons for diags we generate this and store it on
  # the MSP. We should try to remove this at some point as it's
  # essentially redundant with get_msp_hash().
  scand_hash="$(cat ${scand_dir}/{attnd,mspscand,mspledd} | md5sum) ${scand_options}"
  # Replacement for rc.msh
  cat > ${scbootdir}/rc.msh <<-EOF
# Local Network Config
ifconfig lo 127.0.0.1
route add -net 127.0.0.0 netmask 255.0.0.0 lo
# Start DHCP Client and sync on it getting a config.
slotid=\`cat /proc/slotid\`
slotid=\`printf "%02x" \$slotid\`
envtool > /dev/urandom # More entropy for patched dhcpcd
dhcpcd -D -H -p -i msp-kernel-\$slotid-$(get_msp_hash) -a eth0 &
while true; do [ -e /var/tmp/dhcpc/dhcpcd-eth0.info ] && break; done
# Set the system time and spawn the rdate update client script.
rdate -s msp-ssp.scsystem
msh /etc/rdate.msh msp-ssp.scsystem &
# Renew the lease after time is set, fix for SiCortex Bug 2997 et al.
dhcpcd -n eth0
####################################
## Additions to startup MSP programs
####################################
hostname | sed 's/^.*-msp//' > /tmp/mspnum
mspnum=\`cat /tmp/mspnum\`
hostname | sed 's/^..\\(.\\).*$/\1/' > /tmp/sysnum
sysnum=\`cat /tmp/sysnum\`
case \`hostname\` in
sf*) octet3=\`expr 100 + \$sysnum\` ;;
sx*) octet3=\`expr 100 + \$sysnum + 4\` ;;
sca*) octet3=\`expr 100 + \$mspnum\` ;;
sc0*) octet3=\`expr 100 + \$mspnum\` ;;
sc1*) octet3=\`expr 100 + \$mspnum + 4\` ;;
sci*) octet3=\`expr 100 + \$mspnum\` ;;
scx*) octet3=\`expr 100 + \$mspnum\` ;;
esac
net_triple=$(get_netblock).\$octet3
for i in $(echo $(seq 0 $(( nodes_per_module - 1 )) )); do
localnet="\${net_triple}.\`expr 200 + \$i\`"
remotenet="\${net_triple}.\`expr 100 + \$i\`"
ifconfig msp\${i} \${localnet} pointopoint \${remotenet}
done
/attnd &
echo \$? > /var/run/attnd.pid
/mspscand ${scand_options} || exit 1
# We can't just use \$? as mspscand execs itself...
grep -l "^/mspscand" /proc/*/cmdline \
| awk -F/ '{print \$3}' \
> /var/run/mspscand.pid
/mspledd &
/bin/sh -c ps | grep [s]cand | awk '{print \$1}' > /tmp/ps.tmp
echo \$! \`cat /tmp/ps.tmp\` ${scand_hash} > /tmp/scand.hash
EOF
  cat > ${scbootdir}/hosts <<-EOF
$(get_hostip msp-ssp.scsystem) msp-ssp.scsystem msp-ssp
EOF
  msp_tz_file=""
  if [ -f /var/state/msp/etc/TZ ]; then
    msp_tz_file="/var/state/msp/etc/TZ:./etc/TZ"
  fi
  # Create the new MSP uclinux image
  vecho " Generating MSP image: ${scbootdir}/msp_full.bin"
  cp ${uclinux} ${scbootdir}/msp_full.bin
  # msp_tz_file stays unquoted so that an empty value adds no argument.
  ${romfstool} ${scbootdir}/msp_full.bin \
    ${scbootdir}/rc.msh:./etc/rc.msh \
    ${scbootdir}/hosts:./etc/hosts \
    ${scand_dir}/attnd:./attnd \
    ${scand_dir}/mspscand:./mspscand \
    ${scand_dir}/mspledd:./mspledd \
    ${msp_tz_file} \
    > ${scbootdir}/romfstool.log || die 1 "MSP image generation failed"
}
# _voltfreq_field <TAG> <file>
# Print the second field of every line in <file> that starts with <TAG>.
_voltfreq_field() {
  awk "/^${1}/ {print \$2}" < "$2"
}

# setup_voltfreq_info
# Load alternate voltages (vddc, vddf, vddr, vddl) and clock frequencies
# (pclk, dclk, sclk, pcirefclk) into globals of the same names. Values come
# from /var/state/volt_freq_info.<partition> when it is readable; anything
# still empty is filled from the default file selected by the
# volt_freq_type command (types 1-3). Dies with status 2 on an unknown type
# or when only some of the four voltages are set.
setup_voltfreq_info() {
  local _clk
  # Load alternate voltages and clock frequencies
  volt_freq_file="/var/state/volt_freq_info.${partition}"
  if [ -r "$volt_freq_file" ]; then
    vddc=$(_voltfreq_field VDDC "$volt_freq_file")
    vddf=$(_voltfreq_field VDDF "$volt_freq_file")
    vddr=$(_voltfreq_field VDDR "$volt_freq_file")
    vddl=$(_voltfreq_field VDDL "$volt_freq_file")
    pclk=$(_voltfreq_field PCLK "$volt_freq_file")
    dclk=$(_voltfreq_field DCLK "$volt_freq_file")
    sclk=$(_voltfreq_field SCLK "$volt_freq_file")
    pcirefclk=$(_voltfreq_field PCIREFCLK "$volt_freq_file")
  fi
  # if not set, go with the defaults
  volt_freq_type=$(volt_freq_type ${verbose:+--verbose} -p ${partition} --route_info_file "/var/state/route_info.${partition}")
  case ${volt_freq_type} in
    1|2|3)
      volt_freq_default="/opt/sicortex/config/volt_freq_info.${volt_freq_type}.default"
      ;;
    *)
      die 2 "invalid volt_freq_type: ${volt_freq_type}"
      ;;
  esac
  if [ -r "$volt_freq_default" ]; then
    vddc=${vddc:-$(_voltfreq_field VDDC "$volt_freq_default")}
    vddf=${vddf:-$(_voltfreq_field VDDF "$volt_freq_default")}
    vddr=${vddr:-$(_voltfreq_field VDDR "$volt_freq_default")}
    vddl=${vddl:-$(_voltfreq_field VDDL "$volt_freq_default")}
    pclk=${pclk:-$(_voltfreq_field PCLK "$volt_freq_default")}
    dclk=${dclk:-$(_voltfreq_field DCLK "$volt_freq_default")}
    sclk=${sclk:-$(_voltfreq_field SCLK "$volt_freq_default")}
    pcirefclk=${pcirefclk:-$(_voltfreq_field PCIREFCLK "$volt_freq_default")}
  fi
  # Report each clock that ended up with a value (verbose mode only).
  for _clk in pclk dclk sclk pcirefclk; do
    if [ -n "${!_clk}" ]; then
      vecho "Alternate ${_clk} frequency specified: ${!_clk}"
    fi
  done
  if [ -n "${vddc}${vddf}${vddr}${vddl}" ]; then
    if [[ -z "${vddc}" || -z "${vddf}" || -z "${vddr}" || -z "${vddl}" ]]; then
      die 2 "${volt_freq_file} must contain all four or no voltages"
    fi
    vecho "Alternate voltages specified: ${vddc},${vddf},${vddl},${vddr}"
  fi
}
# Setup module data directories
# Builds everything the nodes boot from, inside ${scbootdir}:
#  - final kernel command line (${kargs}), logged to ${scboot_log}
#  - bootloader programs and a stripped copy of the kernel
#  - an initramfs staging tree (${scbootdir}/.initramfs) holding hosts,
#    ldap config, route info, /opt/sicortex/config, boot scripts, fabricd,
#    boot-time kernel modules and one boot_args.<slotid> file per module
#  - the initramfs cpio image and bootk.elf, via their generator Makefiles
# Dies (status 1, or 2 for an unknown partition type) on any failed step.
# Reads volt/freq globals (pclk, vddc, ...), so these must already be set
# by setup_voltfreq_info.
setup_node_data() {
local idir= mnum= modnum=
mkdir -p ${scbootdir}
cd ${scbootdir} # initramfs, bootk and bamf work in cwd
echo -e "\nCreating boot configuration"
# finalize kernel args since setup_voltfreq_info() has been run
kargs="linux"
# An alternate pclk overrides the cthlib clock default.
if [ -n "${pclk}" ]; then
kargs="${kargs} clk=${pclk}"
else
kargs="${kargs} clk=${SCv_pclk_mhz}"
fi
kargs="${kargs} console=msp0"
kargs="${kargs} loglevel=${loglevel}"
kargs="${kargs} maxcpus=6"
kargs="${kargs} bigphysarea=${SCv_kernel_bigphysarea}" # needed for scfab
kargs="${kargs} rdinit=/sbin/preinit" # the key to ramdisk booting
kargs="${kargs} ${append_args}"
# emit finalized volt/freq info and kernel args
(
echo " voltage vddc: ${vddc:-<default>}"
echo " voltage vddf: ${vddf:-<default>}"
echo " voltage vddr: ${vddr:-<default>}"
echo " voltage vddl: ${vddl:-<default>}"
echo " frequency pclk: ${pclk:-<default>}"
echo " frequency dclk: ${dclk:-<default>}"
echo " frequency sclk: ${sclk:-<default>}"
echo " frequency pcirefclk: ${pcirefclk:-<default>}"
echo " kargs: ${kargs}"
echo
)>> ${scboot_log}
vecho " Copying bootloader programs"
# Copy in the basic bootloader programs
cp ${boot1_dir}/boot[0-2].elf ${scbootdir} || die 1 "boot*.elf copy failed"
cp ${boot1_dir}/dmseg_gdb.elf ${scbootdir} || die 1 "dmseg_gdb.elf copy failed"
# Copy in and strip the kernel
cp ${vmlinux} ${scbootdir}/vmlinux || die 1 "vmlinux copy failed"
scstrip ${scbootdir}/vmlinux || die 1 "strip of vmlinux failed"
# Create initramfs base directories
vecho " Copying data into initramfs"
idir=${scbootdir}/.initramfs
mkdir -p ${idir}/sbin ${idir}/var/state/etc/openldap
mkdir -p ${idir}/var/state/etc/udev/rules.d
touch ${idir}/var/state/etc/udev/rules.d/70-persistent-net.rules
# Copy in hosts file
local hosts_file="/var/state/hosts"
cp $hosts_file ${idir}/$hosts_file || die 1 "$hosts_file copy failed"
# Append the loopback and IPv6 entries to the copied hosts file.
cat <<-EOF >> ${idir}/$hosts_file
127.0.0.1 localhost
# IPV6 versions of localhost and co
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts
EOF
# Copy in ldap.conf files that are modified to point to the SSP
cat /etc/ldap.conf | grep -v "^ssl " | sed 's/^host .*$/host ssp/' \
> ${idir}/var/state/etc/ldap.conf \
|| die 1 "ldap.conf copy failed"
cat /etc/openldap/ldap.conf | sed 's/^HOST .*$/HOST ssp/' \
> ${idir}/var/state/etc/openldap/ldap.conf \
|| die 1 "ldap.conf copy failed"
# Copy in route_info.* files
local route_info_file="/var/state/route_info.${partition}"
if [ -f $route_info_file ]; then
cp $route_info_file ${idir}/var/state/ \
|| die 1 "$route_info_file copy failed"
fi
# Copy in /opt/sicortex/config
local osc_dest="${idir}/var/state/opt_sicortex_config"
mkdir -p "${osc_dest}"
cp -a /opt/sicortex/config/* "${osc_dest}" || die 1 "${osc_dest} copy failed"
# Copy in the generated config elements as well
generated_config_dir=/var/state/config/${partition}
if [ -d "${generated_config_dir}" ] ; then
# NOTE(review): iterating over `ls` output splits file names on
# whitespace.
for file in $(ls ${generated_config_dir}/) ; do
cp ${generated_config_dir}/${file} ${osc_dest}/ \
|| die 1 "${osc_dest} copy failed"
done
fi
# Copy in the boot scripts
cp -a ${boot_scripts}/* ${idir}/ || die 1 "boot scripts copy failed"
# Copy in fabricd
cp ${fabricd} ${idir}/sbin/fabricd || die 1 "fabricd copy failed"
# Setup kernel modules
setup_node_kmodules ${idir}
#--------
# Honestly, this must go somewhere else.
#--------
# v---
for f in $SCv_initramfs_var_state_files; do
cp $f ${idir}/var/state/ \
|| die 1 "$f copy failed"
done
# ^--- This will replace this ---v
cp $CTHLIB ${idir}/var/state/cthlib \
|| die 1 "cthlib copy failed"
echo "_SCv_system_profile=$SCv_system_profile" > ${idir}/var/state/scprofile
# ----^
# Odd quoting should be fixed by changing eval's in cthlib
# One boot_args.<slotid> file is written per module in ${modules}.
for mnum in $modules; do
# Sets the globals msp and modnum for this module.
calc_mod_info
slotid=""
# The slot id is a per-system-type base plus a module or partition
# number.
case ${partition} in
scx) slotid=$(( 0x00 + mnum )) ;; # SC5832
sci) slotid=$(( 0x24 + mnum )) ;; # SC1458
sc0) slotid=$(( 0x30 + mnum )) ;; # SC648 Left
sc1) slotid=$(( 0x34 + mnum )) ;; # SC648 Right
sx*) slotid=$(( 0x3b + ${partition/sx} )) ;; # SC162
sca) slotid=$(( 0x38 )) ;; # SC072
sf*) slotid=$(( 0x3f )) ;; # SC24
*) die 2 "Unknown system type ${partition//[0-9]/}" ;;
esac
ssp_gw_ip="$(get_hostip mgt0-ssp0)"
# cthlib writes the first part of the boot arguments (its --boot_args
# mode); its stdout becomes the start of boot_args.<slotid>.
. $CTHLIB \
SCv_boot_inst=$scboot_inst \
SCv_boot_dbg_points="'$dbg_points'" \
SCv_booted_node_count=${population} \
SCv_mfd_port=${mfd_port} \
SCv_clock_port=${clock_port} \
SCv_my_modnum=${modnum} \
SCv_rootfs="'$(get_hostip ssp):${root}'" \
SCv_rootfs_n32="'$(get_hostip ssp):${root_n32}'" \
SCv_modules="'$(get_hostip ssp):${modules_dir}'" \
SCv_partition="$partition" \
--boot_args \
> ${idir}/var/state/boot_args.$slotid \
|| die 1 "cthlib call failed"
# Per-module values are appended by hand. \${NodeID} is escaped so it is
# written literally rather than expanded here.
cat >> ${idir}/var/state/boot_args.$slotid <<-EOF
SCv_module_count='${module_count}'
SCv_name='${cluster}'
SCv_module_id='${mnum}'
SCv_rootfs_mode='${root_mode}'
SCv_rootfs_srv_port='${root_port}'
SCv_rootfs_url='http://$(get_hostip ssp)${root_img}'
SCv_rootfs_name='${root_img}'
SCv_rootfs_n32_name='${root_n32_img}'
SCv_kmod_srv_port='${kmod_port}'
SCv_kmod_url='http://$(get_hostip ssp)${modules_img}'
SCv_kmod_name='${modules_img}'
SCv_dhcpc_args='-V ${cluster}-${mnum}'
SCv_my_fqdn="${cluster}-m${mnum}n\${NodeID}.scsystem"
SCv_ssp_gw_ip="${ssp_gw_ip}"
SCv_nodes_per_module='${nodes_per_module}'
EOF
done
#--------
# eval lets the escaped quotes group the multi-word make variables and
# lets ${redirect} (set outside this function) be parsed as shell syntax.
vecho " Building initramfs cpio image"
eval make -f ${initramfs_generator}/Makefile srcdir=${initramfs_generator} \
INITRAMFS_SOURCES=\"${initramfs_base} ${idir} ${initramfs_sources}\" \
CC=gcc ${redirect} || die 1 "initramfs build returned bad status $?"
vecho " Building bootk.elf"
eval make -f ${bootk_generator}/Makefile srcdir=${bootk_generator} \
PREFIX=mips64el-gentoo-linux-gnu- INITRAMFS=initramfs.cpio.gz \
VMLINUX=${scbootdir}/vmlinux KARGS=\"${kargs}\" \
${redirect} || die 1 "bootk build returned bad status $?"
}
#### start_msp and utility functions ####
# The per MSP state machine looks like this:
#
# o --> <CHECK STATE> <---------<-------+-----<----+
#             |                         |          |
#             +-> [timeout] --> <DIE>   |          |
#             |                         ^          |
#             +-> ['AWOL'] --> <DIE>    |          ^
#             |                         |          |
#             +-> ['msp_boot'] ---> <SET_REBOOTED> |
#             |                                    ^
#             +-> ['msp_kernel']                   |
#               |                                  |
#               +-> [badhash x 1] --> <REBOOT> -->-+
#               |                                  |
#               +-> [badhash x 2] --> <DIE>        ^
#               |                                  |
#               +-> [goodhash x 1, rebooted] --->--+
#               |
#               |
#               +-> [goodhash, not rebooted] --->--+
#               |                                  v
#               +-> [goodhash x 2] ----->----> <SUCCESS>
#
## List processing routines ##
# contains <item> <list_var> -> true if <item> is in $list_var
#
# <list_var> is the NAME of a variable holding a list in which every entry
# is followed by one space ("a b c "), the format append() produces.
# <item> is compared literally against whole entries only: a fragment of an
# entry, or an <item> containing glob characters, does not match.
# Returns 0 when <item> is present and 1 otherwise; the list is not modified.
contains() {
  # Prefixing the list with a space means every entry is surrounded by
  # spaces, so " <item> " can only match a complete entry.
  [[ " ${!2}" == *" ${1} "* ]]
}
# append <item> <list_var> -> append <item> to $list_var
#
# <list_var> is the NAME of the list variable.  The item is stored followed
# by a single space, so the list always ends with a separator.
append() {
  # Read the current value through indirect expansion and write the
  # extended value back into the variable named by $2.
  printf -v "$2" '%s' "${!2}${1} "
}
# remove <item> <list_var> -> remove <item> from $list_var
#
# <list_var> is the NAME of a variable holding a list in which every entry
# is followed by one space ("a b c ").  Every entry equal to <item> is
# removed; entries that merely end with <item> are left intact and glob
# characters in <item> are taken literally.  The list variable must not be
# named _rm_old or _rm_new (those names are used locally here).
remove() {
  # A leading space gives every entry a delimiter on both sides.
  local _rm_old=" ${!2}" _rm_new
  while :; do
    _rm_new=${_rm_old//" ${1} "/ }
    # Adjacent duplicates share a delimiter, so one pass can leave one of
    # them behind; repeat until nothing changes.
    [ "${_rm_new}" = "${_rm_old}" ] && break
    _rm_old=${_rm_new}
  done
  # Drop the helper space again before storing the result.
  printf -v "$2" '%s' "${_rm_new# }"
}
# Get the MSPs into the right state
#
# For every module in ${modules}, watches the MSP's state file and reboots
# the MSP (via telnet) when it reports a state other than the expected
# image hash.  Returns 0 once every MSP is in the "finished" list.
#
# Reads:  start_msp ("skip" returns immediately, "force" forces one reboot
#         of every MSP), modules, cluster, msp_state_dir, scbootdir,
#         msp_timeout, scboot_traps, mypid.
# Writes: pidlist (timer PID appended), mnum (loop variable, not local),
#         /tftproot/<msp>_linux.bin symlinks, ${scbootdir}/mspstate.timeout.
# Exits through die on: missing state file, timeout, unknown MSP name,
#         unrecognised state data, or a second bad state from the same MSP.
start_msp() {
# MSP state lists
# (space-terminated lists maintained with append/remove/contains)
local rebooted= goodhash= badhash= finished= pending= forced=
# Other locals
local start_time=$(date +%s) elapsed= state_file= state_files=
local msp= timestamp= state= timer_pid=
local msp_list= msp_file= msp_hash=$(get_msp_hash)
local timeout_file="${scbootdir}/mspstate.timeout"
echo -e "\nChecking Module Service Processors"
if [ "${start_msp}" = "skip" ]; then
vecho " Skipping MSP checks"
return 0
fi
# Tell policyd we're rebooting...
/usr/sbin/policydc env down
/usr/sbin/policydc env reset
# Per module setup
# Record each MSP name and its state file, and point the MSP's tftp image
# name at this boot's msp_full.bin.
for mnum in ${modules}; do
msp="${cluster}-msp${mnum}"
msp_file=/tftproot/${msp}_linux.bin
append ${msp} msp_list
state_file=${msp_state_dir}/${msp}
append ${state_file} state_files
[ -e ${state_file} ] || die 1 "State file ${state_file} missing"
rm -f ${msp_file}
ln -sf ${scbootdir}/msp_full.bin ${msp_file}
done
# With "force", every MSP is treated as FORCE the first time it is seen
[ "${start_msp}" = "force" ] && forced="${msp_list}"
# Start with an empty timeout file; the timer below writes "TIMEOUT" into
# it, which the read loop then sees as just another input line.
rm -f "${timeout_file}"
touch "${timeout_file}"
# Don't quote scboot_traps, they're multiple args
(
trap - ${scboot_traps}
sleep "${msp_timeout}"
echo "TIMEOUT" > "${timeout_file}"
) &
timer_pid=$!
pidlist="${pidlist}${timer_pid} "
# The <CHECK STATE> loop
# Each input line is split into "<msp> <timestamp> <state>".
while true; do
if read msp timestamp state; then
if [ "${msp}" = "TIMEOUT" ]; then
# Timer fired: report every MSP that never reached "finished"
pending="${msp_list} "
for msp in ${finished}; do
remove ${msp} pending
done
die 1 "Unable to setup the following MSP(s) in ${msp_timeout} seconds: ${pending}"
fi
if ! contains "${msp}" msp_list; then
die 1 "Unknown MSP: ${msp}"
fi
# User requested force reboot
# (applies once per MSP: the name is dropped from 'forced' afterwards)
if contains ${msp} forced; then
state=FORCE
remove ${msp} forced
fi
case ${state} in
AWOL) # Lease deleted and never came back
append ${msp} rebooted
vecho " ${msp}: has AWOL dhcp lease. May cause timeout."
;;
msp-boot*) # uboot dhcp
vecho " ${msp}: uboot dhcp"
append ${msp} rebooted
remove ${msp} finished
;;
msp-kernel-*-${msp_hash}) # uclinux: right hash
vecho " ${msp}: uclinux dhcp with correct state"
if ! contains ${msp} rebooted; then
# It was not rebooted and it is in the right state
append ${msp} finished
vecho " ${msp}: ready"
elif contains ${msp} goodhash; then
# We got two good states; uclinux is finished booting
append ${msp} finished
echo " ${msp}: rebooted and ready"
fi
# Recorded after the checks above, so the first good state of a rebooted
# MSP only marks it; the second one finishes it.
append ${msp} goodhash
;;
msp-kernel-*|UNKNOWN|FORCE) # uclinux: wrong hash, unknown, forced
vecho " ${msp}: uclinux dhcp with bad state: '${state}'"
if contains ${msp} badhash; then
# Second time means reboot failed
die 1 " ${msp}: rebooted to wrong MSP image."
else
echo " ${msp}: rebooting"
# Feed "reboot" and "exit" to a background telnet session on the MSP
(sleep 1; echo -e "reboot\nexit"; sleep 2) \
| telnet ${msp} >/dev/null 2>&1 &
fi
append ${msp} badhash
remove ${msp} finished
;;
*) # What's this?
die 1 " ${msp}: bad state data '${msp} ${timestamp} ${state}'"
;;
esac
fi
# When 'pending' is empty, all MSPs are sane
pending="${msp_list} "
for msp in ${finished}; do
remove ${msp} pending
done
if [ "${pending// /}" ]; then
elapsed=$(( $(date +%s) - start_time ))
vecho " Waiting (${elapsed}/${msp_timeout}s) for MSPs: ${pending//${cluster}-msp/}"
else
vecho " All MSPs ready"
break
fi
# Input: the last line of every state file and of the timeout file,
# followed as the files grow, with tail tied to process $mypid via --pid.
done < <(tail -n1 -q -f ${state_files} ${timeout_file} --pid=$mypid)
# Clean up the timer pid now, otherwise when scboot terminates we
# can get nasty looking "Terminated" messages
kill ${timer_pid}
# Tell policyd we're back...
/usr/sbin/policydc env up
return 0
}
# Push the bootloader, vmlinux, bootk, initramfs across with bamf
#
# Arguments: extra words appended to the bamf command line (callers in this
#            file pass "<msp>:<hex node mask>" targets, optionally after -w).
# Reads:     bamf, verbose, partition, scbootdir, vddc/vddf/vddl/vddr,
#            pclk, dclk, sclk, pcirefclk, pretend, scboot_inst.
# Writes:    cmd (global string holding the assembled command line; left
#            global as in the original in case other code reads it).
# With ${pretend} set the command is only printed.  Otherwise the function
# changes into ${scbootdir} and runs bamf, exiting through die when either
# the directory change or bamf itself fails.
bamf_linux() {
  local clk

  cmd="${bamf} ${verbose:+-v -v} --log_dir=/var/log/${partition}"
  cmd="${cmd} tftp://msp-ssp/${scbootdir#/tftproot}"

  # Voltages are passed as one comma-separated group if any of them is set
  if [ -n "${vddc}${vddf}${vddl}${vddr}" ]; then
    cmd="${cmd} --voltages ${vddc},${vddf},${vddl},${vddr}"
  fi

  # Each clock setting becomes "--<name> <value>" when it is non-empty
  for clk in pclk dclk sclk pcirefclk; do
    if [ -n "${!clk}" ]; then
      cmd="${cmd} --${clk} ${!clk}"
    fi
  done

  cmd="${cmd} $*"

  if [ "${pretend}" ]; then
    # Show what would be done
    echo " Pretend:"
    echo " cd ${scbootdir}"
    echo " ${cmd} --prefix=' bamf: '"
  else
    # We know the arp cache is hot for the MSPs
    logger -t scboot -- "booting [${scboot_inst}]"
    vecho " ${cmd} --prefix=\" bamf: \""
    # Never launch bamf from an unexpected working directory
    cd "${scbootdir}" || die 1 "Cannot change directory to ${scbootdir}"
    # cmd is expanded unquoted on purpose: it is a whitespace-separated
    # command line, not a single word
    ${cmd} --prefix=" bamf: " || die 1 "Failed loading linux"
  fi
  echo -e "Finished loading linux (kernel boot initiated)"
}
# Halt all nodes for this partition
#
# For module numbers 0 .. module_count-1, runs
#   ${bamf} --log_dir /tmp -r NA <partition>-msp<N>:<mask>
# with a mask covering all ${nodes_per_module} nodes.  A module is skipped
# when its state file under ${msp_state_dir} cannot be read or its last
# line contains "AWOL".  A failed halt is only a warning for module numbers
# listed in ${bad_mod_list}; for any other module it exits through die.
# All scratch variables are local, so callers' variables are not clobbered.
halt_nodes() {
  local n hmsp sfile sline hexhmask
  # Identical for every module, so compute the all-nodes mask once
  local hmask=$(( (1 << nodes_per_module) - 1 ))
  hexhmask="$(printf "%#x" "${hmask}")"

  echo -e "\nHalting all nodes"
  for (( n = 0; n < module_count; n++ )); do
    hmsp="${partition}-msp${n}"
    sfile="${msp_state_dir}/${hmsp}"

    # No readable state file: leave this MSP alone
    sline="$(tail -n1 "${sfile}" 2>/dev/null)" || continue
    # MSP is marked AWOL in its last state line: skip it
    if grep -q "AWOL" <<<"${sline}"; then
      continue
    fi

    if ${bamf} --log_dir /tmp -r NA "${hmsp}:${hexhmask}"; then
      continue
    fi

    # Halt failed: tolerated only for modules already known to be bad
    if grep -q "\<${n}\>" <<<"${bad_mod_list}"; then
      echo "Warning: Halt of nodes on ${hmsp} failed"
    else
      die 1 "Halt of nodes on ${hmsp} failed"
    fi
  done
}
# Cleanup the slurm state for boot
#
# Steps, in order:
#   1. cancel all jobs in partition ${partition}
#   2. mark every SLURM partition whose name starts with ${cluster} as
#      visible and up
#   3. feed scontrol a script that sets every node Down and then Resume
#      (output goes to ${scbootdir}/scontrol-idle.log)
#   4. feed scontrol a script that drains the modules in ${bad_mod_list} and
#      the nodes in ${bad_node_list} (output: ${scbootdir}/scontrol-drain.log)
# The script variables (x, m, n, node, nrange, NL, node_states, node_reason)
# are globals, exactly as before.
setup_slurm() {
  local slurm_state

  # Cancel any outstanding slurm jobs
  vecho -e "\nCancelling outstanding slurm jobs"
  scancel -p ${partition}

  # Mark the corresponding SLURM partitions as available
  vecho -e "\nMarking slurm partitions as available"
  for x in $(sinfo -a -h -o "%P" | grep "^${cluster}"); do
    scontrol update PartitionName=${x} Hidden=no State=up
  done

  NL=$'\n'
  # Node-number range used to address all nodes of one module at once
  nrange="[0-$((nodes_per_module-1))]"

  # Set all nodes "Down", then "Resume"
  # This clears any stuck jobs from the nodes,
  # as well as potentially stale "Drain" states from previous boots.
  node_states="version"
  for m in $(seq 0 $((module_count - 1))); do
    for slurm_state in Down Resume; do
      node_states+="${NL}update NodeName=${partition}-m${m}n${nrange} State=${slurm_state}"
    done
  done
  if [ -n "${node_states}" ]; then
    scontrol <<<"${node_states}${NL}quit" 1>${scbootdir}/scontrol-idle.log \
      || echo "Trouble setting nodes to SLURM 'Idle' state."
  fi

  # Mark bad modules and nodes in "Drain" state
  node_reason="NotInService"
  node_states="version"
  for m in ${bad_mod_list}; do
    node_states+="${NL}update NodeName=${partition}-m${m}n${nrange} State=Drain Reason=$node_reason"
  done
  for node in ${bad_node_list}; do
    # Split the partition-wide node ID into module and node-in-module
    m=$((node / nodes_per_module))
    n=$((node % nodes_per_module))
    node_states+="${NL}update NodeName=${partition}-m${m}n${n} State=Drain Reason=$node_reason"
  done
  if [ -n "${node_states}" ]; then
    scontrol <<<"${node_states}${NL}quit" 1>${scbootdir}/scontrol-drain.log \
      || echo "Trouble setting missing nodes to SLURM 'Drain' state."
  fi

  # TODO: set default partition if we know of one.
  # scontrol update PartitionName=${SCv_default_partition} Default=yes
}
# Clear ev1d state for this partition
#
# 1. Sends a "fetch" request to the service on localhost:1234 and, for the
#    second field of every reply line, deletes the SLURM partition
#    "<field>_clients" when sinfo lists anything for it.
# 2. Sends discard/add commands for this partition's keys to the same
#    service; exits through die if that nc call fails.
# NOTE(review): the fetch pattern names "sx2_fab" literally rather than
# using ${partition} - confirm this is intended.
clear_ev1d() {
  vecho -e "\nCleaning up FabriCache client partitions"
  ev1_str="fetch~0~sx2_fab .* mds \ndone\n"
  for i in $(printf '%b' "$ev1_str" | nc localhost 1234 | awk '{print $2}'); do
    part="${i}_clients"
    lines=$(sinfo -h -p $part 2> /dev/null | wc -l)
    # Nothing reported for this partition name: nothing to delete
    [ $lines != 0 ] || continue
    echo " deleting $part"
    scontrol delete PartitionName="$part"
  done

  vecho -e "\nClearing event daemon state"
  nc localhost 1234 <<EOF || die 1 "Cannot contact event daemon"
discard~^${partition}_.*
add~${partition}_nbd_rootfs_servers=
add~${partition}_mgtnet_probed=
done
EOF
}
# Boot the whole partition from scratch.
#
# Runs the setup steps in a fixed order, then builds one "<msp>:<hex mask>"
# target per module in ${modules} (nodes listed for that module in
# ${bad_nodes}, "<module> <node>" per line, are cleared from the mask) and
# hands the targets to bamf_linux.  For root modes ending in "nbd" it then
# issues a fetch for ${partition}_mgtnet_probed to the service on
# localhost:1234.  With ${msp_setup_only} set it stops (die 0) right after
# the MSPs are set up.
coldboot() {
  ### Setup everything needed for boot
  echo -e "\nBooting partition: ${partition}"
  setup_msp_data              # Setup MSP boot config data
  start_msp                   # Reboot/setup the MSPs if necessary
  if [ "${msp_setup_only}" ]; then
    die 0
  fi
  setup_voltfreq_info         # Load and check volt/freq settings
  setup_node_data             # Setup ICE9 boot config data
  halt_nodes                  # Halt nodes in this partition
  setup_node_rootfs           # Setup NBD rootfs/kernel image
  start_ev1d                  # Start/restart ev1d
  start_mfd                   # Start mfd for this Cortex
  setup_slurm                 # Setup slurm config for boot
  clear_ev1d                  # Clear ev1d state for this partition
  start_master_clock_agent    # Start master_clock_agent for this Cortex

  ### Boot Linux on the nodes ###
  echo -e "\nLoading and booting linux"
  local bamf_list=
  for mnum in $modules; do
    # presumably sets ${msp} for module ${mnum} - defined outside this view
    calc_mod_info

    # Start from "all nodes of the module" ...
    nmask=$(( (1 << nodes_per_module) - 1 ))
    # ... then remove bad nodes
    mod_bad_nodes=$(awk -v mod="${mnum}" '$0 ~ ("^" mod " ") {print $2}' <<<"${bad_nodes}")
    if [ -n "${mod_bad_nodes}" ]; then
      vecho " ${msp}: Not booting disabled nodes:" ${mod_bad_nodes}
      for node in ${mod_bad_nodes}; do
        # XOR this node from mask
        nmask=$(( nmask ^ (1 << node) ))
      done
    fi

    # nmask="node mask"
    printf -v hexmask "%#x" ${nmask}
    bamf_list+=" ${msp}:${hexmask}"
  done
  bamf_linux ${bamf_list} || die 1 "Linux boot failed"

  # If using an NBD mode then wait for the nodes to suck the image down
  if [[ ${root_mode} == *nbd ]]; then
    echo -e "\nWaiting for NBD server nodes"
    nbd_servers=$(
      echo -e "fetch~1~^${partition}_mgtnet_probed=[XM]{${population}}\ndone" \
        | nc localhost 1234
    ) || die 1 "Cannot contact event daemon"
  fi
}
# Refuse a warmboot when boot settings were overridden.
#
# Collects the names of all set variables starting with SCBOOT_ (except
# SCBOOT_PARTITION) plus the contents of ${dirty_flags}.  If either is
# non-empty, exits through "die 2" with a message listing them; otherwise
# returns 0.
check_warmboot_not_dirty() {
  local dirty= dirty_env= x

  for x in "${!SCBOOT_@}"; do
    case "${x}" in
      SCBOOT_PARTITION) ;;    # the only override a warmboot accepts
      *) dirty_env="${dirty_env} ${x}" ;;
    esac
  done

  # The \n\t sequences are passed to die literally, as before
  if [ "${dirty_env}" ]; then
    dirty="${dirty}\n\tenvironment variables: ${dirty_env}"
  fi
  if [ "${dirty_flags}" ]; then
    dirty="${dirty}\n\tcommandline flags: ${dirty_flags}"
  fi
  if [ "${dirty}" ]; then
    die 2 "Warmboot (--nodes) does not allow setting: ${dirty}"
  fi

  # Explicit success: a trailing "[ ... ] && die" would make the clean path
  # return status 1.
  return 0
}
# Reboot a single node, named by ${warmboot} in "m<module>n<node>" form.
#
# Marks the node Down in SLURM, loads linux onto just that node through
# "bamf_linux -w <partition>-msp<module>:0x<one-bit mask>", then marks the
# node Resume.  Exits through die if bamf_linux fails.
# Writes the global node_n (the node number within its module).
warmboot() {
  local data= module= node= bamf_list=

  scontrol update nodeName=${partition}-${warmboot} state=Down

  # "m<M>n<N>": drop the leading m, then split at the n
  data=${warmboot#m}
  module=${data%n*}
  node_n=${data#*n}

  # Single-bit mask selecting node N, in hex without prefix
  printf -v node "%x" $(( 1 << $node_n ))
  printf -v bamf_list "%s-msp%s:0x%s" "${partition}" "${module}" "${node}"

  # bug 5918 - warmboot does not support global clock
  # start_master_clock_agent # Start master_clock_agent for this Cortex
  #
  # FIXME start node agents on neighbors
  # x6:
  #neighbor=${partition}-mXnY
  #$(srun -p ${partition} -w ${neighbor} /bin/node_clock_agent -g DEBUG -p ${clock_port} )&

  echo -e "\nLoading and booting linux (warmboot m${module}n${node_n})"
  bamf_linux -w ${bamf_list} || die 1 "Linux boot failed"

  scontrol update nodeName=${partition}-${warmboot} state=Resume
}
#
# Derivative settings and sanity checking
#

# Identity of this boot: "<partition>-<epoch seconds>" plus a readable time
scboot_secs=$(date +%s)
scboot_inst="${partition}-${scboot_secs}"
scboot_time=$(date --date=@${scboot_secs} "+%F %T %Z")

# Per-partition working locations
scbootdir="${scbootdir_base}/${partition}"
modules_dir="${rootfs_base}/${partition}-kmod"
modules_img="${modules_dir}.img"

if [ -n "${warmboot}" ]; then
  check_warmboot_not_dirty
  # Warmboot requires temp directory
  noclean=1
fi

# For now, scboot must be run as root
euser="$(id -un)"
[ "$euser" = "root" ] \
  || die 2 "The scboot program must be run as 'root' user -- you are '${euser}'."

# Redirection text for commands that are run through eval
if [ "${verbose}" ]; then
  redirect=">/dev/null"
else
  redirect="&>/dev/null"
fi

[ "$(get_netblock)" ] || die 2 "INTERNAL_NETBLOCK not found in /proc/cmdline"
[ -d "${root}" ] || die 2 "${root} is not a directory"

# The boot monitor is only needed when boot tracking is requested
if [ "${track_boot}" ]; then
  [ -x "$(which ${scbootmon})" ] || die 2 "Could not find ${scbootmon}."
fi

case ${root_mode} in
  nfs|nbd|nfsnbd|socatnbd|httpnbd) : ;;
  lustre) die 2 "Lustre rootfs mode not yet supported" ;;
  *) die 2 "Invalid rootfs mode: ${root_mode}" ;;
esac
# Figure out some derived values
if [ -z "${cluster}" ]; then
  cluster=${partition}
fi

# Set values based on the type of system
#
# Every system type fixes the nodes per module and the module count, and
# places its four service ports at the same offset above the port bases.
mfd_port_base=6170
root_port_base=6270
kmod_port_base=6370
clock_port_base=6470
case ${partition} in
  sf*)      # SC24: offset is the partition number
    nodes_per_module=4 module_count=1
    _scboot_port_offset=${partition/sf/}
    ;;
  sca)      # SC072
    nodes_per_module=12 module_count=1
    _scboot_port_offset=0
    ;;
  sx*)      # SC162: offset is the partition number with a leading "1"
    nodes_per_module=27 module_count=1
    _scboot_port_offset=${partition/sx/1}
    ;;
  sc[01])   # SC648: offset is 0 or 1
    nodes_per_module=27 module_count=4
    _scboot_port_offset=${partition/sc/}
    ;;
  sci)      # SC1458
    nodes_per_module=27 module_count=9
    _scboot_port_offset=0
    ;;
  scx)      # SC5832
    nodes_per_module=27 module_count=36
    _scboot_port_offset=0
    ;;
  *)
    die 2 "Unknown system type ${partition//[0-9]/}"
    ;;
esac
# The offset is substituted as text so it is evaluated exactly as written
mfd_port=$(( mfd_port_base + ${_scboot_port_offset} ))
root_port=$(( root_port_base + ${_scboot_port_offset} ))
kmod_port=$(( kmod_port_base + ${_scboot_port_offset} ))
clock_port=$(( clock_port_base + ${_scboot_port_offset} ))
unset _scboot_port_offset

# One module number per line, 0 .. module_count-1
modules="$(seq 0 $(( module_count - 1 )) )"
population=$(( module_count * nodes_per_module ))
# population will change if we have a route_info file; save the original one
full_population=${population}

# Find first free port in the bucket
mfd_port=$(get_free_ports $mfd_port)
root_port=$(get_free_ports $root_port)
kmod_port=$(get_free_ports $kmod_port)
clock_port=$(get_free_ports $clock_port)
# Eliminate MSPs for missing modules
# route_info_${partition}
#
# When a readable route info file exists for this partition:
#   - report the jumper (placeholder) modules, bad modules, bad nodes and
#     bad links it lists,
#   - reduce ${population} accordingly,
#   - remove jumper and bad modules from ${modules},
#   - leave ${bad_mod_list} as the combined module numbers,
#     ${bad_node_list} as the bad node IDs, and ${bad_nodes} as
#     "<module> <node>" lines.
bad_link_file="/var/state/route_info.${partition}"
if [ -r "$bad_link_file" ]; then
  # check that the file is self-consistent
  if ! ${check_route_info} -c "$bad_link_file"; then
    die 1 "inconsistent route info file"
  fi

  # Build a set of regex patterns to filter modules
  # we need to sort before adding the ^ and $ for the numerical sort to work
  jumper_modules=$(awk '/^jumper module/ {print $3}' < "$bad_link_file" | sort -nu | awk '{print "^"$1"$"}')
  bad_modules=$(awk '/^bad module/ {print $3}' < "$bad_link_file" | sort -nu | awk '{print "^"$1"$"}')
  bad_nodes=$(awk '/^bad node/ {print $3}' < "$bad_link_file" | sort -nu)

  # mXnY format for display to the user
  # (module/node split uses nodes_per_module, like the conversion further
  # down, instead of a hard-coded 27)
  bad_nodes_mxny=$(awk '/^bad node/ {print $3}' < "$bad_link_file" | sort -nu \
    | awk -v npm="${nodes_per_module}" '{printf("m%dn%d ", $1 / npm, $1 % npm)}')
  bad_links_mxny=$(awk -v npm="${nodes_per_module}" \
    '/^bad link/ {printf("m%dn%d-rx%d ", $3 / npm, $3 % npm, $4)}' < "$bad_link_file" | sort -nu)
  # Links numbered rx3..rx5 are displayed as tx0..tx2
  bad_links_mxny=$(sed -e 's/rx3/tx0/g' -e 's/rx4/tx1/g' -e 's/rx5/tx2/g' <<<"$bad_links_mxny")

  jumper_module_count=$(wc -w <<<"$jumper_modules")
  bad_module_count=$(wc -w <<<"$bad_modules")
  bad_node_count=$(wc -w <<<"$bad_nodes")
  bad_link_count=$(wc -w <<<"$bad_links_mxny")

  # Plain number lists (regex anchors and newlines removed)
  jumper_mod_list=$(echo $jumper_modules | sed 's/[^0-9 ]//g')
  bad_mod_list=$(echo $bad_modules | sed 's/[^0-9 ]//g')
  bad_node_list=$(echo $bad_nodes | sed 's/[^0-9 ]//g')

  [ -n "$jumper_modules" ] && echo "Skipping $jumper_module_count placeholder modules: $jumper_mod_list"
  [ -n "$bad_modules" ] && echo "Skipping $bad_module_count disabled modules: $bad_mod_list"
  [ -n "$bad_nodes" ] && echo "Skipping $bad_node_count disabled nodes: $bad_nodes_mxny"
  [ -n "$bad_links_mxny" ] && echo "Skipping $bad_link_count disabled links: $bad_links_mxny"

  # Gross, I know.
  # TODO: make not gross
  population=$(( population - (jumper_module_count * nodes_per_module) - (bad_module_count * nodes_per_module) - bad_node_count ))
  if [ -n "$jumper_modules" ] || [ -n "$bad_modules" ] || [ -n "$bad_nodes" ]; then
    echo "Population reduced to: $population"
  fi

  # now combine the jumper modules and bad modules into one list
  # (sorted numerically before the anchors are added, as for the lists above)
  bad_modules=$(awk '/^(bad|jumper) module/ {print $3}' < "$bad_link_file" | sort -nu | awk '{print "^"$1"$"}')
  bad_module_count=$(wc -w <<<"$bad_modules")
  bad_mod_list=$(echo $bad_modules | sed 's/[^0-9 ]//g')

  # Filter the list of modules:
  # each pattern is an anchored module number, so only exact matches go
  if [ -n "$bad_modules" ]; then
    modules=$(grep -v -f <(cat <<<"$bad_modules") <(cat <<<"$modules"))
  fi

  # Convert bad node ID list into module + node number list
  bnl=""
  for node in ${bad_nodes}; do
    m=$((node / nodes_per_module))
    n=$((node % nodes_per_module))
    bnl=${bnl}${m}" "${n}$'\n'
  done
  bad_nodes="$bnl"
fi
# Print the effective settings in verbose mode or when they were asked for;
# in the latter case stop here, before anything on the system is touched.
if [ -n "${verbose}" ] || [ -n "${show_settings}" ]; then
  show_settings
  if [ -n "${show_settings}" ]; then
    die 0
  fi
fi
# ^---------- No changes to system
#==================================================================
# v---------- Changes made to system

# Create and lock temporary directory
scboot_lock="${scbootdir%%/}.scboot_lock" # NOT in the directory

# Get the temporary directory to a sane state and lock it
if [ -z "${noclean}" ]; then
  rm -rf ${scbootdir}
fi
mkdir -p ${scbootdir}
lock_tree ${scboot_lock}

# Log all the settings for this boot
show_settings >> ${scboot_log}
mkdir -p /var/log/scboot
set > /var/log/scboot/${partition}.vars
logger -t scboot -- "$orig_cmdline"

# Single-node warmboot when one was requested, full coldboot otherwise
if [ -z "${warmboot}" ]; then
  coldboot
else
  warmboot
fi

# Cleanup
unlock_tree ${scboot_lock}

# FIXME everything from this point on is inadequate and should be
# consolidated into something better

# start watching for errors from MFD in the background
(
  ${mfd_watcher} -P ${partition} -L WARNING ${verbose:+--loglevel DEBUG}
) &

# wait for errors from global clock in the background
(
  echo -e "\nWaiting for global clock completion"
  global_clock_result=$(
    echo -e "fetch~1~^${partition}_global_clock_state=\ndone" \
      | nc localhost 1234
  ) || die 1 "Cannot contact event daemon"
  if [ "$global_clock_result" != "${partition}_global_clock_state=done" ]; then
    die 1 "global clock sync failed: $global_clock_result\n"
  fi
  echo -e "\nglobal clock sync complete"
) &

# After a coldboot this process is replaced by the boot monitor
if [ -z "${warmboot}" ]; then
  exec ${scbootmon} "${scboot_inst}" "${population}" "${scbootmon_options}"
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment