Created
May 21, 2012 18:35
-
-
Save kanaka/2763820 to your computer and use it in GitHub Desktop.
scboot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # $Id: scboot 64763 2008-11-15 14:29:58Z rwoodscorwin $ | |
# Print the command-line help text on stdout and exit with status 2.
# Invoked for -h and for any option the argument parser does not recognize.
usage() {
  # ${0##*/} is the script's base name; unlike $(basename $0) it needs no
  # subprocess and is safe when the path contains whitespace.
  cat <<EOF

Usage: ${0##*/} [options]

Boot the nodes of a SiCortex system.

  ARG                       ENV VARIABLE              DESCRIPTION
  -a xxx                                              append xxx to kernel command line
  -f xxx                    SCBOOT_FABRICD            override fabricd
  -h                                                  show this message
  -i xxx                                              tgz, cpio, or directory of initramfs files
                                                      can be specified multiple times
  -k xxx                    SCBOOT_KERNEL_FILES       override kernel files directory
                                                      (containing vmlinux and System.map)
  -K xxx                    SCBOOT_KERNEL_MODULES     override kernel module directory
                                                      (default: <kernel_files>/modules)
  -l xxx                    SCBOOT_KERNEL             override vmlinux
                                                      (default: <kernel_files>/vmlinux)
  -L xxx                    SCBOOT_LUSTRE_MODULES     override lustre modules tree
                            SCBOOT_MYRINET_MODULES    override myrinet modules tree
                            SCBOOT_MYRIMX_MODULES     override myrinet-mx modules tree
                            SCBOOT_OPENIB_MODULES     override openib modules tree
                            SCBOOT_QLOGIC_MODULES     override qlogic modules tree
                            SCBOOT_SK98LIN_MODULES    override sk98lin modules tree
  -M xxx                    SCBOOT_OTHER_MODULES      additional kernel modules trees
                                                      can be specified multiple times
  -p xxx                    SLURM_PARTITION           override partition to boot
  -r xxx                    SCBOOT_ROOT               override path to node rootfs directory
  -R xxx                    SCBOOT_ROOT_MODE          node rootfs mode (nfs, nbd)
                                                      (default: nfs, nbd for 200+ node systems)
  -s xxx                    SCBOOT_SCAND_DIR          override directory with attnd and mspscand
  -u xxx                    SCBOOT_UCLINUX            override uclinux binary image
                            SCBOOT_BOOTK_GEN          override bootk generator directory
                            SCBOOT_INITRAMFS_GEN      override initramfs generator directory
                            SCBOOT_BAMF               override bamf executable
                            SCBOOT_BOOT1_DIR          override the boot program directory
                            SCBOOT_BOOT_SCRIPTS       override boot scripts directory
                            SCBOOT_CHECK_ROUTE_INFO   override check_route_info
                            SCBOOT_INITRAMFS          override the base initramfs directory
                            SCBOOT_MFD                override mfd (default: <mfd_dir>/mfd.py)
                            SCBOOT_MFD_DIR            override mfd directory
                            SCBOOT_MFD_WATCHER        override mfd_watcher
                            SCBOOT_GLOBAL_CLOCK       override global clock master agent
                            SCBOOT_MKEXT2IMG          override mkext2img executable
                            SCBOOT_MONITOR            override scbootmon executable
                            SCBOOT_ROMFSTOOL          override romfstool executable
  -v,--verbose                                        be verbose
  --show_settings                                     show internal settings based on defaults,
                                                      environment, and cmd line and then exit
  --loglevel='xxx'                                    override the default kernel loglevel
                            SCBOOT_LOGLEVEL           '' '' '' '' ''
  --noclean                                           do not clean up tftp working directory
  --scand_options='xxx'                               command line options for starting scand
  --nodes=mXnY                                        reboot a single node (warmboot)
  --msp_setup_only                                    exit after doing MSP setup

The following options take a value of 'force', 'skip' or 'auto'.
'auto' means that scboot will try to determine the most logical
action. 'force' means the action should always be taken and 'skip'
means the check should be skipped entirely.

  --start_mfd=[MODE]        restart MFD (default: force)
  --start_msp=[MODE]        restart MSP (default: auto)
  --create_node_img=[MODE]  update node rootfs image (default: auto)

EOF
  exit 2
}
# Prevent pollution from non SCBOOT_ prefix environment variables.
# Every unprefixed variable that is read (appended to, tested, or used as a
# default) before the script itself assigns it must be cleared here,
# otherwise a value inherited from the caller's environment leaks in.
verbose= pretend= noclean= cluster= track_boot= warmboot=
msp_setup_only= show_settings= start_mfd= start_msp= create_node_img=
partition= root= root_n32= root_mode= loglevel=
fabricd= scand_dir= uclinux= initramfs_base= boot1_dir=
kernel_files= kernel_modules= vmlinux= lustre_modules=
sk98lin_modules= qlogic_modules= myrinet_modules= myrimx_modules=
openib_modules= mfd= other_modules=
# Accumulated by the argument loop with "${var} $OPTARG".
append_args= initramfs_sources= scand_options=
# socat_pids is passed to kill by unlock_tree, so it must never be inherited.
socat_pids=
# Voltage/frequency overrides are filled in with ${var:-...} later on.
vddc= vddf= vddr= vddl= pclk= dclk= sclk= pcirefclk=

# Defaults for command line arguments
start_mfd=force           # mfd mode
start_msp=auto            # MSP mode
create_node_img=auto      # node rootfs image mode

# Defaults with only environment override
scdir="${SCBOOT_PREFIX}/opt/sicortex"
bootk_generator="${SCBOOT_BOOTK_GEN:-${scdir}/boot/bootk/default}"
initramfs_generator="${SCBOOT_INITRAMFS_GEN:-${scdir}/boot/initramfs/default}"
cluster_conf="${SCBOOT_CLUSTER_CONF:-/etc/sicortex.conf}"
mfd_dir="${SCBOOT_MFD_DIR:-${scdir}/mfd/default}"
scbootdir_base="${SCBOOT_DIR_BASE:-/tftproot/scboot_tmp}"
bamf="${SCBOOT_BAMF:-${SCBOOT_PREFIX}/usr/bin/bamf}"
master_clock_agent="${SCBOOT_GLOBAL_CLOCK:-${SCBOOT_PREFIX}/usr/sbin/master_clock_agent}"
mfd_watcher="${SCBOOT_MFD_WATCHER:-${SCBOOT_PREFIX}/usr/bin/mfd_watcher}"
check_route_info="${SCBOOT_CHECK_ROUTE_INFO:-${SCBOOT_PREFIX}/usr/bin/check_route_info}"
romfstool="${SCBOOT_ROMFSTOOL:-${SCBOOT_PREFIX}/sbin/romfstool}"
mkext2img="${SCBOOT_MKEXT2IMG:-${SCBOOT_PREFIX}/sbin/mkext2img}"
scbootmon="${SCBOOT_MONITOR:-${SCBOOT_PREFIX}/sbin/scbootmon_dstat}"
scbootmon_options="${SCBOOT_MONITOR_OPTIONS:---oneline}"
dbg_points="${SCBOOT_DBG_POINTS}"

# Bootscripts coupled to other SSP components so default to SSP path
boot_scripts="${SCBOOT_BOOT_SCRIPTS:-/opt/sicortex/bootscripts/default}"

# Misc settings
CTHLIB=/opt/sicortex/config/cthlib
msp_state_dir=/var/log/mspstate
scboot_log=/var/log/scboot.log
msp_timeout=240
mypid=$$
# Shell-quoted copy of the invocation, suitable for logging and re-running.
scboot_cmdline="${0}$(printf " %q" "$@")"
scboot_inst=                # set after argument processing
pidlist=                    # for tracking backgrounded subprocesses
scboot_traps="TERM QUIT INT EXIT"
dirty_flags=                # for tracking flag usage inconsistent with warmboot
| # | |
| # Utility functions | |
| # | |
# Abort the script.
# Arguments: $1 - exit status (defaults to 1 when omitted)
#            $2... - optional message; backslash escapes such as \n are
#                    interpreted (callers embed "\n" in their messages)
# Outputs:   the message, if any, on stderr
die() {
  local ret="${1:-1}"
  # Only shift when there is something to shift; a bare "die" is tolerated.
  [ $# -gt 0 ] && shift
  # Without a message print nothing rather than an empty line.
  if [ $# -gt 0 ]; then
    echo -e "$@" >&2
  fi
  exit "$ret"
}
# echo the arguments only when the global ${verbose} is non-empty.
# Returns 0 when quiet, so a trailing vecho never turns the enclosing
# function's return status into a failure; otherwise returns echo's status.
vecho() {
  [ -z "${verbose}" ] || echo "$@"
}
# Record the option currently being parsed (global ${param}) in the
# space-separated ${dirty_flags} list of flags inconsistent with warmboot.
DF() {
  dirty_flags+=" ${param}"
}
| # | |
| # Process arguments | |
| # | |
orig_cmdline="$0 $*"
while [ "$*" ]; do
  param=$1
  shift

  # Work out the option's value.  A following word with no leading dash
  # is taken as the argument of the option just read; failing that, a
  # "--name=value" form carries its value after the first "=".
  OPTARG=
  if [ -n "${1}" ] && [ "${1:0:1}" != "-" ]; then
    OPTARG="$1"
    shift
  elif [[ "${param}" == *=* ]]; then
    OPTARG="${param#*=}"
  fi

  # DF marks the option as one that is inconsistent with warmboot.
  case "${param}" in
    -a)
      append_args="${append_args} ${OPTARG}"
      DF
      ;;
    -f)
      fabricd="${OPTARG}"
      DF
      ;;
    -h)
      usage
      ;;
    -i)
      initramfs_sources="${initramfs_sources} ${OPTARG}"
      DF
      ;;
    -k)
      kernel_files="${OPTARG}"
      DF
      ;;
    -K)
      kernel_modules="${OPTARG}"
      DF
      ;;
    -l)
      vmlinux="${OPTARG}"
      DF
      ;;
    -L)
      lustre_modules="${OPTARG}"
      DF
      ;;
    -M)
      other_modules="${other_modules} ${OPTARG}"
      DF
      ;;
    -p)
      partition="${OPTARG}"
      ;;
    -r)
      root="${OPTARG}"
      DF
      ;;
    -R)
      root_mode="${OPTARG}"
      DF
      ;;
    -u)
      uclinux="${OPTARG}"
      DF
      ;;
    -s)
      scand_dir="${OPTARG}"
      DF
      ;;
    -v | --verbose)
      verbose=1
      ;;
    --show_settings)
      show_settings=1
      ;;
    --loglevel*)
      loglevel="${OPTARG}"
      DF
      ;;
    --noclean)
      noclean=1
      ;;
    --pretend)
      pretend=1
      ;;
    --scand_options*)
      scand_options="${OPTARG}"
      DF
      ;;
    --start_mfd*)
      start_mfd="${OPTARG}"
      DF
      ;;
    --start_msp*)
      start_msp="${OPTARG}"
      DF
      ;;
    --create_node_img*)
      create_node_img="${OPTARG}"
      DF
      ;;
    --msp_setup_only*)
      msp_setup_only=1
      DF
      ;;
    --nodes*)
      warmboot="${OPTARG}"
      ;;
    --trackboot)
      # ignored now - monitoring always on
      track_boot=1
      ;;
    *)
      usage
      ;;
  esac
done
# Defaults defined in cthlib
source "${CTHLIB}" \
  SCv_system_profile \
  SCv_pclk_mhz \
  SCv_kernel_bigphysarea \
  || die 1 "cthlib call failed"

# Find user specified partition
partition="${partition:-${SCBOOT_PARTITION:-${SLURM_PARTITION}}}"

# If partition is still not specified, identify a default boot partition
if [ -z "${partition}" ]; then
  case "${SCv_system_profile}" in
    sfx)
      # Each match is "<mac><whitespace><host>" for an sfN/sxN MSP entry.
      # grep -E replaces the obsolescent egrep.
      sfx_ethers=$(grep -E -o '^[^#]+s[xf][0-9]-msp[0-9]+' /etc/ethers)
      sfx_count=$(wc -l <<<"${sfx_ethers}")
      # Only pick a default when exactly one candidate line was found.
      if [ "${sfx_count}" -eq 1 ]; then
        # Host name is the last field; fields may be separated by
        # spaces or tabs.
        sfx_host="${sfx_ethers##*[[:space:]]}"
        partition="${sfx_host%%-*}"
      fi
      ;;
    sc)
      partition="sc1"
      ;;
    *)
      partition="${SCv_system_profile}"
      ;;
  esac
fi

if [ -z "${partition}" ]; then
  echo >&2 "Partition not set. Not sure which system to boot."
  echo >&2 "The partition is set with the -p argument"
  echo >&2 "or with the SLURM_PARTITION environment variable."
  die 2
fi
# Defaults that come from global config via cthlib
# These are partition specific so partition must be set first
source "${CTHLIB}" \
  "SCv_partition=${partition}" \
  SCv_rootfs \
  SCv_rootfs_n32 \
  SCv_rootfs_mode \
  SCv_kernel_files \
  SCv_kernel_boot_loglevel \
  SCv_append_kargs \
  || die 1 "cthlib call failed"

# Precedence for each setting: command line, then SCBOOT_* environment
# variable, then the cthlib (SCv_*) value, then the built-in default.
rootfs_base="${scdir}/rootfs"
root="${root:-${SCBOOT_ROOT:-${SCv_rootfs:-${rootfs_base}/default}}}"
root_n32="${root_n32:-${SCBOOT_ROOT_N32:-${SCv_rootfs_n32:-${rootfs_base}/build.n32}}}"
root_mode="${root_mode:-${SCBOOT_ROOT_MODE:-${SCv_rootfs_mode}}}"
kernel_files="${kernel_files:-${SCBOOT_KERNEL_FILES:-${SCv_kernel_files:-${scdir}/kernel/linux/default}}}"
loglevel="${loglevel:-${SCBOOT_LOGLEVEL:-${SCv_kernel_boot_loglevel}}}"
append_args="${append_args:-${SCv_append_kargs}}"
# -a accumulates with a leading space; drop it.
append_args="${append_args# }"

# Defaults with environment and command line overrides
fabricd="${fabricd:-${SCBOOT_FABRICD:-${scdir}/fabricd/default}}"
scand_dir="${scand_dir:-${SCBOOT_SCAND_DIR:-${scdir}/msp/default/usr/bin/}}"
uclinux="${uclinux:-${SCBOOT_UCLINUX:-${scdir}/msp/uclinux/default/msp_linux.bin}}"

# Defaults with environment and command line overrides that are based
# on other arguments
# Operands are quoted so a path with whitespace or glob characters is
# passed to readlink as a single word.
root_img="$(readlink -f "${root}").img"
root_n32_img="$(readlink -f "${root_n32}").img"
# ${root##*:} drops everything up to the last ":" in root.
initramfs_base="${initramfs_base:-${SCBOOT_INITRAMFS:-${root##*:}/boot/initramfs}}"
boot1_dir="${boot1_dir:-${SCBOOT_BOOT1_DIR:-${root##*:}/boot/bin}}"
kernel_modules="${kernel_modules:-${SCBOOT_KERNEL_MODULES:-${kernel_files}/modules}}"
vmlinux="${vmlinux:-${SCBOOT_KERNEL:-${kernel_files}/vmlinux}}"
lustre_modules="${lustre_modules:-${SCBOOT_LUSTRE_MODULES:-${scdir}/kernel/lustre/default/modules}}"
myrinet_modules="${myrinet_modules:-${SCBOOT_MYRINET_MODULES:-${scdir}/kernel/myrinet/default/modules}}"
myrimx_modules="${myrimx_modules:-${SCBOOT_MYRIMX_MODULES:-${scdir}/kernel/myrinet-mx/default/modules}}"
openib_modules="${openib_modules:-${SCBOOT_OPENIB_MODULES:-${scdir}/kernel/openib/default/modules}}"
qlogic_modules="${qlogic_modules:-${SCBOOT_QLOGIC_MODULES:-${scdir}/kernel/qlogic/default/modules}}"
sk98lin_modules="${sk98lin_modules:-${SCBOOT_SK98LIN_MODULES:-${scdir}/kernel/marvell/default/modules}}"
mfd="${mfd:-${SCBOOT_MFD:-${mfd_dir}/mfd.py}}"

# Accumulate both environment and command line
other_modules="${SCBOOT_OTHER_MODULES} ${other_modules}"
| # | |
| # Operational functions | |
| # | |
# Dump the effective settings on stdout, one "name: value" line each.
# Path-like settings go through secho, which appends the version that a
# "default" path component resolves to; the rest are printed verbatim.
show_settings() {
  local item

  printf '\n'
  printf '%s\n' "Arguments / environment settings:"
  printf '%s\n' " full cmdline: ${scboot_cmdline}"
  printf '%s\n' " boot instance: ${scboot_inst} (${scboot_time})"
  printf '%s\n' " append_args: ${append_args}"
  printf '%s\n' " cluster_conf: $cluster_conf"
  for item in boot1_dir bootk_generator boot_scripts; do
    secho "${item}"
  done
  printf '%s\n' " cluster: ${cluster}"
  secho initramfs_base
  printf '%s\n' " initramfs_sources: ${initramfs_sources}"
  for item in initramfs_generator kernel_files kernel_modules vmlinux \
    lustre_modules myrinet_modules myrimx_modules openib_modules \
    qlogic_modules sk98lin_modules; do
    secho "${item}"
  done
  printf '%s\n' " other_modules: ${other_modules}"
  for item in mfd_dir mfd mfd_watcher fabricd; do
    secho "${item}"
  done
  # Label and variable name coincide for these, so print them by name.
  for item in bamf master_clock_agent romfstool mkext2img update_modules \
    scbootmon bad_mod_list bad_node_list partition; do
    printf ' %s: %s\n' "${item}" "${!item}"
  done
  secho root
  secho root_n32
  printf '%s\n' " root_mode: ${root_mode}"
  for item in scand_dir uclinux check_route_info; do
    secho "${item}"
  done
  for item in verbose show_settings noclean pretend scand_options \
    msp_setup_only; do
    printf ' %s: %s\n' "${item}" "${!item}"
  done
  printf '\n'
  for item in start_mfd start_msp create_node_img; do
    printf ' %s: %s\n' "${item}" "${!item}"
  done
  printf '\n'
  for item in scbootdir modules_dir mfd_port root_port kmod_port \
    nodes_per_module module_count; do
    printf ' %s: %s\n' "${item}" "${!item}"
  done
  printf '%s\n' " node population: $population"
  printf '%s\n' " msp hash: $(get_msp_hash)"
  printf '\n'
}
# Take the lock at the given path by creating a symlink there whose target
# is this script's pid, and install unlock_tree_trap on ${scboot_traps}.
# Arguments: $1 - path of the lock symlink
# Globals:   mypid, scboot_traps (read)
# Exits (via die) when a live scboot holds the lock or the symlink cannot
# be created.
lock_tree() {
  local path=$1 owner=
  vecho >&2 "Locking ${path}"
  if [ -L "${path}" ]; then
    # Lock exists, if locker is gone, remove the lock
    owner=$(readlink "${path}")
    # A link target that is not a plain number cannot be a pid.
    case "${owner}" in
      '' | *[!0-9]*) owner_pid=NONE ;;
      *) owner_pid=${owner} ;;
    esac
    # Check exactly that pid.  Matching "^ *<pid>" against ps output
    # would also match any longer pid that merely starts with the same
    # digits.
    if grep -qs scboot "/proc/${owner_pid}/cmdline"; then
      die 1 "Lock exists on ${path} (pid: ${owner})"
    else
      echo "Removing stale lock on ${path} (pid: ${owner})"
      rm "${path}"
    fi
  fi
  # ln -s fails if the path already exists, so creation doubles as the
  # test-and-set.
  if ln -s "${mypid}" "${path}" 2>/dev/null; then
    # Don't quote scboot_traps, they're multiple args
    trap "unlock_tree_trap ${path}" ${scboot_traps}
  else
    die 1 "Failed to obtain lock on ${path}"
  fi
}
# Release the lock taken by lock_tree.
# Arguments: $1 - path of the lock symlink
# Globals:   mypid, scboot_traps, socat_pids (read)
# Clears the traps, kills any socat image servers, then removes the symlink
# if it still points at our pid; otherwise exits via die.
unlock_tree() {
  local path=$1 owner=
  # Don't quote scboot_traps, they're multiple args
  trap - ${scboot_traps}
  # Terminate any running socat image servers
  # (socat_pids is a space-separated list, hence unquoted)
  [ -z "${socat_pids}" ] || kill ${socat_pids}
  owner=$(readlink "${path}")
  vecho >&2 "Unlocking ${path}"
  if [ "${owner}" = "${mypid}" ]; then
    rm -f "${path}"
  else
    die 1 "Lock corruption on ${path}!! (us: ${mypid}, owner ${owner})"
  fi
}
# Trap handler installed by lock_tree: kill tracked background processes,
# release the lock named by $1 and exit with status 1.
unlock_tree_trap() {
  local lock_path=$1

  echo >&2 "Caught signal, cleaning up."
  # pidlist is a space-separated list, so it is expanded unquoted.
  [ -z "${pidlist}" ] || {
    vecho >&2 "Cleaning up background processes: ${pidlist}"
    kill ${pidlist} 2>/dev/null
    pidlist=
  }
  unlock_tree ${lock_path}
  die 1
}
# Resolve a host name with "getent hosts" and print its first address.
# Arguments: $1 - host name
# Outputs:   the address on stdout
# Retries once a second, up to six lookups in total, then exits via die.
# NOTE: when called inside $(...), die only ends the substitution's
# subshell; the caller has to check the status itself.
get_hostip() {
  local host="$1"
  local ip=""
  local count=0 # local, so callers' variables are not clobbered
  while :; do
    # "exit" after the first record: getent may print several lines.
    ip="$(getent hosts "${host}" | awk '{print $1; exit}')"
    # Stop as soon as we have an answer, even on the last attempt.
    [ -n "${ip}" ] && break
    count=$((count + 1))
    if [ "${count}" -gt 5 ]; then
      die 1 "Could not resolve $host"
    fi
    sleep 1
  done
  echo "$ip"
}
# Print the value of INTERNAL_NETBLOCK= (digits and dots) taken from the
# kernel command line in /proc/cmdline; prints an empty line if absent.
get_netblock() {
  local found=
  # Drop lines that do not mention INTERNAL_NETBLOCK, then reduce the
  # remaining ones to the value following the "=".
  found=$(sed \
    -e '/INTERNAL_NETBLOCK/!d' \
    -e 's:^.*INTERNAL_NETBLOCK=\([0-9\.]*\).*$:\1:' \
    /proc/cmdline)
  # Deliberately unquoted, as before: collapses whitespace/newlines.
  echo ${found}
}
# Print an md5 digest that identifies the MSP setup: it covers the md5sum
# listing of the uclinux image and the attnd/mspscand/mspledd binaries,
# the scand options and the internal netblock.
get_msp_hash() {
  local file_sums= digest= rest=
  file_sums="$(md5sum ${uclinux} \
    ${scand_dir}/attnd ${scand_dir}/mspscand ${scand_dir}/mspledd)"
  # md5sum prints "<digest>  -" for stdin; keep only the digest field.
  # The echo arguments are unquoted on purpose (whitespace is collapsed).
  read -r digest rest < <(
    echo ${file_sums} ${scand_options} $(get_netblock) | md5sum
  )
  echo ${digest}
}
# Utility function for show_settings, shows version in parens.
# Arguments: $1 - NAME of the variable to show (not its value)
# Outputs:   " NAME: value" and, when the value contains "default",
#            " (X)" where X is the last path component that the path up
#            to and including the first "default" resolves to.
secho() {
  # ${!1} reads the variable named by $1 without resorting to eval.
  local var=$1 val=${!1} real=
  if [[ "${val}" == *default* ]]; then
    # Cut the path after the first "default" and resolve that symlink.
    real=$(readlink -f "${val/default*/default}")
    real=" (${real##*/})"
  fi
  echo " $var: $val$real"
}
# Start/restart ev1d due to socket descriptor leak
start_ev1d() {
  # The sfx system profile has multiple partitions sharing a single
  # ev1d and is not prone to leaking ev1d sockets so don't restart it.
  if [ "${SCv_system_profile}" = "sfx" ]; then
    return 0
  fi

  echo "Restarting ev1d"
  /etc/init.d/ev1d restart
}
# Start master_clock_agent for this Cortex
# Globals: partition, master_clock_agent, clock_port, full_population,
#          verbose, warmboot (read)
# Kills the agent recorded in /var/run/master_clock_agent/<partition>/pid
# if it is still running, launches a new one and records the md5sum of
# the launched executable next to the pid file.
start_master_clock_agent() {
  local curpid= sys=${partition}
  local rundir=/var/run/master_clock_agent/${sys}

  # Find already running master_clock_agent
  if [ -e "${rundir}/pid" ]; then
    read -r curpid < "${rundir}/pid"
    # Test exactly that pid via /proc; grepping ps output for "^ *<pid>"
    # also matches longer pids that start with the same digits.
    if ! [[ "${curpid}" =~ ^[0-9]+$ ]] || [ ! -d "/proc/${curpid}" ]; then
      vecho " master_clock_agent ($curpid) went away."
      curpid=
    fi
  fi

  if [ -n "${curpid}" ]; then
    vecho " Killing master_clock_agent pid ${curpid}"
    kill "${curpid}"
    sleep 2
  fi

  echo -e "\nLaunching Master Clock Agent"
  mkdir -p "${rundir}" "/var/log/${sys}/"
  # The ${var:+...} expansions are unquoted on purpose: each must expand
  # to zero or several separate words.
  ${master_clock_agent} \
    --port ${clock_port} \
    -o ${full_population} \
    --partition "${partition}" \
    ${verbose:+--loglevel DEBUG} \
    --route_info_file "/var/state/route_info.${partition}" \
    ${warmboot:+--warmboot ${warmboot}} \
    ${warmboot:+--loglevel DEBUG} \
    --pid_file "${rundir}/pid" \
    --log_file "/var/log/${sys}/master_clock_agent.log" \
    || die 1 "master_clock_agent launch failed"
  md5sum ${master_clock_agent} > "${rundir}/md5sum"
}
# Start correct mfd for this Cortex
# Globals: partition, mfd, mfd_dir, mfd_port, start_mfd, verbose (read)
# State is kept in /var/run/mfd/<partition>/{pid,md5sum}.  A new mfd is
# started when start_mfd is "force", when the recorded one is gone, or
# when the md5sum/path of ${mfd} differs from the recorded one; "skip"
# suppresses the restart only while a recorded mfd is still running.
start_mfd() {
  local start= sys=${partition}
  local rundir=/var/run/mfd/${sys}
  local curpid= curmd5= curpath= md5= path=

  echo -e "\nChecking Master Fabric Daemon"

  # Find already running mfd
  if [ -e "${rundir}/pid" ]; then
    read -r curpid < "${rundir}/pid"
    read -r curmd5 curpath < "${rundir}/md5sum"
    # Test exactly that pid via /proc; grepping ps output for "^ *<pid>"
    # also matches longer pids that start with the same digits, which
    # would make a dead mfd look alive.
    if ! [[ "${curpid}" =~ ^[0-9]+$ ]] || [ ! -d "/proc/${curpid}" ]; then
      vecho " mfd ($curpid) went away."
      curpid=
      start=1
    fi
  fi

  # auto means restart if running mfd is different than one requested
  read -r md5 path < <(md5sum ${mfd})
  # XXX stands in for a missing digest so that it can never compare equal
  # to an empty recorded one.
  if [ "${md5:-XXX}${path}" != "${curmd5}${curpath}" ]; then
    vecho " Running mfd is different."
    start=1
  fi

  # Only honor skip request if an mfd is currently running
  if [ "${start_mfd}" = "skip" ] && [ -n "${curpid}" ]; then
    start=
  fi

  if [ -n "${start}" ] || [ "${start_mfd}" = "force" ]; then
    if [ -n "${curpid}" ]; then
      vecho " Killing mfd pid ${curpid}"
      kill "${curpid}"
      sleep 2
    fi
    vecho " Starting new Master Fabric Daemon (port ${mfd_port})"
    mkdir -p "${rundir}" "/var/log/${sys}/"
    # ${verbose:+...} is unquoted on purpose: zero or two words.
    ${mfd} -d "${mfd_dir}" -p ${mfd_port} ${verbose:+--loglevel DEBUG} \
      --pid_file "${rundir}/pid" --log_file "/var/log/${sys}/mfd.log" \
      || die 1 "mfd launch failed"
    md5sum ${mfd} > "${rundir}/md5sum"
  else
    vecho " Using existing Master Fabric Daemon (pid ${curpid})"
  fi
}
# Succeed (status 0) when a TCP connection to localhost:$1 can be opened.
port_in_use() {
  local port=$1
  # Bash's /dev/tcp pseudo-device is used instead of parsing
  # `netstat -n -A inet` or /proc/net/tcp: with the number of ports open
  # on the SSP those take way too long (~7-10 seconds).
  # The subshell keeps the probe's descriptor and error message contained.
  if ( : < /dev/tcp/localhost/${port} ) 2>/dev/null; then
    return 0
  fi
  return 1
}
# Print a space-separated list of TCP ports that are not in use.
# Arguments: $1 - first port to try (default 49152, start of the dynamic
#                 range 49152-65535)
#            $2 - number of ports wanted (default 1)
# Returns:   0 on success; 1 if the search ran past port 65535 first, in
#            which case only the ports found so far are printed.
get_free_ports() {
  local portbase="${1:-49152}"
  local num="${2:-1}"
  local ports=""
  local found=0 # counted directly instead of forking wc per iteration

  # Find the requested number of ports starting at portbase
  while [ "${found}" -lt "${num}" ]; do
    # Beyond 65535 the connect probe always fails, which would make
    # every invalid port number look "free".
    if [ "${portbase}" -gt 65535 ]; then
      echo >&2 "get_free_ports: ran out of ports (wanted ${num}, found ${found})"
      echo $ports
      return 1
    fi
    if ! port_in_use "${portbase}"; then
      ports="${ports}${portbase} "
      found=$((found + 1))
    fi
    portbase=$((portbase + 1))
  done

  # Don't quote (drops the trailing separator)
  echo $ports
}
# Serve an image file read-only over TCP with a background socat.
# Arguments: $1 - image file, $2 - TCP port to listen on
# Globals:   socat_pids, pidlist (the new pid is appended to both)
# Exits via die if the socat process is gone one second after launch.
socat_serve() {
  local image_path="$1" listen_port="$2"
  local server_pid=

  socat -U TCP4-LISTEN:${listen_port},fork,reuseaddr OPEN:"${image_path}",rdonly &
  server_pid=$!
  socat_pids="${socat_pids}${server_pid} "
  pidlist="${pidlist}${server_pid} "

  # Give socat a moment to fail (e.g. port already bound) before checking.
  sleep 1
  if [ ! -e /proc/${server_pid} ]; then
    die 1 "Could not start NBD socat server"
  fi
}
# Given cluster and mnum calculate other module info
# Globals: cluster, mnum (read); msp, modnum (written)
# Exits via die if the MSP host name cannot be resolved.
calc_mod_info() {
  local mspip=

  # Calculate module specifics
  msp="${cluster}-msp${mnum}"
  # get_hostip runs in a subshell here, so its own die cannot stop us;
  # without this check an empty address would yield modnum=-100.
  mspip="$(get_hostip "${msp}")" \
    || die 1 "Could not determine the IP address of ${msp}"

  # Calculate this from MSP IP so it works for frost and sleet
  # (last octet of the address minus 100)
  modnum="$(( ${mspip##*.} - 100 ))"
}
# Setup the node rootfs (and image if needed)
# Globals: root, root_img, root_mode, create_node_img, mkext2img, verbose,
#          modules_img, root_port, kmod_port (read); ssp_dsa_key (written)
# Steps: make sure root's DSA public key is in the node rootfs'
# authorized_keys, (re)build the rootfs image for *nbd root modes, and
# serve the rootfs and module images via socat for "socatnbd".
setup_node_rootfs() {
  echo -e "\nSetting up node rootfs image"

  if [ -e /root/.ssh/id_dsa.pub ]; then
    # Update Node ssh authorized keys if needed
    ssp_dsa_key=$(cat /root/.ssh/id_dsa.pub)
    # -F: the key is a literal string, not a regular expression.
    if ! grep -qsF -- "${ssp_dsa_key}" "${root}/root/.ssh/authorized_keys"; then
      vecho "Updating root user authorized_keys in the node rootfs"
      mkdir -p "${root}/root/.ssh/"
      echo "${ssp_dsa_key}" >> "${root}/root/.ssh/authorized_keys"
      chmod 755 "${root}/root/.ssh/"
      chmod 600 "${root}/root/.ssh/authorized_keys"
    fi
  fi

  # Only honor skip request if an image already exists
  if [ "${create_node_img}" = "skip" ]; then
    # -q: only the status matters, keep file's output off stdout.
    if file "${root_img}" | grep -qs "ext2 filesystem"; then
      vecho "Skipping checks, using existing image: ${root_img}"
      return 0
    else
      die 1 "No valid root image found at: ${root_img}"
    fi
  fi

  # All NBD modes require the rootfs image to be up to date
  case "${root_mode}" in
    *nbd)
      ${mkext2img} --indent 2 ${verbose:+-v} \
        --img "${root_img}" --dir "${root}" \
        || die 1 "Failed node root image creation"
      ;;
  esac

  # Only socatnbd requires us to serve the image
  if [ "${root_mode}" = "socatnbd" ]; then
    ### Serve node rootfs image using socat ###
    socat_serve "${root_img}" "${root_port}" \
      || die 1 "Failed to serve rootfs image via socat"
    socat_serve "${modules_img}" "${kmod_port}" \
      || die 1 "Failed to serve kernel module image via socat"
  fi
}
# Setup the runtime kernel modules directory (and image if needed)
# Arguments: $1 - initramfs staging directory
# Globals:   vmlinux, kernel_files, kernel_modules, lustre_modules,
#            myrinet_modules, myrimx_modules, qlogic_modules,
#            sk98lin_modules, openib_modules, other_modules, modules_dir,
#            modules_img, root_mode, mkext2img, verbose (read)
# Populates the initramfs with the modules needed for boot and
# ${modules_dir} with the full runtime set, runs depmod and strips debug
# info in both, and builds the module image for *nbd root modes.
setup_node_kmodules() {
  local kver= output= mdir= ref_file=
  local idir=$1

  [ -n "${idir}" ] || die 1 "setup_node_kmodules called without initramfs dir"

  # Kernel release is the third word of the "Linux version ..." banner;
  # take the first occurrence only.
  kver=$(strings "${vmlinux}" | awk '/^Linux version/ {print $3; exit}')
  # depmod would silently fall back to the running kernel's release.
  [ -n "${kver}" ] || die 1 "could not determine kernel version of ${vmlinux}"

  # ---- Modules needed for boot in initramfs ----

  # Copy in the kernel modules for boot
  vecho " Copying kernel modules for boot"
  # Patterns are quoted so rsync, not the shell, interprets them.
  rsync -aKm '--include=sc*.ko' '--exclude=**.ko' "${kernel_modules}/" "${idir}/" \
    || die 1 "kernel boot modules copy failed"
  if [ "${root_mode}" = "lustre" ]; then
    vecho " Copying lustre modules for boot"
    rsync -aKm "${lustre_modules}/" "${idir}/" \
      || die 1 "kernel boot modules copy failed"
  fi

  # Recreate module dependencies for boot modules
  vecho " Running depmod on ${idir}/lib/modules/${kver}"
  output=$(depmod -b "${idir}" -e -F "${kernel_files}/System.map" "${kver}" 2>&1) \
    || die 1 "failed depmod on ${idir}/lib/modules/${kver}:\n${output}"
  vecho "${output}"

  # Strip debug info from kernel modules in the initramfs
  # ('*.ko' quoted: an unquoted glob expands against .ko files in the cwd)
  find "${idir}/lib/modules" -type f -a -name '*.ko' -a -print0 \
    | xargs -0 -I fl scstrip --strip-debug fl \
    || die 1 "strip of kernel boot modules failed"

  # ---- Modules needed runtime in node rootfs ----

  # Copy in the kernel modules to rootfs modules dir
  vecho " Copying kernel modules into node rootfs"
  mkdir -p "${modules_dir}/lib64"
  ln -sf lib64 "${modules_dir}/lib"
  # Unquoted on purpose: other_modules is a space-separated list.
  for mdir in ${kernel_modules} ${lustre_modules} \
    ${myrinet_modules} ${myrimx_modules} ${qlogic_modules} ${sk98lin_modules} \
    ${openib_modules} ${other_modules}; do
    rsync -aKm "${mdir}/" "${modules_dir}/" \
      || die 1 "kernel modules copy failed"
  done

  # Recreate module dependencies for node rootfs
  local kdir=${modules_dir}/lib/modules/${kver}
  vecho " Running depmod on ${kdir}"
  output=$(depmod -b "${modules_dir}" -e -F "${kernel_files}/System.map" \
    "${kver}" 2>&1) || die 1 "failed depmod on ${kdir}:\n${output}"
  vecho "${output}"

  # Keep modules.* timestamps to avoid unneeded rootfs updates
  for ref_file in "${kernel_modules}/lib/modules/${kver}"/modules.*; do
    touch -r "${ref_file}" "${kdir}/$(basename "${ref_file}")" \
      || die 1 "Could not set timestamp on kernel modules.* files"
  done

  # Strip debug info from kernel modules in the node rootfs
  find "${modules_dir}/lib/modules" -type f -a -name '*.ko' -a -print0 \
    | xargs -0 -I fl scstrip --strip-debug --preserve-dates fl \
    || die 1 "strip of kernel boot modules failed"

  # All NBD modes require the kernel NBD image to be up to date
  case "${root_mode}" in
    *nbd)
      ${mkext2img} --indent 2 ${verbose:+-v} --free 10 \
        --img "${modules_img}" --dir "${modules_dir}/lib/modules" \
        || die 1 "Failed module image creation"
      ;;
  esac
}
# Build the MSP boot image ${scbootdir}/msp_full.bin.
# Globals: scbootdir, scand_dir, scand_options, uclinux, romfstool,
#          nodes_per_module (read); msp_tz_file (written)
# Generates a replacement rc.msh start script and a hosts file in
# ${scbootdir}, then uses romfstool to add them, together with attnd,
# mspscand, mspledd and (if present) the TZ file, to a copy of ${uclinux}.
setup_msp_data() {
  local scand_hash= ssp_ip=

  vecho " Building MSP image"

  # Start script sets up MSPnet networks and runs attnd/scand
  vecho " Creating MSP rc.msh replacement: ${scbootdir}/rc.msh"

  # For legacy reasons for diags we generate this and store it on
  # the MSP. We should try to remove this at some point as it's
  # essentially redundant with get_msp_hash().
  scand_hash="$(cat ${scand_dir}/{attnd,mspscand,mspledd} | md5sum) ${scand_options}"

  # Replacement for rc.msh
  # The heredoc delimiter is unquoted: unescaped $(...) and ${...} are
  # expanded now, while generating the file; everything escaped with a
  # backslash (\$, \`) is left for the MSP's shell to evaluate at boot.
  cat > "${scbootdir}/rc.msh" <<EOF
# Local Network Config
ifconfig lo 127.0.0.1
route add -net 127.0.0.0 netmask 255.0.0.0 lo

# Start DHCP Client and sync on it getting a config.
slotid=\`cat /proc/slotid\`
slotid=\`printf "%02x" \$slotid\`
envtool > /dev/urandom # More entropy for patched dhcpcd
dhcpcd -D -H -p -i msp-kernel-\$slotid-$(get_msp_hash) -a eth0 &
while true; do [ -e /var/tmp/dhcpc/dhcpcd-eth0.info ] && break; done

# Set the system time and spawn the rdate update client script.
rdate -s msp-ssp.scsystem
msh /etc/rdate.msh msp-ssp.scsystem &

# Renew the lease after time is set, fix for SiCortex Bug 2997 et al.
dhcpcd -n eth0

####################################
## Additions to startup MSP programs
####################################
hostname | sed 's/^.*-msp//' > /tmp/mspnum
mspnum=\`cat /tmp/mspnum\`
hostname | sed 's/^..\\(.\\).*$/\1/' > /tmp/sysnum
sysnum=\`cat /tmp/sysnum\`
case \`hostname\` in
sf*) octet3=\`expr 100 + \$sysnum\` ;;
sx*) octet3=\`expr 100 + \$sysnum + 4\` ;;
sca*) octet3=\`expr 100 + \$mspnum\` ;;
sc0*) octet3=\`expr 100 + \$mspnum\` ;;
sc1*) octet3=\`expr 100 + \$mspnum + 4\` ;;
sci*) octet3=\`expr 100 + \$mspnum\` ;;
scx*) octet3=\`expr 100 + \$mspnum\` ;;
esac
net_triple=$(get_netblock).\$octet3
for i in $(echo $(seq 0 $(( nodes_per_module - 1 )) )); do
localnet="\${net_triple}.\`expr 200 + \$i\`"
remotenet="\${net_triple}.\`expr 100 + \$i\`"
ifconfig msp\${i} \${localnet} pointopoint \${remotenet}
done
/attnd &
echo \$! > /var/run/attnd.pid
/mspscand ${scand_options} || exit 1
# We can't just use \$? as mspscand execs itself...
grep -l "^/mspscand" /proc/*/cmdline \
| awk -F/ '{print \$3}' \
> /var/run/mspscand.pid
/mspledd &
/bin/sh -c ps | grep [s]cand | awk '{print \$1}' > /tmp/ps.tmp
echo \$! \`cat /tmp/ps.tmp\` ${scand_hash} > /tmp/scand.hash
EOF
  # Two fixes relative to the previous script text:
  #  - attnd.pid now records \$! (pid of the backgrounded attnd); \$? was
  #    the exit status of the "&" launch and therefore not a pid.
  #  - awk's \$3 is escaped; unescaped it was expanded while generating
  #    the file, so awk printed the whole /proc/<pid>/cmdline path
  #    instead of the pid.

  # Resolve the SSP address up front so a lookup failure is not silently
  # written into the hosts file as an empty address.
  ssp_ip=$(get_hostip msp-ssp.scsystem) \
    || die 1 "Could not resolve msp-ssp.scsystem for the MSP hosts file"
  cat > "${scbootdir}/hosts" <<EOF
${ssp_ip} msp-ssp.scsystem msp-ssp
EOF

  msp_tz_file=""
  if [ -f /var/state/msp/etc/TZ ]; then
    msp_tz_file="/var/state/msp/etc/TZ:./etc/TZ"
  fi

  # Create the new MSP uclinux image
  vecho " Generating MSP image: ${scbootdir}/msp_full.bin"
  cp "${uclinux}" "${scbootdir}/msp_full.bin" \
    || die 1 "MSP image copy failed"
  # Arguments after the image are <source>:<destination inside the romfs>.
  # ${msp_tz_file:+...} contributes no argument at all when there is no
  # TZ file.
  ${romfstool} "${scbootdir}/msp_full.bin" \
    "${scbootdir}/rc.msh:./etc/rc.msh" \
    "${scbootdir}/hosts:./etc/hosts" \
    "${scand_dir}/attnd:./attnd" \
    "${scand_dir}/mspscand:./mspscand" \
    "${scand_dir}/mspledd:./mspledd" \
    ${msp_tz_file:+"${msp_tz_file}"} \
    > "${scbootdir}/romfstool.log" || die 1 "MSP image generation failed"
}
# Load voltage and clock frequency settings into the globals vddc, vddf,
# vddr, vddl, pclk, dclk, sclk and pcirefclk.
# Values come from /var/state/volt_freq_info.<partition> when readable;
# anything still empty is filled from the default file selected by the
# volt_freq_type command.  Exits via die on an unknown type or when only
# some of the four voltages are set.
setup_voltfreq_info() {
  # "<shell variable>:<key at start of line in the info files>"
  local settings="vddc:VDDC vddf:VDDF vddr:VDDR vddl:VDDL pclk:PCLK dclk:DCLK sclk:SCLK pcirefclk:PCIREFCLK"
  local spec= name= key=

  # Load alternate voltages and clock frequencies
  volt_freq_file="/var/state/volt_freq_info.${partition}"
  if [ -r "$volt_freq_file" ]; then
    for spec in ${settings}; do
      name=${spec%%:*}
      key=${spec##*:}
      # Second field of the line(s) starting with the key.
      printf -v "${name}" '%s' \
        "$(awk "/^${key}/ {print \$2}" < "$volt_freq_file")"
    done
  fi

  # if not set, go with the defaults
  volt_freq_type=$(volt_freq_type ${verbose:+--verbose} -p ${partition} --route_info_file "/var/state/route_info.${partition}")
  case ${volt_freq_type} in
    [123])
      volt_freq_default="/opt/sicortex/config/volt_freq_info.${volt_freq_type}.default"
      ;;
    *)
      die 2 "invalid volt_freq_type: ${volt_freq_type}"
      ;;
  esac
  if [ -r "$volt_freq_default" ]; then
    for spec in ${settings}; do
      name=${spec%%:*}
      key=${spec##*:}
      # Only consult the default file for values that are still empty.
      if [ -z "${!name}" ]; then
        printf -v "${name}" '%s' \
          "$(awk "/^${key}/ {print \$2}" < "$volt_freq_default")"
      fi
    done
  fi

  for name in pclk dclk sclk pcirefclk; do
    if [ -n "${!name}" ]; then
      vecho "Alternate ${name} frequency specified: ${!name}"
    fi
  done

  if [ -n "${vddc}${vddf}${vddr}${vddl}" ]; then
    if [ -z "${vddc}" ] || [ -z "${vddf}" ] || [ -z "${vddr}" ] || [ -z "${vddl}" ]; then
      die 2 "${volt_freq_file} must contain all four or no voltages"
    fi
    vecho "Alternate voltages specified: ${vddc},${vddf},${vddl},${vddr}"
  fi
}
# Setup module data directories
#
# Assembles, inside ${scbootdir}, everything needed to boot the nodes:
#   - the final kernel command line (global kargs),
#   - the bootloader ELF files and a stripped copy of vmlinux,
#   - the .initramfs staging tree (hosts, ldap.conf files, route_info,
#     /opt/sicortex/config, generated config, boot scripts, fabricd,
#     kernel modules, cthlib and one boot_args.<slotid> file per module),
#   - the initramfs cpio image and bootk.elf, built with the generator
#     Makefiles.
# Must run after setup_voltfreq_info(), which finalizes pclk and friends.
# Every failed step ends the run through die().
# Globals written: kargs, generated_config_dir, slotid, ssp_gw_ip.
setup_node_data() {
  local idir= mnum= modnum=
  local cfg_file= state_src=

  mkdir -p "${scbootdir}"
  # initramfs, bootk and bamf work in cwd, so a failed cd must be fatal
  cd "${scbootdir}" || die 1 "cannot cd to ${scbootdir}"
  echo -e "\nCreating boot configuration"

  # finalize kernel args since setup_voltfreq_info() has been run
  kargs="linux"
  if [ -n "${pclk}" ]; then
    kargs="${kargs} clk=${pclk}"
  else
    kargs="${kargs} clk=${SCv_pclk_mhz}"
  fi
  kargs="${kargs} console=msp0"
  kargs="${kargs} loglevel=${loglevel}"
  kargs="${kargs} maxcpus=6"
  kargs="${kargs} bigphysarea=${SCv_kernel_bigphysarea}" # needed for scfab
  kargs="${kargs} rdinit=/sbin/preinit" # the key to ramdisk booting
  kargs="${kargs} ${append_args}"

  # emit finalized volt/freq info and kernel args
  {
    echo " voltage vddc: ${vddc:-<default>}"
    echo " voltage vddf: ${vddf:-<default>}"
    echo " voltage vddr: ${vddr:-<default>}"
    echo " voltage vddl: ${vddl:-<default>}"
    echo " frequency pclk: ${pclk:-<default>}"
    echo " frequency dclk: ${dclk:-<default>}"
    echo " frequency sclk: ${sclk:-<default>}"
    echo " frequency pcirefclk: ${pcirefclk:-<default>}"
    echo " kargs: ${kargs}"
    echo
  } >> "${scboot_log}"

  vecho " Copying bootloader programs"
  # Copy in the basic bootloader programs
  cp "${boot1_dir}"/boot[0-2].elf "${scbootdir}" || die 1 "boot*.elf copy failed"
  cp "${boot1_dir}/dmseg_gdb.elf" "${scbootdir}" || die 1 "dmseg_gdb.elf copy failed"
  # Copy in and strip the kernel
  cp "${vmlinux}" "${scbootdir}/vmlinux" || die 1 "vmlinux copy failed"
  scstrip "${scbootdir}/vmlinux" || die 1 "strip of vmlinux failed"

  # Create initramfs base directories
  vecho " Copying data into initramfs"
  idir=${scbootdir}/.initramfs
  mkdir -p "${idir}/sbin" "${idir}/var/state/etc/openldap"
  mkdir -p "${idir}/var/state/etc/udev/rules.d"
  touch "${idir}/var/state/etc/udev/rules.d/70-persistent-net.rules"

  # Copy in hosts file, then append the loopback / IPv6 entries
  local hosts_file="/var/state/hosts"
  cp "${hosts_file}" "${idir}/${hosts_file}" || die 1 "$hosts_file copy failed"
  cat <<-EOF >> "${idir}/${hosts_file}"
127.0.0.1 localhost
# IPV6 versions of localhost and co
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
ff02::3 ip6-allhosts
EOF

  # Copy in ldap.conf files that are modified to point to the SSP.
  # sed reads the files itself so that an unreadable source file is
  # reported instead of silently producing an empty copy.
  sed -e '/^ssl /d' -e 's/^host .*$/host ssp/' /etc/ldap.conf \
    > "${idir}/var/state/etc/ldap.conf" \
    || die 1 "ldap.conf copy failed"
  sed -e 's/^HOST .*$/HOST ssp/' /etc/openldap/ldap.conf \
    > "${idir}/var/state/etc/openldap/ldap.conf" \
    || die 1 "ldap.conf copy failed"

  # Copy in route_info.* files
  local route_info_file="/var/state/route_info.${partition}"
  if [ -f "${route_info_file}" ]; then
    cp "${route_info_file}" "${idir}/var/state/" \
      || die 1 "$route_info_file copy failed"
  fi

  # Copy in /opt/sicortex/config
  local osc_dest="${idir}/var/state/opt_sicortex_config"
  mkdir -p "${osc_dest}"
  cp -a /opt/sicortex/config/* "${osc_dest}" || die 1 "${osc_dest} copy failed"

  # Copy in the generated config elements as well (glob, not `ls` output,
  # so file names containing whitespace survive)
  generated_config_dir=/var/state/config/${partition}
  if [ -d "${generated_config_dir}" ]; then
    for cfg_file in "${generated_config_dir}"/*; do
      [ -e "${cfg_file}" ] || continue # empty directory: glob did not match
      cp "${cfg_file}" "${osc_dest}/" \
        || die 1 "${osc_dest} copy failed"
    done
  fi

  # Copy in the boot scripts
  cp -a "${boot_scripts}"/* "${idir}/" || die 1 "boot scripts copy failed"
  # Copy in fabricd
  cp "${fabricd}" "${idir}/sbin/fabricd" || die 1 "fabricd copy failed"
  # Setup kernel modules
  setup_node_kmodules "${idir}"

  #--------
  # Honestly, this must go somewhere else.
  #--------
  # v---
  # SCv_initramfs_var_state_files is a whitespace separated list: no quotes
  for state_src in $SCv_initramfs_var_state_files; do
    cp "${state_src}" "${idir}/var/state/" \
      || die 1 "${state_src} copy failed"
  done
  # ^--- This will replace this ---v
  cp "${CTHLIB}" "${idir}/var/state/cthlib" \
    || die 1 "cthlib copy failed"
  echo "_SCv_system_profile=$SCv_system_profile" > "${idir}/var/state/scprofile"
  # ----^

  # One boot_args.<slotid> file per module: the --boot_args output of
  # cthlib followed by the per-module settings appended below.
  # Odd quoting should be fixed by changing eval's in cthlib
  for mnum in $modules; do
    calc_mod_info
    slotid=""
    case ${partition} in
      scx) slotid=$(( 0x00 + mnum )) ;; # SC5832
      sci) slotid=$(( 0x24 + mnum )) ;; # SC1458
      sc0) slotid=$(( 0x30 + mnum )) ;; # SC648 Left
      sc1) slotid=$(( 0x34 + mnum )) ;; # SC648 Right
      sx*) slotid=$(( 0x3b + ${partition/sx} )) ;; # SC162
      sca) slotid=$(( 0x38 )) ;; # SC072
      sf*) slotid=$(( 0x3f )) ;; # SC24
      *) die 2 "Unknown system type ${partition//[0-9]/}" ;;
    esac
    ssp_gw_ip="$(get_hostip mgt0-ssp0)"
    . "${CTHLIB}" \
      SCv_boot_inst=$scboot_inst \
      SCv_boot_dbg_points="'$dbg_points'" \
      SCv_booted_node_count=${population} \
      SCv_mfd_port=${mfd_port} \
      SCv_clock_port=${clock_port} \
      SCv_my_modnum=${modnum} \
      SCv_rootfs="'$(get_hostip ssp):${root}'" \
      SCv_rootfs_n32="'$(get_hostip ssp):${root_n32}'" \
      SCv_modules="'$(get_hostip ssp):${modules_dir}'" \
      SCv_partition="$partition" \
      --boot_args \
      > "${idir}/var/state/boot_args.$slotid" \
      || die 1 "cthlib call failed"
    # \${NodeID} is left unexpanded on purpose: it is written literally
    cat >> "${idir}/var/state/boot_args.$slotid" <<-EOF
SCv_module_count='${module_count}'
SCv_name='${cluster}'
SCv_module_id='${mnum}'
SCv_rootfs_mode='${root_mode}'
SCv_rootfs_srv_port='${root_port}'
SCv_rootfs_url='http://$(get_hostip ssp)${root_img}'
SCv_rootfs_name='${root_img}'
SCv_rootfs_n32_name='${root_n32_img}'
SCv_kmod_srv_port='${kmod_port}'
SCv_kmod_url='http://$(get_hostip ssp)${modules_img}'
SCv_kmod_name='${modules_img}'
SCv_dhcpc_args='-V ${cluster}-${mnum}'
SCv_my_fqdn="${cluster}-m${mnum}n\${NodeID}.scsystem"
SCv_ssp_gw_ip="${ssp_gw_ip}"
SCv_nodes_per_module='${nodes_per_module}'
EOF
  done
  #--------

  # eval is required so that the redirection stored in ${redirect} applies
  vecho " Building initramfs cpio image"
  eval make -f ${initramfs_generator}/Makefile srcdir=${initramfs_generator} \
    INITRAMFS_SOURCES=\"${initramfs_base} ${idir} ${initramfs_sources}\" \
    CC=gcc ${redirect} || die 1 "initramfs build returned bad status $?"
  vecho " Building bootk.elf"
  eval make -f ${bootk_generator}/Makefile srcdir=${bootk_generator} \
    PREFIX=mips64el-gentoo-linux-gnu- INITRAMFS=initramfs.cpio.gz \
    VMLINUX=${scbootdir}/vmlinux KARGS=\"${kargs}\" \
    ${redirect} || die 1 "bootk build returned bad status $?"
}
| #### start_msp and utility functions #### | |
| # The per MSP state machine looks like this: | |
| # | |
| # o --> <CHECK STATE> <---------<-------+-----<----+ | |
| # | | | | |
| # +-> [timeout] --> <DIE> | | | |
| # | ^ | | |
| # +-> ['AWOL'] --> <DIE> | ^ | |
| # | | | | |
| # +-> ['msp_boot'] ---> <SET_REBOOTED> | | |
| # | ^ | |
| # +-> ['msp_kernel'] | | |
| # | | | |
| # +-> [badhash x 1] --> <REBOOT> -->-+ | |
| # | | | |
| # +-> [badhash x 2] --> <DIE> ^ | |
| # | | | |
| # +-> [goodhash x 1, rebooted] --->--+ | |
| # | | |
| # | | |
| # +-> [goodhash, not rebooted] --->--+ | |
| # | v | |
| # +-> [goodhash x 2] ----->----> <SUCCESS> | |
| # | |
| ## List processing routines ## | |
# contains <item> <list_var> -> true if <item> is in $list_var
# <list_var> is the NAME of a variable holding space-terminated words
# ("a b c "), the format produced by append(). The item is compared
# literally and must match a whole word, so "msp1" is not found in a list
# that only holds "scx-msp1" or "msp11".
contains() {
  # Pad with a leading space so every word is delimited on both sides.
  case " ${!2}" in
    *" ${1} "*) return 0 ;;
  esac
  return 1
}
# append <item> <list_var> -> append <item> to $list_var
# <list_var> is the NAME of the list variable; the item is stored followed
# by a single space, which is the separator the list routines rely on.
append() {
  printf -v "$2" '%s%s ' "${!2}" "$1"
}
# remove <item> <list_var> -> remove <item> from $list_var
# <list_var> is the NAME of a variable holding space-terminated words.
# Every whole-word occurrence of <item> is dropped; words that merely end
# with <item> (e.g. "11" when removing "1") are left intact, and the item
# is taken literally (no glob interpretation).
remove() {
  local __rm_padded=" ${!2}" __rm_needle=" ${1} "
  # Loop instead of a global replace: adjacent matches share a separator
  # and would otherwise be skipped.
  while [[ "${__rm_padded}" == *"${__rm_needle}"* ]]; do
    __rm_padded=${__rm_padded/"${__rm_needle}"/ }
  done
  printf -v "$2" '%s' "${__rm_padded# }"
}
# Get the MSPs into the right state
#
# Points every module's MSP boot image link at ${scbootdir}/msp_full.bin,
# then follows the per-MSP state files (plus a timeout file written by a
# background timer) with `tail -f` and runs the <CHECK STATE> loop until
# every MSP has reported the expected image hash, rebooting MSPs that
# report a wrong/unknown state (or all of them when start_msp=force).
# Does nothing but announce itself when the global start_msp is "skip".
# Fatal conditions (timeout, unknown MSP, failed reboot, bad state data,
# lost state stream) end the run through die().
# Returns: 0 once all MSPs are ready (or when skipped).
start_msp() {
  # MSP state lists
  local rebooted= goodhash= badhash= finished= pending= forced=
  # Other locals (assigned separately so `local` cannot mask a failure)
  local start_time= elapsed= state_file= state_files=
  local mnum= msp= timestamp= state= timer_pid=
  local msp_list= msp_file= msp_hash=
  local timeout_file="${scbootdir}/mspstate.timeout"

  start_time=$(date +%s)
  msp_hash=$(get_msp_hash)

  echo -e "\nChecking Module Service Processors"
  if [ "${start_msp}" = "skip" ]; then
    vecho " Skipping MSP checks"
    return 0
  fi

  # Tell policyd we're rebooting...
  /usr/sbin/policydc env down
  /usr/sbin/policydc env reset

  # Per module setup
  for mnum in ${modules}; do
    msp="${cluster}-msp${mnum}"
    msp_file=/tftproot/${msp}_linux.bin
    append ${msp} msp_list
    state_file=${msp_state_dir}/${msp}
    append ${state_file} state_files
    [ -e ${state_file} ] || die 1 "State file ${state_file} missing"
    rm -f ${msp_file}
    ln -sf ${scbootdir}/msp_full.bin ${msp_file}
  done
  [ "${start_msp}" = "force" ] && forced="${msp_list}"

  rm -f "${timeout_file}"
  touch "${timeout_file}"
  # Background timer: after msp_timeout seconds it writes a TIMEOUT line
  # that the loop below picks up through tail.
  # Don't quote scboot_traps, they're multiple args
  (
    trap - ${scboot_traps}
    sleep "${msp_timeout}"
    echo "TIMEOUT" > "${timeout_file}"
  ) &
  timer_pid=$!
  pidlist="${pidlist}${timer_pid} "

  # The <CHECK STATE> loop
  while true; do
    # read only fails on EOF, i.e. when tail is gone. Without this check
    # the loop would spin forever, because the timeout is delivered
    # through the same stream.
    if ! read -r msp timestamp state; then
      die 1 "Lost the MSP state stream while waiting for MSPs"
    fi

    if [ "${msp}" = "TIMEOUT" ]; then
      pending="${msp_list} "
      for msp in ${finished}; do
        remove ${msp} pending
      done
      die 1 "Unable to setup the following MSP(s) in ${msp_timeout} seconds: ${pending}"
    fi
    if ! contains "${msp}" msp_list; then
      die 1 "Unknown MSP: ${msp}"
    fi
    # User requested force reboot (applied once per MSP)
    if contains ${msp} forced; then
      state=FORCE
      remove ${msp} forced
    fi

    case ${state} in
      AWOL) # Lease deleted and never came back
        append ${msp} rebooted
        vecho " ${msp}: has AWOL dhcp lease. May cause timeout."
        ;;
      msp-boot*) # uboot dhcp
        vecho " ${msp}: uboot dhcp"
        append ${msp} rebooted
        remove ${msp} finished
        ;;
      msp-kernel-*-${msp_hash}) # uclinux: right hash
        vecho " ${msp}: uclinux dhcp with correct state"
        if ! contains ${msp} rebooted; then
          # It was not rebooted and it is in the right state
          append ${msp} finished
          vecho " ${msp}: ready"
        elif contains ${msp} goodhash; then
          # We got two good states; uclinux is finished booting
          append ${msp} finished
          echo " ${msp}: rebooted and ready"
        fi
        append ${msp} goodhash
        ;;
      msp-kernel-*|UNKNOWN|FORCE) # uclinux: wrong hash, unknown, forced
        vecho " ${msp}: uclinux dhcp with bad state: '${state}'"
        if contains ${msp} badhash; then
          # Second time means reboot failed
          die 1 " ${msp}: rebooted to wrong MSP image."
        else
          echo " ${msp}: rebooting"
          (sleep 1; echo -e "reboot\nexit"; sleep 2) \
            | telnet ${msp} >/dev/null 2>&1 &
        fi
        append ${msp} badhash
        remove ${msp} finished
        ;;
      *) # What's this?
        die 1 " ${msp}: bad state data '${msp} ${timestamp} ${state}'"
        ;;
    esac

    # When 'pending' is empty, all MSPs are sane
    pending="${msp_list} "
    for msp in ${finished}; do
      remove ${msp} pending
    done
    if [ "${pending// /}" ]; then
      elapsed=$(( $(date +%s) - start_time ))
      vecho " Waiting (${elapsed}/${msp_timeout}s) for MSPs: ${pending//${cluster}-msp/}"
    else
      vecho " All MSPs ready"
      break
    fi
  done < <(tail -n1 -q -f ${state_files} ${timeout_file} --pid=$mypid)

  # Clean up the timer pid now, otherwise when scboot terminates we
  # can get nasty looking "Terminated" messages
  kill ${timer_pid}
  # Tell policyd we're back...
  /usr/sbin/policydc env up
  return 0
}
# Push the bootloader, vmlinux, bootk, initramfs across with bamf
#
# Builds the bamf command line from ${bamf}, the boot directory URL and
# any alternate voltage/clock settings, appends the caller's arguments
# (module:nodemask targets and extra bamf options), then runs it from
# ${scbootdir}. When ${pretend} is set the command is only printed.
# Arguments: $* - appended verbatim to the bamf command line
# Globals written: cmd (the command line, minus --prefix)
# Dies when ${scbootdir} cannot be entered or when bamf fails.
bamf_linux() {
  cmd="${bamf} ${verbose:+-v -v} --log_dir=/var/log/${partition}"
  cmd="${cmd} tftp://msp-ssp/${scbootdir#/tftproot}"
  if [ -n "${vddc}${vddf}${vddl}${vddr}" ]; then
    cmd="${cmd} --voltages ${vddc},${vddf},${vddl},${vddr}"
  fi
  if [ -n "${pclk}" ]; then
    cmd="${cmd} --pclk ${pclk}"
  fi
  if [ -n "${dclk}" ]; then
    cmd="${cmd} --dclk ${dclk}"
  fi
  if [ -n "${sclk}" ]; then
    cmd="${cmd} --sclk ${sclk}"
  fi
  if [ -n "${pcirefclk}" ]; then
    cmd="${cmd} --pcirefclk ${pcirefclk}"
  fi
  cmd="${cmd} $*"

  if [ "${pretend}" ]; then
    # Show what would be done
    echo " Pretend:"
    echo " cd ${scbootdir}"
    echo " ${cmd} --prefix=' bamf: '"
  else
    # We know the arp cache is hot for the MSPs
    logger -t scboot -- "booting [${scboot_inst}]"
    vecho " ${cmd} --prefix=\" bamf: \""
    # bamf works in cwd: never run it from the wrong directory
    cd "${scbootdir}" || die 1 "Cannot cd to ${scbootdir}"
    # ${cmd} is deliberately unquoted: it is a space separated command line
    ${cmd} --prefix=" bamf: " || die 1 "Failed loading linux"
  fi
  echo -e "Finished loading linux (kernel boot initiated)"
}
# Halt all nodes for this partition
#
# For each module 0..module_count-1, reads the last line of the MSP state
# file and, unless that file is unreadable or reports AWOL, asks bamf to
# reset every node of the module. A bamf failure is fatal, except for
# modules listed in bad_mod_list, where it is only a warning.
# NOTE(review): MSP names are built from ${partition} here while
# start_msp() uses ${cluster} -- confirm both always agree.
halt_nodes() {
  echo -e "\nHalting all nodes"
  # One bit per node of a module; identical for every module.
  hmask=$(( (1<<$nodes_per_module) - 1 ))
  hexhmask="$(printf "%#x" ${hmask})"
  for n in $(seq 0 $((module_count-1)) ); do
    hmsp="${partition}-msp${n}"
    sfile="${msp_state_dir}/${hmsp}"
    # No readable state file: nothing to halt on this module
    sline="$(tail -n1 ${sfile} 2>/dev/null)" || continue
    # MSP is AWOL: skip it
    case "${sline}" in
      *AWOL*) continue ;;
    esac
    ${bamf} --log_dir /tmp -r NA ${hmsp}:${hexhmask} && continue
    # The halt failed: tolerated only for modules known to be bad
    if grep -q "\<$n\>" <<<"$bad_mod_list"; then
      echo "Warning: Halt of nodes on ${hmsp} failed"
    else
      die 1 "Halt of nodes on ${hmsp} failed"
    fi
  done
}
# Cleanup the slurm state for boot
#
# Cancels the partition's jobs, marks every SLURM partition whose name
# starts with ${cluster} as visible and up, then feeds two command batches
# to scontrol: one setting every node Down and then Resume, and one
# putting the nodes of bad_mod_list / bad_node_list into Drain. The
# scontrol output goes to scontrol-idle.log and scontrol-drain.log in
# ${scbootdir}.
setup_slurm() {
  # Cancel any outstanding slurm jobs
  vecho -e "\nCancelling outstanding slurm jobs"
  scancel -p ${partition}

  # Mark the corresponding SLURM partitions as available
  vecho -e "\nMarking slurm partitions as available"
  for x in $(sinfo -a -h -o "%P" | grep "^${cluster}"); do
    scontrol update PartitionName=${x} Hidden=no State=up
  done

  NL=$'\n'

  # Set all nodes "Down", then "Resume"
  # This clears any stuck jobs from the nodes,
  # as well as potentially stale "Drain" states from previous boots.
  # The batch always starts with "version", so it is never empty.
  node_states="version"
  for m in $(seq 0 $((module_count - 1))); do
    nrange="[0-$((nodes_per_module-1))]"
    node_states+="${NL}update NodeName=${partition}-m${m}n${nrange} State=Down"
    node_states+="${NL}update NodeName=${partition}-m${m}n${nrange} State=Resume"
  done
  scontrol <<<"${node_states}${NL}quit" 1>${scbootdir}/scontrol-idle.log \
    || echo "Trouble setting nodes to SLURM 'Idle' state."

  # Mark bad modules and nodes in "Drain" state
  node_states="version"
  node_reason="NotInService"
  for m in ${bad_mod_list}; do
    nrange="[0-$((nodes_per_module-1))]"
    node_states+="${NL}update NodeName=${partition}-m${m}n${nrange} State=Drain Reason=$node_reason"
  done
  for node in ${bad_node_list}; do
    # bad_node_list holds flat node ids: split into module and node number
    m=$((node / nodes_per_module))
    n=$((node % nodes_per_module))
    node_states+="${NL}update NodeName=${partition}-m${m}n${n} State=Drain Reason=$node_reason"
  done
  scontrol <<<"${node_states}${NL}quit" 1>${scbootdir}/scontrol-drain.log \
    || echo "Trouble setting missing nodes to SLURM 'Drain' state."

  # TODO: set default partition if we know of one.
  # scontrol update PartitionName=${SCv_default_partition} Default=yes
}
# Clear ev1d state for this partition
#
# First asks the event daemon (localhost:1234) for the entries matching
# the ev1_str request and deletes the SLURM partition "<name>_clients" for
# each returned name that sinfo knows about. Then discards every
# ${partition}_* entry in the daemon and re-adds empty
# nbd_rootfs_servers / mgtnet_probed entries. Dies when that second
# exchange with the daemon fails.
clear_ev1d() {
  vecho -e "\nCleaning up FabriCache client partitions"
  ev1_str="fetch~0~sx2_fab .* mds \ndone\n"
  # The second field of each reply line is used as the partition stem
  for i in $(printf '%b' "$ev1_str" | nc localhost 1234 | awk '{print $2}'); do
    part="${i}_clients"
    lines=$(sinfo -h -p $part 2> /dev/null | wc -l)
    if (( lines != 0 )); then
      echo " deleting $part"
      scontrol delete PartitionName="$part"
    fi
  done

  vecho -e "\nClearing event daemon state"
  printf '%s\n' \
    "discard~^${partition}_.*" \
    "add~${partition}_nbd_rootfs_servers=" \
    "add~${partition}_mgtnet_probed=" \
    "done" \
    | nc localhost 1234 \
    || die 1 "Cannot contact event daemon"
}
# Full boot of the partition: prepare the MSPs, the boot data and the
# support daemons, load linux on every node that is not disabled and,
# for the NBD root modes, wait until the event daemon reports the
# expected number of probed nodes.
# Stops right after the MSP setup (die 0) when msp_setup_only is set.
coldboot() {
  local bamf_list=

  ### Setup everything needed for boot
  echo -e "\nBooting partition: ${partition}"
  setup_msp_data # Setup MSP boot config data
  start_msp # Reboot/setup the MSPs if necessary
  if [ -n "${msp_setup_only}" ]; then
    die 0
  fi
  setup_voltfreq_info # Load and check volt/freq settings
  setup_node_data # Setup ICE9 boot config data
  halt_nodes # Halt nodes in this partition
  setup_node_rootfs # Setup NBD rootfs/kernel image
  start_ev1d # Start/restart ev1d
  start_mfd # Start mfd for this Cortex
  setup_slurm # Setup slurm config for boot
  clear_ev1d # Clear ev1d state for this partition
  start_master_clock_agent # Start master_clock_agent for this Cortex

  ### Boot Linux on the nodes ###
  echo -e "\nLoading and booting linux"
  for mnum in $modules; do
    calc_mod_info
    # nmask="node mask": start with every node of the module selected
    nmask=$(( (1<<$nodes_per_module) - 1 ))
    # bad_nodes holds "<module> <node>" lines; keep this module's nodes
    mod_bad_nodes=$(awk '/^'"${mnum}"' / {print $2}' <<<"${bad_nodes}")
    if [ -n "${mod_bad_nodes}" ]; then
      vecho " ${msp}: Not booting disabled nodes:" ${mod_bad_nodes}
      for node in ${mod_bad_nodes}; do
        # XOR this node from mask
        nmask=$(( nmask ^ (1<<$node) ))
      done
    fi
    hexmask="$(printf "%#x" ${nmask})"
    bamf_list+=" ${msp}:${hexmask}"
  done
  bamf_linux ${bamf_list} || die 1 "Linux boot failed"

  # If using an NBD mode then wait for the nodes to suck the image down
  if [[ "${root_mode}" == *nbd ]]; then
    echo -e "\nWaiting for NBD server nodes"
    nbd_servers=$(
      echo -e "fetch~1~^${partition}_mgtnet_probed=[XM]{${population}}\ndone" \
        | nc localhost 1234
    ) || die 1 "Cannot contact event daemon"
  fi
}
# Refuse a warmboot (--nodes) combined with other settings.
#
# Collects the names of all set SCBOOT_* variables except
# SCBOOT_PARTITION, plus the global ${dirty_flags}, and dies with status 2
# listing them when either is non-empty.
# Returns: 0 when nothing conflicting is set.
check_warmboot_not_dirty() {
  local dirty= dirty_env= x=
  # "${!SCBOOT_@}" expands to the names of the variables with that prefix
  for x in "${!SCBOOT_@}"; do
    case "${x}" in
      SCBOOT_PARTITION) ;;
      *) dirty_env="${dirty_env} ${x}" ;;
    esac
  done
  if [ -n "${dirty_env}" ]; then
    dirty="${dirty}\n\tenvironment variables: ${dirty_env}"
  fi
  if [ -n "${dirty_flags}" ]; then
    dirty="${dirty}\n\tcommandline flags: ${dirty_flags}"
  fi
  if [ -n "${dirty}" ]; then
    die 2 "Warmboot (--nodes) does not allow setting: ${dirty}"
  fi
  # Explicit success: a trailing `[ ... ] && die` would report failure
  # whenever there is nothing to complain about.
  return 0
}
# Reboot a single node. The global ${warmboot} names it as mXnY
# (module X, node Y): the node is set Down in SLURM, linux is reloaded on
# it through `bamf_linux -w`, and the node is then set to Resume.
# Globals written: node_n (the node number Y).
warmboot()
{
  local spec= module= mask= bamf_list=
  local slurm_node="${partition}-${warmboot}"

  scontrol update nodeName=${slurm_node} state=Down

  # Split "mXnY" into the module number and the node number
  spec=${warmboot#m}
  module=${spec%n*}
  node_n=${spec#*n}
  # bamf takes a per-module node bitmask in hex: bit Y selects node Y
  printf -v mask "%#x" $(( 1<<${node_n} ))
  bamf_list="${partition}-msp${module}:${mask}"

  # bug 5918 - warmboot does not support global clock
  # start_master_clock_agent # Start master_clock_agent for this Cortex
  #
  # FIXME start node agents on neighbors
  # x6:
  #neighbor=${partition}-mXnY
  #$(srun -p ${partition} -w ${neighbor} /bin/node_clock_agent -g DEBUG -p ${clock_port} )&
  echo -e "\nLoading and booting linux (warmboot m${module}n${node_n})"
  bamf_linux -w ${bamf_list} || die 1 "Linux boot failed"
  scontrol update nodeName=${slurm_node} state=Resume
}
#
# Derivative settings and sanity checking
#
# Values derived from the partition name and the base directories.
scboot_secs=$(date +%s)
scboot_inst="${partition}-${scboot_secs}" # identifies this boot instance
scboot_time=$(date --date=@${scboot_secs} "+%F %T %Z")
scbootdir="${scbootdir_base}/${partition}"
modules_dir="${rootfs_base}/${partition}-kmod"
modules_img="${modules_dir}.img"

if [ "${warmboot}" ]; then
  check_warmboot_not_dirty
  # Warmboot requires temp directory
  noclean=1
fi

# For now, scboot must be run as root
euser="$(id -un)"
if [ "$euser" != "root" ]; then
  die 2 "The scboot program must be run as 'root' user -- you are '${euser}'."
fi

# Redirection appended (through eval) to the build commands: verbose
# keeps stderr, otherwise both streams are dropped.
if [ "${verbose}" ]; then
  redirect=">/dev/null"
else
  redirect="&>/dev/null"
fi

[ "$(get_netblock)" ] || die 2 "INTERNAL_NETBLOCK not found in /proc/cmdline"
[ -d "${root}" ] || die 2 "${root} is not a directory"
# `command -v` is the shell builtin lookup; it does not depend on an
# external `which` being installed.
if [ "${track_boot}" ] && [ ! -x "$(command -v ${scbootmon})" ]; then
  die 2 "Could not find ${scbootmon}."
fi

case ${root_mode} in
  nfs|nbd|nfsnbd|socatnbd|httpnbd) true ;;
  lustre) die 2 "Lustre rootfs mode not yet supported" ;;
  *) die 2 "Invalid rootfs mode: ${root_mode}" ;;
esac
# Figure out some derived values
# The cluster name defaults to the partition name.
cluster=${cluster:-${partition}}

# Set values based on the type of system
# Each service gets a bucket of ports starting at its base; systems that
# come in numbered variants are shifted inside the bucket by an offset
# taken from the partition name.
mfd_port_base=6170
root_port_base=6270
kmod_port_base=6370
clock_port_base=6470
port_offset=0
case ${partition} in
  # SC24
  sf*)
    nodes_per_module=4 module_count=1
    port_offset=${partition/sf/}
    ;;
  # SC072
  sca)
    nodes_per_module=12 module_count=1
    ;;
  # SC162 (the "sx" prefix is replaced by "1": sx0 -> 10)
  sx*)
    nodes_per_module=27 module_count=1
    port_offset=${partition/sx/1}
    ;;
  # SC648
  sc[01])
    nodes_per_module=27 module_count=4
    port_offset=${partition/sc/}
    ;;
  # SC1458
  sci)
    nodes_per_module=27 module_count=9
    ;;
  # SC5832
  scx)
    nodes_per_module=27 module_count=36
    ;;
  *) die 2 "Unknown system type ${partition//[0-9]/}" ;;
esac
mfd_port=$((mfd_port_base + ${port_offset}))
root_port=$((root_port_base + ${port_offset}))
kmod_port=$((kmod_port_base + ${port_offset}))
clock_port=$((clock_port_base + ${port_offset}))
unset port_offset

modules="$(seq 0 $(( module_count - 1 )) )"
population=$(( module_count * nodes_per_module ))
# population will change if we have a route_info file; save the original one
full_population=${population}

# Find first free port in the bucket
mfd_port=$(get_free_ports $mfd_port)
root_port=$(get_free_ports $root_port)
kmod_port=$(get_free_ports $kmod_port)
clock_port=$(get_free_ports $clock_port)
# Eliminate MSPs for missing modules
# route_info_${partition}
#
# When a readable route_info file exists it is checked with
# ${check_route_info}, then its "jumper module", "bad module", "bad node"
# and "bad link" records are used to report what is skipped, to reduce
# ${population}, to drop the affected modules from ${modules}, and to
# build bad_mod_list / bad_node_list and the per-module ${bad_nodes}
# table ("<module> <node>" lines).
bad_link_file="/var/state/route_info.${partition}"
if [ -r "$bad_link_file" ]; then
  # check that the file is self-consistent
  if ! ${check_route_info} -c $bad_link_file; then
    die 1 "inconsistent route info file"
  fi

  # Build a set of regex patterns to filter modules
  # we need to sort before adding the ^ and $ for the numerical sort to work
  jumper_modules=$(awk '/^jumper module/ {print $3}' < "$bad_link_file" \
    | sort -nu | awk '// {print"^"$1"$"}')
  bad_modules=$(awk '/^bad module/ {print $3}' < "$bad_link_file" \
    | sort -nu | awk '// {print"^"$1"$"}')
  bad_nodes=$(awk '/^bad node/ {print $3}' < "$bad_link_file" | sort -nu)

  # mXnY format for display to the user. Node ids are split with the
  # actual nodes_per_module of this system type, like every other
  # module/node computation in this block.
  bad_nodes_mxny=$(awk '/^bad node/ {print $3}' < "$bad_link_file" \
    | sort -nu \
    | awk -v npm="${nodes_per_module}" \
        '// {printf("m%dn%d ",$1 / npm,$1 % npm)}')
  bad_links_mxny=$(awk -v npm="${nodes_per_module}" \
    '/^bad link/ {printf("m%dn%d-rx%d ",$3 / npm, $3 % npm, $4)}' \
    < "$bad_link_file" | sort -nu)
  # Links rx3..rx5 are displayed as tx0..tx2
  bad_links_mxny=$(sed -e 's/rx3/tx0/g' -e 's/rx4/tx1/g' -e 's/rx5/tx2/g' <<<"$bad_links_mxny")

  jumper_module_count=$(wc -w <<<"$jumper_modules")
  bad_module_count=$(wc -w <<<"$bad_modules")
  bad_node_count=$(wc -w <<<"$bad_nodes")
  bad_link_count=$(wc -w <<<"$bad_links_mxny")
  # Plain number lists (regex anchors stripped)
  jumper_mod_list=$(echo $jumper_modules|sed 's/[^0-9 ]//g')
  bad_mod_list=$(echo $bad_modules|sed 's/[^0-9 ]//g')
  bad_node_list=$(echo $bad_nodes|sed 's/[^0-9 ]//g')

  if [ -n "$jumper_modules" ]; then
    echo "Skipping $jumper_module_count placeholder modules: $jumper_mod_list"
  fi
  if [ -n "$bad_modules" ]; then
    echo "Skipping $bad_module_count disabled modules: $bad_mod_list"
  fi
  if [ -n "$bad_nodes" ]; then
    echo "Skipping $bad_node_count disabled nodes: $bad_nodes_mxny"
  fi
  if [ -n "$bad_links_mxny" ]; then
    echo "Skipping $bad_link_count disabled links: $bad_links_mxny"
  fi

  # Gross, I know.
  # TODO: make not gross
  population=$(( population - (jumper_module_count * nodes_per_module) - (bad_module_count * nodes_per_module) - bad_node_count ))
  if [[ -n "$jumper_modules" || -n "$bad_modules" || -n "$bad_nodes" ]]; then
    echo "Population reduced to: $population"
  fi

  # now combine the jumper modules and bad modules into one list
  bad_modules=$(awk '/^(bad|jumper) module/ {print "^"$3"$"}' < "$bad_link_file" |sort -n|uniq)
  bad_module_count=$(wc -w <<<"$bad_modules")
  bad_mod_list=$(echo $bad_modules|sed 's/[^0-9 ]//g')

  # Filter the list of modules:
  if [ -n "$bad_modules" ]; then
    modules=$(grep -v -f <(cat <<<"$bad_modules") <(cat <<<"$modules"))
  fi

  # Convert bad node ID list into module + node number list
  bnl=""
  for node in ${bad_nodes}; do
    m=$((node / nodes_per_module))
    n=$((node % nodes_per_module))
    bnl=${bnl}${m}" "${n}$'\n'
  done
  bad_nodes="$bnl"
fi
if [ "${verbose}" ] || [ "${show_settings}" ]; then
  show_settings
  [ "${show_settings}" ] && die 0
fi

# ^---------- No changes to system
#==================================================================
# v---------- Changes made to system

# Create and lock temporary directory
scboot_lock="${scbootdir%%/}.scboot_lock" # NOT in the directory

# Get the temporary directory to a sane state and lock it
# (quoted and guarded: a mangled path must never reach `rm -rf`)
if [ -z "${noclean}" ]; then
  rm -rf -- "${scbootdir:?}"
fi
mkdir -p "${scbootdir}"
lock_tree ${scboot_lock}

# Log all the settings for this boot
show_settings >> ${scboot_log}
mkdir -p /var/log/scboot
set > /var/log/scboot/${partition}.vars
logger -t scboot -- "$orig_cmdline"

if [ "${warmboot}" ]; then
  warmboot
else
  coldboot
fi

# Cleanup
unlock_tree ${scboot_lock}

# FIXME everything from this point on is inadequate and should be
# consolidated into something better

# start watching for errors from MFD in the background
(
  ${mfd_watcher} -P ${partition} -L WARNING ${verbose:+--loglevel DEBUG}
) &

# wait for errors from global clock in the background
# (die here only ends this background subshell)
(
  echo -e "\nWaiting for global clock completion"
  global_clock_result=$(
    echo -e "fetch~1~^${partition}_global_clock_state=\ndone" \
      | nc localhost 1234
  )
  [ $? = 0 ] || die 1 "Cannot contact event daemon"
  [ "$global_clock_result" == "${partition}_global_clock_state=done" ] \
    || die 1 "global clock sync failed: $global_clock_result\n"
  echo -e "\nglobal clock sync complete"
) &

# A cold boot hands the process over to the boot monitor.
# NOTE(review): scbootmon_options is passed as ONE (possibly empty)
# argument -- confirm that is what scbootmon expects.
if [ -z "${warmboot}" ]; then
  exec ${scbootmon} "${scboot_inst}" "${population}" "${scbootmon_options}"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment