Last active
September 30, 2025 22:56
-
-
Save basnijholt/2fbfa23081e4321abcd1253cf8a489f0 to your computer and use it in GitHub Desktop.
slurm-vscode.sh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# slurm-vscode.sh - VS Code launcher for Slurm
#   - Reads ~/.config/slurm_vscode/config (or the same path under $XDG_CONFIG_HOME)
#   - Remembers the JobID in ~/.cache/slurm_vscode/jobid (or under $XDG_CACHE_HOME)
#   - Reuses a running job if possible; otherwise submits a tiny "sleep infinity" holder job
#   - Launches local VS Code against the remote node via "ssh-remote+<node>"
set -euo pipefail

APP_NAME="slurm_vscode"
CONFIG_FILE="${XDG_CONFIG_HOME:-$HOME/.config}/${APP_NAME}/config"
CACHE_FILE="${XDG_CACHE_HOME:-$HOME/.cache}/${APP_NAME}/jobid"

# ---- defaults (overridable via config) ----
LOGIN_HOST="mycluster"     # ssh alias/bastion
JOB_NAME="vscode-holder"   # job name to reuse
SQUEUE_POLL_SEC=2          # seconds
CODE_REUSE_WINDOW=false    # true/false
CODE_BINARY="code"         # VS Code CLI command
DEFAULT_SBATCH_FLAGS='-p cpus -t 01:00:00 -N 1'

# ---- load config if present ----
# The config file is sourced as bash, so it may reassign any default above.
if [[ -f "$CONFIG_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$CONFIG_FILE"
fi
# ---- tiny helpers ----

# Create the directory that holds $CACHE_FILE (no-op when it already exists).
ensure_cache_dir() { mkdir -p -- "$(dirname -- "$CACHE_FILE")"; }

# Persist a JobID ($1) in $CACHE_FILE, replacing any previous value.
# Returns 1 without touching the cache when the JobID is empty, so a blank
# entry can never be recalled later as if it were a real job.
remember_job() {
  if [[ -z "${1:-}" ]]; then
    warn "Refusing to remember an empty JobID."
    return 1
  fi
  ensure_cache_dir
  printf '%s\n' "$1" >"$CACHE_FILE"
}

# Print the remembered JobID (first line of $CACHE_FILE) on stdout.
# Returns 1 when the file is missing, empty, or its first line is blank.
recall_job() {
  local cached=""
  [[ -s "$CACHE_FILE" ]] || return 1
  # `read` returns non-zero on a missing trailing newline but still fills the
  # variable, so the emptiness check below is what decides the outcome.
  IFS= read -r cached <"$CACHE_FILE" || true
  [[ -n "$cached" ]] || return 1
  printf '%s\n' "$cached"
}

# Drop the remembered JobID; succeeds even when nothing was cached.
forget_job() { rm -f -- "$CACHE_FILE"; }
# Print a timestamped ("[HH:MM:SS] ...") informational message on stdout.
log() {
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

# Same format as log(), but written to stderr so it never pollutes output
# that a caller captures with $(...).
warn() {
  printf '[%s] %s\n' "$(date +%H:%M:%S)" "$*" >&2
}
# Slurm job states after which the job will never (again) run on a node.
TERMINAL_STATES="BOOT_FAIL CANCELLED COMPLETED DEADLINE FAILED NODE_FAIL OUT_OF_MEMORY PREEMPTED TIMEOUT"

# Succeed when $1 is exactly one of the words in TERMINAL_STATES.
# An empty argument or one containing whitespace never matches; without this
# guard the space-padded substring test below could match the padding itself
# or a run of two adjacent states.
is_terminal() {
  local state="${1:-}"
  [[ -n "$state" && "$state" != *[[:space:]]* ]] || return 1
  [[ " $TERMINAL_STATES " == *" $state "* ]]
}
# Run a command on $LOGIN_HOST inside a login shell, preserving argument
# boundaries exactly: run_ssh prog arg1 "arg with spaces" ...
#
# Two levels of quoting are required:
#   1. each argument is %q-quoted and joined into the string handed to
#      `bash -lc`;
#   2. that whole string is %q-quoted again, because ssh joins its command
#      arguments with spaces and lets the remote login shell re-split them.
# Without step 2 the remote side sees `bash -lc prog arg1 ...`, i.e. only
# `prog` is the -c command and the remaining words become $0, $1, ...
#
# NOTE: %q may emit bash-specific $'...' quoting for control characters, which
# the remote login shell must be able to parse.
#
# Returns 2 when called without a command; otherwise the exit status of ssh.
run_ssh() {
  local cmd_str remote_arg
  if (($# == 0)); then
    printf 'run_ssh: missing command\n' >&2
    return 2
  fi
  printf -v cmd_str '%q ' "$@"
  printf -v remote_arg '%q' "$cmd_str"
  ssh "$LOGIN_HOST" "bash -lc ${remote_arg}"
}
# Print "<STATE> <FIRST_NODE>" for a Slurm job; FIRST_NODE is empty when no
# node has been assigned yet (squeue reports "(null)" or "n/a").
#
# Arguments: $1 - JobID
# Returns:   0 with one line on stdout; 1 when the JobID is empty or squeue
#            printed nothing (unknown job, or the ssh/squeue call failed).
#
# squeue -O avoids quoting headaches (no spaces in the format arg). Explicit
# field sizes are given because -O fields default to 20 characters, which
# would cut off longer node names.
describe_job() {
  local jobid="${1:-}"
  local out state nodes node

  # Never forward an empty id to squeue.
  [[ -n "$jobid" ]] || return 1

  # Best effort: a failing squeue is reported to the caller as "no output".
  out="$(run_ssh squeue -j "$jobid" -h -O state:32,NodeList:1024 || true)"
  [[ -n "$out" ]] || return 1

  # Expect: "<STATE> <NODELIST>" (only the first line is considered).
  read -r state nodes <<<"$out"
  node="${nodes%%,*}" # first node only
  if [[ -z "$node" || "$node" == "(null)" || "$node" == "n/a" ]]; then
    node=""
  fi
  printf '%s %s\n' "$state" "$node"
}
# Submit the "sleep infinity" holder job on the cluster and print its JobID.
#
# Globals:   JOB_NAME, DEFAULT_SBATCH_FLAGS (read)
# Arguments: extra sbatch flags, appended after the default flags
# Outputs:   the JobID on stdout
# Returns:   the status of the remote sbatch call when it fails, else 0
submit_holder_job() {
  local -a default_flags=()
  local -a sbatch_cmd=(sbatch --parsable --job-name "$JOB_NAME")
  local out

  # Split the configured flag string into words on whitespace.
  read -r -a default_flags <<<"${DEFAULT_SBATCH_FLAGS:-}"

  # Append conditionally: expanding an empty array under `set -u` is an
  # "unbound variable" error on bash older than 4.4.
  if ((${#default_flags[@]})); then
    sbatch_cmd+=("${default_flags[@]}")
  fi
  if (($#)); then
    sbatch_cmd+=("$@")
  fi
  sbatch_cmd+=(--wrap "sleep infinity")

  # Propagate a failed submission instead of printing an empty JobID.
  out="$(run_ssh "${sbatch_cmd[@]}")" || return

  # sbatch may print "<jobid>;..." — keep first field
  printf '%s\n' "${out%%;*}"
}
# Poll squeue until the job is RUNNING with a node assigned, then print that
# node name on stdout.
#
# Globals:   SQUEUE_POLL_SEC (read) - pause between polls, defaults to 2
# Arguments: $1 - JobID
# Returns:   0 once a node is known; 1 when the job reached a terminal state
#            or squeue no longer reports it. There is no overall timeout.
poll_for_node() {
  local jobid="$1"
  local info state node

  while true; do
    state=""
    node=""
    info="$(describe_job "$jobid" || true)"
    if [[ -z "$info" ]]; then
      warn "squeue returned no output (job may no longer exist)."
      return 1
    fi

    read -r state node <<<"$info"
    if [[ "$state" == "RUNNING" && -n "$node" ]]; then
      printf '%s\n' "$node"
      return 0
    fi
    if is_terminal "$state"; then
      warn "Job $jobid reached terminal state: $state"
      return 1
    fi

    # Fall back to 2s when the setting is empty: a failing `sleep ""` would
    # otherwise turn this loop into a busy-spin against the login host.
    sleep "${SQUEUE_POLL_SEC:-2}"
  done
}
# Open local VS Code attached to a remote node through Remote-SSH.
#
# Globals:   CODE_BINARY (read)       - VS Code CLI command
#            CODE_REUSE_WINDOW (read) - "true" (any case) or "1" reuses the
#                                       current window; anything else opens a
#                                       new one
# Arguments: $1 - node name, used as the ssh-remote host
# Returns:   2 when the node name is empty; otherwise the status of the
#            VS Code CLI.
launch_vscode() {
  local node="${1:-}"
  local window_flag="--new-window"

  # "ssh-remote+" with no host is never a valid remote.
  if [[ -z "$node" ]]; then
    warn "launch_vscode: no node name given."
    return 2
  fi

  case "${CODE_REUSE_WINDOW,,}" in
    true | 1) window_flag="--reuse-window" ;;
  esac

  "$CODE_BINARY" "$window_flag" --remote "ssh-remote+${node}"
}
# Print the command-line help on stdout, including the resolved config and
# cache paths. The heredoc is unquoted on purpose so $0, $CONFIG_FILE and
# $CACHE_FILE are expanded.
usage() {
  cat <<EOF
Usage:
$0 launch [--reuse-cache|--force-new] [--cancel-on-failure|--keep-job] [--] [extra sbatch flags...]
$0 remember-job <JOBID>
$0 recall-job
$0 forget-job
$0 paths
Notes:
- Edit config at: $CONFIG_FILE
- Cache file: $CACHE_FILE
- Extra sbatch flags are forwarded after a literal "--"
EOF
}
# Print the resolved config and cache file locations, one per line.
paths_cmd() {
  printf 'Config: %s\n' "$CONFIG_FILE"
  printf 'Cache : %s\n' "$CACHE_FILE"
}
# ---- commands ----
# Dispatch on the first argument; with no argument the usage text is printed.
cmd="${1:-}"
case "$cmd" in
  launch)
    shift
    REUSE_CACHE=1
    CANCEL_ON_FAILURE=1

    # parse a couple of light options; everything after -- goes to sbatch
    EXTRA_FLAGS=()
    while (($#)); do
      case "$1" in
        --reuse-cache) REUSE_CACHE=1; shift ;;
        --force-new) REUSE_CACHE=0; shift ;;
        --cancel-on-failure) CANCEL_ON_FAILURE=1; shift ;;
        --keep-job) CANCEL_ON_FAILURE=0; shift ;;
        --) shift; EXTRA_FLAGS=("$@"); break ;;
        *)
          # Unknown options are still skipped, but no longer silently: a
          # typo or an sbatch flag given without "--" would otherwise vanish.
          warn "Ignoring unknown option '$1' (sbatch flags must follow a literal \"--\")."
          shift
          ;;
      esac
    done
    ensure_cache_dir

    # 1) Try to reuse cached job (if any)
    SELECTION_JOBID=""
    SELECTION_NODE=""
    jobid=""
    if ((REUSE_CACHE)); then
      # A missing cache entry is expected; it simply leaves jobid empty.
      jobid="$(recall_job 2>/dev/null || true)"
    fi
    if [[ -n "$jobid" ]]; then
      log "Found cached JobID $jobid; checking status…"
      info="$(describe_job "$jobid" 2>/dev/null || true)"
      if [[ -z "$info" ]]; then
        warn "Unable to inspect cached job; clearing cache."
        forget_job
      else
        state=""
        node=""
        read -r state node <<<"$info"
        # Check the terminal states first: squeue can keep listing a finished
        # job together with its node list for a while.
        if is_terminal "$state"; then
          warn "Cached job $jobid is $state; clearing cache."
          forget_job
        elif [[ "$state" == "RUNNING" && -n "$node" ]]; then
          log "Reusing running job $jobid on node $node."
          SELECTION_JOBID="$jobid"
          SELECTION_NODE="$node"
        else
          log "Waiting for cached job $jobid (state $state) to receive a node…"
          if node="$(poll_for_node "$jobid" 2>/dev/null)"; then
            SELECTION_JOBID="$jobid"
            SELECTION_NODE="$node"
          else
            warn "Cached job became unusable; clearing cache."
            forget_job
          fi
        fi
      fi
    fi

    # 2) If no reusable job, submit a fresh holder and wait for node
    if [[ -z "$SELECTION_NODE" ]]; then
      log "Submitting holder job via $LOGIN_HOST with flags: ${DEFAULT_SBATCH_FLAGS} ${EXTRA_FLAGS[*]:-}"
      # The ${arr[@]+...} form keeps an empty array from tripping `set -u`
      # on bash older than 4.4.
      if ! jobid="$(submit_holder_job ${EXTRA_FLAGS[@]+"${EXTRA_FLAGS[@]}"})" || [[ -z "$jobid" ]]; then
        warn "sbatch did not return a JobID"
        exit 1
      fi
      remember_job "$jobid"
      log "Submitted JobID $jobid. Waiting for node assignment…"
      if ! node="$(poll_for_node "$jobid")"; then
        warn "Polling failed for JobID $jobid"
        if ((CANCEL_ON_FAILURE)); then
          warn "Cancelling job $jobid"
          # Best effort: the job may already be gone.
          run_ssh scancel "$jobid" || true
        fi
        forget_job
        exit 1
      fi
      SELECTION_JOBID="$jobid"
      SELECTION_NODE="$node"
    fi

    # 3) Launch VS Code locally against the remote node
    log "Job $SELECTION_JOBID is running on node $SELECTION_NODE. Launching VS Code…"
    launch_vscode "$SELECTION_NODE" || {
      warn "Failed to launch VS Code."
      exit 1
    }
    remember_job "$SELECTION_JOBID"
    log "VS Code launched. When finished, run: scancel $SELECTION_JOBID (or '$0 forget-job')."
    ;;
  remember-job)
    shift
    [[ $# -eq 1 ]] || { usage; exit 1; }
    remember_job "$1"
    printf 'Remembered JobID at %s\n' "$CACHE_FILE"
    ;;
  recall-job)
    if ! recall_job; then
      echo "No remembered job."
      exit 1
    fi
    ;;
  forget-job)
    if [[ -f "$CACHE_FILE" ]]; then
      forget_job
      echo "Forgot remembered job."
    else
      echo "Nothing remembered."
    fi
    ;;
  paths)
    paths_cmd
    ;;
  "" | help | -h | --help)
    usage
    ;;
  *)
    usage
    exit 1
    ;;
esac
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment