Skip to content

Instantly share code, notes, and snippets.

@basnijholt
Last active September 30, 2025 22:56
Show Gist options
  • Save basnijholt/2fbfa23081e4321abcd1253cf8a489f0 to your computer and use it in GitHub Desktop.
Save basnijholt/2fbfa23081e4321abcd1253cf8a489f0 to your computer and use it in GitHub Desktop.
slurm-vscode.sh
#!/usr/bin/env bash
# slurm-vscode.sh - VS Code launcher for Slurm
# - Reads ~/.config/slurm_vscode/config (or $XDG_CONFIG_HOME)
# - Remembers JobID in ~/.cache/slurm_vscode/jobid (or $XDG_CACHE_HOME)
# - Reuses a running job if possible; otherwise submits a tiny "sleep infinity" holder job
# - Launches local VS Code against the remote node via "ssh-remote+<node>"
# Strict mode: abort on unhandled command failures (-e), on use of unset
# variables (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Subdirectory name used under both the config root and the cache root.
APP_NAME="slurm_vscode"
# Optional user config file; rooted at $XDG_CONFIG_HOME when set, else ~/.config.
CONFIG_FILE="${XDG_CONFIG_HOME:-$HOME/.config}/${APP_NAME}/config"
# File that stores the remembered JobID; rooted at $XDG_CACHE_HOME when set, else ~/.cache.
CACHE_FILE="${XDG_CACHE_HOME:-$HOME/.cache}/${APP_NAME}/jobid"
# ---- defaults (overridable via config) ----
LOGIN_HOST="mycluster" # ssh alias/bastion
JOB_NAME="vscode-holder" # job name to reuse
SQUEUE_POLL_SEC=2 # seconds
CODE_REUSE_WINDOW=false # true/false
CODE_BINARY="code" # VS Code CLI command
# Flags passed to every sbatch submission. submit_holder_job splits this string
# on spaces, so an individual flag value cannot itself contain a space.
DEFAULT_SBATCH_FLAGS='-p cpus -t 01:00:00 -N 1'
# ---- load config if present ----
# The config is sourced as bash code in this shell (with strict mode already
# active), so any assignment it makes replaces the defaults set above.
if [[ -f "$CONFIG_FILE" ]]; then
# shellcheck disable=SC1090
source "$CONFIG_FILE"
fi
# ---- tiny helpers ----
# Create the directory that holds $CACHE_FILE (no-op when it already exists).
ensure_cache_dir() {
  local cache_dir
  cache_dir="$(dirname "$CACHE_FILE")"
  mkdir -p "$cache_dir"
}

# Store JobID $1 in $CACHE_FILE, replacing any previous content.
remember_job() {
  ensure_cache_dir
  printf '%s\n' "$1" >"$CACHE_FILE"
}

# Print the remembered JobID; return 1 when the cache file is missing, empty,
# or cannot be read.
recall_job() {
  if [[ ! -s "$CACHE_FILE" ]]; then
    return 1
  fi
  cat "$CACHE_FILE" || return 1
}

# Delete the cache file; succeeds even when nothing was remembered.
forget_job() {
  rm -f "$CACHE_FILE"
}
# Print "[HH:MM:SS] <all arguments joined by spaces>" on stdout.
log() {
  local stamp
  stamp="$(date +%H:%M:%S)"
  printf '[%s] %s\n' "$stamp" "$*"
}

# Same line format as log, but written to stderr.
warn() {
  log "$@" >&2
}
# Slurm job states after which the job will never (again) provide a node.
# Space-separated and space-padded so whole-word matching works below.
TERMINAL_STATES=" BOOT_FAIL CANCELLED COMPLETED DEADLINE FAILED NODE_FAIL OUT_OF_MEMORY PREEMPTED TIMEOUT "

# is_terminal STATE
# Returns 0 when STATE is exactly one of the words in TERMINAL_STATES,
# 1 otherwise. An empty STATE, or one containing whitespace, is never terminal:
# without this guard an empty argument would match the double space created by
# the padding, and "A B" could match two adjacent list entries.
is_terminal() {
  local state="${1:-}"
  if [[ -z "$state" || "$state" == *[[:space:]]* ]]; then
    return 1
  fi
  [[ " $TERMINAL_STATES " == *" $state "* ]]
}
# run_ssh CMD [ARG...]
# Run CMD with its arguments on $LOGIN_HOST inside a login bash, preserving
# argument boundaries.
#
# ssh joins everything after the host name with spaces and hands that single
# string to the remote user's shell, which splits it again. Two quoting layers
# are therefore needed:
#   1. each argument is %q-quoted and joined into the script for `bash -lc`;
#   2. that script is %q-quoted once more so the remote shell passes it to
#      `bash -lc` as ONE word. Without this layer only the first word (e.g.
#      "squeue") becomes the -c script and every argument is lost.
# Returns the exit status of ssh (i.e. of the remote command).
run_ssh() {
  local cmd_str remote_cmd
  printf -v cmd_str '%q ' "$@"
  printf -v remote_cmd 'bash -lc %q' "$cmd_str"
  ssh "$LOGIN_HOST" "$remote_cmd"
}
# squeue -O avoids quoting headaches (no spaces in the format arg)
describe_job() {
local jobid="$1"
local out
out="$(run_ssh squeue -j "$jobid" -h -O state,NodeList || true)"
[[ -z "$out" ]] && return 1
# Expect: "<STATE> <NODELIST>"
local state nodes node
# shellcheck disable=SC2086
read -r state nodes <<<"$out"
node="${nodes%%,*}" # first node only
[[ -z "${node:-}" || "$node" == "(null)" || "$node" == "n/a" ]] && node=""
printf '%s %s\n' "$state" "$node"
}
# submit_holder_job [EXTRA_SBATCH_FLAG...]
# Submit a "sleep infinity" job named $JOB_NAME through run_ssh, using
# $DEFAULT_SBATCH_FLAGS (split on spaces) followed by the caller's extra flags.
# Prints the sbatch --parsable reply up to the first ';'.
submit_holder_job() {
  local -a base_flags sbatch_cmd
  local reply

  IFS=' ' read -r -a base_flags <<<"$DEFAULT_SBATCH_FLAGS"
  sbatch_cmd=(
    sbatch --parsable --job-name "$JOB_NAME"
    "${base_flags[@]}"
    "$@"
    --wrap "sleep infinity"
  )

  reply="$(run_ssh "${sbatch_cmd[@]}")"
  # --parsable may answer "<jobid>;<cluster>"; only the JobID is wanted.
  printf '%s\n' "${reply%%;*}"
}
# poll_for_node JOBID
# Query describe_job every $SQUEUE_POLL_SEC seconds until JOBID is RUNNING with
# a node assigned, then print that node and return 0.
# Returns 1 (with a warning on stderr) when describe_job yields nothing or the
# job reaches a state accepted by is_terminal.
poll_for_node() {
  local jobid="$1"
  local snapshot job_state assigned

  # Loop for as long as describe_job keeps producing output.
  while snapshot="$(describe_job "$jobid" || true)"; [[ -n "$snapshot" ]]; do
    read -r job_state assigned <<<"$snapshot"

    if [[ "$job_state" == "RUNNING" && -n "${assigned:-}" ]]; then
      printf '%s\n' "$assigned"
      return 0
    fi

    if is_terminal "$job_state"; then
      warn "Job $jobid reached terminal state: $job_state"
      return 1
    fi

    sleep "${SQUEUE_POLL_SEC}"
  done

  warn "squeue returned no output (job may no longer exist)."
  return 1
}
# launch_vscode NODE
# Start $CODE_BINARY against "ssh-remote+NODE". The window is reused when
# $CODE_REUSE_WINDOW is "true" (any letter case) or "1"; otherwise a new
# window is opened. Returns the exit status of $CODE_BINARY.
launch_vscode() {
  local target="$1"
  local window_flag

  case "${CODE_REUSE_WINDOW,,}" in
    true | 1) window_flag="--reuse-window" ;;
    *) window_flag="--new-window" ;;
  esac

  "$CODE_BINARY" "$window_flag" --remote "ssh-remote+${target}"
}
# Print the command summary, including the resolved config and cache paths,
# on stdout.
usage() {
  local self="$0"
  local -a help_lines=(
    "Usage:"
    "${self} launch [--reuse-cache|--force-new] [--cancel-on-failure|--keep-job] [--] [extra sbatch flags...]"
    "${self} remember-job <JOBID>"
    "${self} recall-job"
    "${self} forget-job"
    "${self} paths"
    "Notes:"
    "- Edit config at: ${CONFIG_FILE}"
    "- Cache file: ${CACHE_FILE}"
    '- Extra sbatch flags are forwarded after a literal "--"'
  )
  printf '%s\n' "${help_lines[@]}"
}
# Print the locations of the config file and the JobID cache file.
paths_cmd() {
  printf 'Config: %s\nCache : %s\n' "$CONFIG_FILE" "$CACHE_FILE"
}
# ---- commands ----
#######################################
# launch: reuse the remembered holder job when it is still usable, otherwise
# submit a fresh one, then open VS Code against the allocated node.
# Arguments:
#   --reuse-cache | --force-new       consult / skip the remembered JobID
#                                     (default: consult)
#   --cancel-on-failure | --keep-job  scancel a freshly submitted job when
#                                     waiting for its node fails (default: cancel)
#   -- [sbatch flags...]              forwarded verbatim to sbatch
# Exits 1 when no node could be obtained or VS Code failed to start.
#######################################
cmd_launch() {
  local reuse_cache=1
  local cancel_on_failure=1
  local -a extra_flags=()
  local jobid="" info="" state="" node=""
  local selection_jobid="" selection_node=""

  # Parse a couple of light options; everything after -- goes to sbatch.
  while (($#)); do
    case "$1" in
      --reuse-cache) reuse_cache=1; shift ;;
      --force-new) reuse_cache=0; shift ;;
      --cancel-on-failure) cancel_on_failure=1; shift ;;
      --keep-job) cancel_on_failure=0; shift ;;
      --) shift; extra_flags=("$@"); break ;;
      *)
        # Still skipped, but reported so that a typo (or sbatch flags given
        # without the "--" separator) does not vanish without a trace.
        warn "Ignoring unknown option: $1"
        shift
        ;;
    esac
  done
  ensure_cache_dir

  # 1) Try to reuse the cached job. The branch is entered only when a
  #    non-empty JobID was actually recalled.
  if ((reuse_cache)) && jobid="$(recall_job 2>/dev/null)" && [[ -n "$jobid" ]]; then
    log "Found cached JobID $jobid; checking status…"
    if info="$(describe_job "$jobid" 2>/dev/null)" && [[ -n "$info" ]]; then
      read -r state node <<<"$info"
      # A node alone is not enough: squeue can still list the node of a job
      # that is completing or already finished.
      if [[ "$state" == "RUNNING" && -n "$node" ]]; then
        log "Reusing running job $jobid on node $node."
        selection_jobid="$jobid"
        selection_node="$node"
      elif is_terminal "$state"; then
        warn "Cached job $jobid is $state; clearing cache."
        forget_job
      else
        log "Waiting for cached job $jobid (state $state) to receive a node…"
        if node="$(poll_for_node "$jobid" 2>/dev/null)"; then
          selection_jobid="$jobid"
          selection_node="$node"
        else
          warn "Cached job became unusable; clearing cache."
          forget_job
        fi
      fi
    else
      warn "Unable to inspect cached job; clearing cache."
      forget_job
    fi
  fi

  # 2) If no reusable job, submit a fresh holder and wait for its node.
  if [[ -z "$selection_node" ]]; then
    log "Submitting holder job via $LOGIN_HOST with flags: ${DEFAULT_SBATCH_FLAGS} ${extra_flags[*]:-}"
    # ${arr[@]+...} keeps an empty array from tripping `set -u` on bash < 4.4.
    jobid="$(submit_holder_job ${extra_flags[@]+"${extra_flags[@]}"})" || jobid=""
    # Refuse anything that is not a plain numeric JobID (empty reply, error
    # text, login banner) instead of caching it.
    if [[ ! "$jobid" =~ ^[0-9]+$ ]]; then
      warn "sbatch did not return a JobID"
      exit 1
    fi
    remember_job "$jobid"
    log "Submitted JobID $jobid. Waiting for node assignment…"
    if ! node="$(poll_for_node "$jobid")"; then
      warn "Polling failed for JobID $jobid"
      if ((cancel_on_failure)); then
        warn "Cancelling job $jobid"
        run_ssh scancel "$jobid" || true # best effort; the job may already be gone
      fi
      forget_job
      exit 1
    fi
    selection_jobid="$jobid"
    selection_node="$node"
  fi

  # 3) Launch VS Code locally against the remote node.
  log "Job $selection_jobid is running on node $selection_node. Launching VS Code…"
  launch_vscode "$selection_node" || {
    warn "Failed to launch VS Code."
    exit 1
  }
  remember_job "$selection_jobid"
  log "VS Code launched. When finished, run: scancel $selection_jobid (or '$0 forget-job')."
}

#######################################
# Dispatch on the first command-line argument.
# Help is printed on stdout when requested; usage errors and the
# "No remembered job." diagnostic go to stderr with exit status 1.
#######################################
main() {
  local cmd="${1:-}"
  case "$cmd" in
    launch)
      shift
      cmd_launch "$@"
      ;;
    remember-job)
      shift
      [[ $# -eq 1 ]] || { usage >&2; exit 1; }
      remember_job "$1"
      printf 'Remembered JobID at %s\n' "$CACHE_FILE"
      ;;
    recall-job)
      if ! recall_job; then
        # stderr, so that `id=$(… recall-job)` never captures this text.
        printf '%s\n' "No remembered job." >&2
        exit 1
      fi
      ;;
    forget-job)
      if [[ -f "$CACHE_FILE" ]]; then
        forget_job
        echo "Forgot remembered job."
      else
        echo "Nothing remembered."
      fi
      ;;
    paths)
      paths_cmd
      ;;
    "" | help | -h | --help)
      usage
      ;;
    *)
      usage >&2
      exit 1
      ;;
  esac
}

main "$@"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment