Skip to content

Instantly share code, notes, and snippets.

@githubhjs
Last active September 5, 2025 09:19
Show Gist options
  • Select an option

  • Save githubhjs/994d2e7253d2504cf75c8565a554a5fb to your computer and use it in GitHub Desktop.

Select an option

Save githubhjs/994d2e7253d2504cf75c8565a554a5fb to your computer and use it in GitHub Desktop.
bsubrepeater: repeatedly (re)submit an LSF job when it disappears from bjobs
#!/usr/bin/env bash
# bsubrepeater: re-submit an LSF job whenever it disappears from bjobs,
# recording every step in a tab-separated audit log.

# Strict mode, spelled out: errtrace (-E), errexit (-e), nounset (-u), pipefail.
set -o errtrace -o errexit -o nounset -o pipefail

# Defaults used when -n/--max and -t/--interval are not given.
DEFAULT_MAX=100000
DEFAULT_INTERVAL=10

# Verbose tracing is off unless -v/--verbose is passed.
VERBOSE=0

# Audit log path; BSUBREPEATER_LOG_FILE overrides it when set and non-empty.
LOG_FILE="${BSUBREPEATER_LOG_FILE:-${HOME}/.bsubrepeater.log}"
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout, one line per entry below
#######################################
usage() {
  # Single-quoted entries keep "$HOME" literal, as in the help text itself.
  local -a help_lines=(
    'Usage:'
    'bsubrepeater [-n MAX_SUBMITS] [-t SECONDS] [-L FILE] [-v] COMMAND [ARGS...]'
    'Options:'
    '-n, --max MAX_SUBMITS 最多「總提交次數」,包含第一次(預設 100000)'
    '-t, --interval SECONDS 每次檢查間隔秒數(預設 10)'
    '-L, --log-file FILE 稽核日誌檔路徑(預設 $HOME/.bsubrepeater.log)'
    '-v, --verbose 顯示較多日誌'
    '-h, --help 顯示本說明'
    'Log format (TS,EVENT,COUNT,JOBID,CMD,BSUB_OUT; tab-separated; escaped):'
    '2025-09-05T12:34:56Z submitted 1 45782375 xeyes Job <45782375> is submitted to queue <...>.'
  )
  printf '%s\n' "${help_lines[@]}"
}
#######################################
# Print a timestamped message.
# Arguments: $@ - message words, joined into one string via "$*"
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout (local time)
#######################################
log() {
  local message="$*"
  local stamp
  # -1 asks printf's %(...)T for the current time.
  printf -v stamp '%(%F %T)T' -1
  printf '[%s] %s\n' "$stamp" "$message"
}
#######################################
# Log a message only when verbose mode is on.
# Globals:   VERBOSE (read)
# Arguments: $@ - message words, forwarded to log
# Returns:   0 when verbose mode is off; otherwise the status of log
#######################################
vlog() {
  # An `if` (rather than `(( VERBOSE )) && log`) keeps the return status 0 when
  # verbose mode is off, so a bare `vlog ...` call cannot trip `set -e`.
  if (( VERBOSE )); then
    log "$@"
  fi
}
# --- Argument parsing ---
# Options must come before COMMAND; everything from the first non-option word
# onward belongs to the command that is handed to bsub.
MAX_SUBMITS="$DEFAULT_MAX"
INTERVAL="$DEFAULT_INTERVAL"

# `getopt -T` exits with status 4 only for the enhanced (util-linux) getopt.
# Any other status (legacy getopt, or no getopt at all) selects the getopts
# fallback, which understands short options only.
getopt_kind=0
getopt -T >/dev/null 2>&1 || getopt_kind=$?

if (( getopt_kind == 4 )); then
  # The leading '+' in the short-option string stops scanning at the first
  # non-option argument, so options that belong to COMMAND (for example
  # `sleep -t 5`) are not consumed or rejected here.
  if ! PARSED=$(getopt -n "${0##*/}" -o +n:t:L:vh \
    -l max:,interval:,log-file:,verbose,help -- "$@"); then
    # getopt has already printed the specific error on stderr.
    usage
    exit 2
  fi
  # eval is the documented way to consume enhanced getopt's shell-quoted output.
  eval set -- "$PARSED"
  while true; do
    case "$1" in
      -n|--max) MAX_SUBMITS="$2"; shift 2 ;;
      -t|--interval) INTERVAL="$2"; shift 2 ;;
      -L|--log-file) LOG_FILE="$2"; shift 2 ;;
      -v|--verbose) VERBOSE=1; shift ;;
      -h|--help) usage; exit 0 ;;
      --) shift; break ;;
      *) usage; exit 2 ;;
    esac
  done
else
  # The leading ':' enables silent error reporting so the messages below
  # are printed instead of the getopts built-in ones.
  while getopts ":n:t:L:vh" opt; do
    case "$opt" in
      n) MAX_SUBMITS="$OPTARG" ;;
      t) INTERVAL="$OPTARG" ;;
      L) LOG_FILE="$OPTARG" ;;
      v) VERBOSE=1 ;;
      h) usage; exit 0 ;;
      \?) echo "未知選項: -$OPTARG" >&2; usage; exit 2 ;;
      :) echo "選項 -$OPTARG 需要參數" >&2; usage; exit 2 ;;
    esac
  done
  shift $((OPTIND-1))
fi
# --- Sanity checks ---
# A command to submit is mandatory.
if (( $# < 1 )); then
  usage
  exit 2
fi

# Both LSF client tools must be on PATH.
if ! command -v bsub >/dev/null; then
  echo "Error: bsub not found." >&2
  exit 127
fi
if ! command -v bjobs >/dev/null; then
  echo "Error: bjobs not found." >&2
  exit 127
fi

# --max and --interval must both be integers >= 1.
if ! [[ "$MAX_SUBMITS" =~ ^[0-9]+$ ]] || ! (( MAX_SUBMITS >= 1 )); then
  echo "Error: --max 必須為 >=1 的整數。" >&2
  exit 2
fi
if ! [[ "$INTERVAL" =~ ^[0-9]+$ ]] || ! (( INTERVAL >= 1 )); then
  echo "Error: --interval 必須為 >=1 的整數秒。" >&2
  exit 2
fi

# Whatever remains on the command line is the command handed to bsub.
CMD=("$@")
# --- Audit log helpers ---
#######################################
# Make sure the audit log can be appended to.
# Creates the parent directory and the log file if they are missing; existing
# log content is never truncated.
# Globals:   LOG_FILE (read)
# Arguments: none
# Outputs:   an error message on stderr on failure
# Returns:   0 when the log is writable, 1 otherwise
#######################################
ensure_log_ready() {
  local log_dir
  log_dir="$(dirname -- "$LOG_FILE")"
  if ! mkdir -p -- "$log_dir"; then
    echo "Error: cannot create log directory: $log_dir" >&2
    return 1
  fi
  # Opening in append mode creates the file when absent and leaves existing
  # entries intact. 2>/dev/null comes first so the shell's own redirection
  # error is replaced by the message below.
  if ! true 2>/dev/null >> "$LOG_FILE"; then
    echo "Error: cannot write to log file: $LOG_FILE" >&2
    return 1
  fi
}
#######################################
# Escape one log field so that every event stays on a single line.
# Backslash becomes \\, TAB becomes \t and newline becomes \n.
# Arguments: $1 - raw field text
# Outputs:   the escaped text on stdout, without a trailing newline
#######################################
escape_field() {
  local text="$1"
  # Backslashes go first so the escapes added below are not doubled again.
  text="${text//\\/\\\\}"
  text="${text//$'\t'/\\t}"
  text="${text//$'\n'/\\n}"
  printf '%s' "$text"
}
#######################################
# Append one event record to the audit log.
# The record is a single tab-separated line:
#   UTC timestamp, EVENT, COUNT, JOBID, CMD, BSUB_OUT
# EVENT, CMD and BSUB_OUT are passed through escape_field; COUNT and JOBID
# are written as given.
# Globals:   LOG_FILE (read), VERBOSE (read)
# Arguments: $1 - event name
#            $2 - submission count
#            $3 - job id ("-" when there is none)
#            $4 - command string
#            $5 - bsub output ("-" when there is none)
# Outputs:   one line appended to LOG_FILE; in verbose mode also a short
#            summary via log
# Returns:   0 on success; the failing status if the log cannot be written
#######################################
log_event() {
  local ts evt cnt jid cmdstr out
  evt="$1"; cnt="$2"; jid="$3"; cmdstr="$4"; out="$5"
  ts="$(date -u +%FT%TZ)"
  printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
    "$ts" "$(escape_field "$evt")" "$cnt" "$jid" \
    "$(escape_field "$cmdstr")" "$(escape_field "$out")" >> "$LOG_FILE" || return
  # An `if` (rather than `(( VERBOSE )) && log`) keeps the return status 0 when
  # verbose mode is off; otherwise every non-verbose call would return 1 and
  # abort the script under `set -e`.
  if (( VERBOSE )); then
    log "log> $evt count=$cnt job=$jid"
  fi
}
# --- LSF interaction ---
#######################################
# Submit CMD through `bsub -XF` and print the id of the new job.
# Callers capture stdout to obtain the job id, so stdout carries the id and
# nothing else; all tracing produced while submitting is sent to stderr.
# Globals:   CMD (read), count (read; treated as 0 when unset)
# Arguments: none
# Outputs:   the numeric job id on stdout; diagnostics on stderr; audit-log
#            records via log_event
# Returns:   0 on success, 1 if bsub fails or no job id can be found in its
#            output
#######################################
submit_and_get_id() {
  local out jobid
  local submits_so_far="${count:-0}"
  # The first <digits> group in bsub's output is taken as the job id.
  local id_re='<([0-9]+)>'

  # Verbose tracing is best-effort and must never fail the submission.
  vlog "Submitting: bsub ${CMD[*]}" >&2 || true

  if ! out=$(bsub -XF "${CMD[@]}" 2>&1); then
    log_event "submit_failed" "$submits_so_far" "-" "${CMD[*]}" "$out" >&2
    echo "Error: bsub 提交失敗:$out" >&2
    return 1
  fi

  # A Bash regex replaces the three-argument awk match(), which only gawk
  # supports.
  if [[ "$out" =~ $id_re ]]; then
    jobid="${BASH_REMATCH[1]}"
  else
    log_event "parse_jobid_failed" "$submits_so_far" "-" "${CMD[*]}" "$out" >&2
    echo "Error: 無法解析 job id:$out" >&2
    return 1
  fi

  log_event "submitted" "$((submits_so_far + 1))" "$jobid" "${CMD[*]}" "$out" >&2
  printf '%s\n' "$jobid"
}
#######################################
# Ask bjobs about a job id.
# Arguments: $1 - job id to query
# Outputs:   nothing; bjobs' stdout and stderr are discarded
# Returns:   the exit status of `bjobs -noheader <id>`
#######################################
job_exists() {
  local -r queried_id=$1
  bjobs -noheader "${queried_id}" &>/dev/null
}
# --- Main flow ---
# Submit the job once, then poll every INTERVAL seconds and re-submit whenever
# the current job is no longer reported by bjobs, until MAX_SUBMITS total
# submissions have been made.
ensure_log_ready
log_event "start" "0" "-" "${CMD[*]}" "-"

count=0
jobid="$(submit_and_get_id)" || exit $?
# Plain assignment instead of ((count++)): the post-increment evaluates to the
# old value, so with count=0 it returns status 1 and `set -e` would abort the
# script right after the first submission.
count=$((count + 1))

while (( count < MAX_SUBMITS )); do
  sleep "$INTERVAL"
  if job_exists "$jobid"; then
    log_event "still_active" "$count" "$jobid" "${CMD[*]}" "-"
    continue
  fi
  log_event "disappeared" "$count" "$jobid" "${CMD[*]}" "-"
  vlog "Job <$jobid> disappeared; re-submitting… ($((count+1))/$MAX_SUBMITS)"
  jobid="$(submit_and_get_id)" || exit $?
  count=$((count + 1))
done

log_event "max_reached_exit" "$count" "$jobid" "${CMD[*]}" "-"
vlog "Reached MAX_SUBMITS=$MAX_SUBMITS. Exiting."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment