Last active
September 5, 2025 09:19
-
-
Save githubhjs/994d2e7253d2504cf75c8565a554a5fb to your computer and use it in GitHub Desktop.
bsubrepeater: repeatedly (re)submit an LSF job when it disappears from bjobs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# bsubrepeater with audit log: re-submit an LSF job whenever it disappears.

# Strict mode: abort on unhandled errors (-e), let functions inherit the ERR
# trap (-E), reject unset variables (-u), and fail a pipeline when any stage
# fails (pipefail).
set -E -e -u
set -o pipefail

# Built-in defaults; the command-line options override them later.
DEFAULT_MAX=100000   # maximum total number of submissions
DEFAULT_INTERVAL=10  # seconds between two polls
VERBOSE=0            # 1 enables the extra console messages

# Audit log path: BSUBREPEATER_LOG_FILE wins when set and non-empty,
# otherwise a dot-file in the user's home directory is used.
LOG_FILE="${BSUBREPEATER_LOG_FILE:-${HOME}/.bsubrepeater.log}"
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the help text on stdout, one entry per line
#######################################
usage() {
  # Every line is a single-quoted literal, so $HOME is printed verbatim
  # rather than expanded.
  printf '%s\n' \
    'Usage:' \
    'bsubrepeater [-n MAX_SUBMITS] [-t SECONDS] [-L FILE] [-v] COMMAND [ARGS...]' \
    'Options:' \
    '-n, --max MAX_SUBMITS 最多「總提交次數」,包含第一次(預設 100000)' \
    '-t, --interval SECONDS 每次檢查間隔秒數(預設 10)' \
    '-L, --log-file FILE 稽核日誌檔路徑(預設 $HOME/.bsubrepeater.log)' \
    '-v, --verbose 顯示較多日誌' \
    '-h, --help 顯示本說明' \
    'Log format (TS,EVENT,COUNT,JOBID,CMD,BSUB_OUT; tab-separated; escaped):' \
    '2025-09-05T12:34:56Z submitted 1 45782375 xeyes Job <45782375> is submitted to queue <...>.'
}
#######################################
# Print a timestamped console message.
# Arguments: $@ - message words, joined with single spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stderr
#######################################
log() {
  # Diagnostics go to stderr: this helper is reached from code that runs
  # inside a command substitution whose stdout is captured as the job id, so
  # anything printed on stdout would corrupt that value.
  printf '[%(%F %T)T] %s\n' -1 "$*" >&2
}
#######################################
# Print a console message only when verbose mode is on.
# Globals:   VERBOSE (read)
# Arguments: $@ - message words, passed on to log
# Outputs:   the formatted message on stderr when VERBOSE is non-zero
# Returns:   0 when VERBOSE is 0; otherwise the status of log
#######################################
vlog() {
  # An `if` is used instead of `(( VERBOSE )) && log ...`: the short-circuit
  # form leaves status 1 when VERBOSE is 0, which aborts the script under
  # `set -e` whenever vlog is called as a plain top-level command.
  if (( VERBOSE )); then
    # Forced to stderr so a caller capturing stdout never sees log text.
    log "$@" >&2
  fi
}
# --- Argument parsing ---
MAX_SUBMITS="$DEFAULT_MAX"
INTERVAL="$DEFAULT_INTERVAL"

#######################################
# Parse bsubrepeater's own options and collect the command to submit.
# Parsing stops at the first non-option word, so options that belong to
# COMMAND (for example `sleep -n 5` or `mycmd -v`) are never consumed here.
# Globals:   MAX_SUBMITS, INTERVAL, LOG_FILE, VERBOSE (written when the
#            matching option is given); REST_ARGS (always rewritten)
# Arguments: the script's command-line arguments
# Outputs:   help text / error messages for -h and for invalid options
# Returns:   0 on success; exits 0 after --help and 2 on a usage error
#######################################
parse_args() {
  local getopt_rc=0 parsed opt
  local OPTIND=1
  REST_ARGS=()

  # `getopt -T` exits with status 4 only for the enhanced (util-linux)
  # implementation; any other getopt cannot handle the long-option syntax
  # used below, so the getopts fallback is taken instead.
  getopt -T >/dev/null 2>&1 || getopt_rc=$?

  # The leading '+' in the short-option string makes getopt stop at the first
  # non-option argument instead of permuting the command's own options to
  # the front.
  if (( getopt_rc == 4 )) \
    && parsed=$(getopt -o +n:t:L:vh \
      -l max:,interval:,log-file:,verbose,help -- "$@"); then
    # Standard idiom for re-reading getopt's shell-quoted output.
    eval set -- "$parsed"
    while true; do
      case "$1" in
        -n|--max) MAX_SUBMITS="$2"; shift 2 ;;
        -t|--interval) INTERVAL="$2"; shift 2 ;;
        -L|--log-file) LOG_FILE="$2"; shift 2 ;;
        -v|--verbose) VERBOSE=1; shift ;;
        -h|--help) usage; exit 0 ;;
        --) shift; break ;;
        *) usage; exit 2 ;;
      esac
    done
  else
    # Short-option fallback; also reached when enhanced getopt rejected the
    # arguments, in which case getopts reports the problem and exits.
    while getopts ":n:t:L:vh" opt; do
      case "$opt" in
        n) MAX_SUBMITS="$OPTARG" ;;
        t) INTERVAL="$OPTARG" ;;
        L) LOG_FILE="$OPTARG" ;;
        v) VERBOSE=1 ;;
        h) usage; exit 0 ;;
        \?) echo "未知選項: -$OPTARG" >&2; usage; exit 2 ;;
        :) echo "選項 -$OPTARG 需要參數" >&2; usage; exit 2 ;;
      esac
    done
    shift $((OPTIND-1))
  fi

  REST_ARGS=("$@")
}

parse_args "$@"
# Leave only COMMAND [ARGS...] in the positional parameters. The ${var+...}
# guard keeps an empty array from tripping `set -u` on older bash releases.
set -- ${REST_ARGS[@]+"${REST_ARGS[@]}"}
# --- Sanity checks ---
# A command to submit is mandatory.
if (( $# < 1 )); then
  usage
  exit 2
fi

# Both LSF client tools must be reachable through PATH.
if ! command -v bsub >/dev/null; then
  echo "Error: bsub not found." >&2
  exit 127
fi
if ! command -v bjobs >/dev/null; then
  echo "Error: bjobs not found." >&2
  exit 127
fi

# Both numeric settings must be plain decimal digit strings with a value of
# at least 1.
if ! [[ "$MAX_SUBMITS" =~ ^[0-9]+$ ]] || ! (( MAX_SUBMITS >= 1 )); then
  echo "Error: --max 必須為 >=1 的整數。" >&2
  exit 2
fi
if ! [[ "$INTERVAL" =~ ^[0-9]+$ ]] || ! (( INTERVAL >= 1 )); then
  echo "Error: --interval 必須為 >=1 的整數秒。" >&2
  exit 2
fi

# Keep the command as an array so every argument survives word-for-word.
CMD=("$@")
# --- Audit log helpers ---
#######################################
# Create the directory that will hold the audit log, if it is missing.
# Globals:   LOG_FILE (read)
# Arguments: none
# Returns:   0 once the directory exists
#######################################
ensure_log_ready() {
  local parent_dir
  parent_dir="$(dirname -- "$LOG_FILE")"
  mkdir -p -- "$parent_dir"
  # Finish with an explicit success status.
  return 0
}
#######################################
# Escape one log field so a record always stays on a single line.
# Backslash becomes \\, TAB becomes \t and newline becomes \n.
# Arguments: $1 - raw field text
# Outputs:   the escaped text on stdout, without a trailing newline
#######################################
escape_field() {
  local field="$1"
  # Backslashes are doubled first so the escapes added below stay unambiguous.
  field="${field//\\/\\\\}"
  field="${field//$'\t'/\\t}"
  field="${field//$'\n'/\\n}"
  printf '%s' "$field"
}
#######################################
# Append one audit record to LOG_FILE as a single tab-separated line:
#   TIMESTAMP(UTC)  EVENT  COUNT  JOBID  CMD  BSUB_OUT
# EVENT, CMD and BSUB_OUT are passed through escape_field.
# Globals:   LOG_FILE (appended to), VERBOSE (read)
# Arguments: $1 - event name
#            $2 - submission count
#            $3 - job id
#            $4 - command string
#            $5 - bsub output
# Outputs:   in verbose mode, a short notice on stderr
# Returns:   the verbose check itself never yields a non-zero status
#######################################
log_event() {
  local ts evt cnt jid cmdstr out
  ts="$(date -u +%FT%TZ)"
  evt="$1"; cnt="$2"; jid="$3"; cmdstr="$4"; out="$5"
  printf '%s\t%s\t%s\t%s\t%s\t%s\n' \
    "$ts" "$(escape_field "$evt")" "$cnt" "$jid" \
    "$(escape_field "$cmdstr")" "$(escape_field "$out")" >> "$LOG_FILE"
  # An `if` is used instead of `(( VERBOSE )) && log ...`: as the function's
  # last command, the short-circuit form returns 1 when VERBOSE is 0 and the
  # calling script then aborts under `set -e`.
  if (( VERBOSE )); then
    # Forced to stderr: this function also runs inside the command
    # substitution that captures the job id from stdout.
    log "log> $evt count=$cnt job=$jid" >&2
  fi
}
# --- LSF interaction ---
#######################################
# Submit CMD through `bsub -XF` and print the id of the new job.
# Globals:   CMD (read), count (read; used for the audit records)
# Arguments: none
# Outputs:   the numeric job id on stdout (callers capture it, so nothing
#            else may be written there); error messages on stderr
# Returns:   0 on success, 1 when bsub fails or no job id can be found in
#            its output
#######################################
submit_and_get_id() {
  local out jobid
  local id_re='<([0-9]+)>'
  vlog "Submitting: bsub ${CMD[*]}"
  if ! out=$(bsub -XF "${CMD[@]}" 2>&1); then
    log_event "submit_failed" "$count" "-" "${CMD[*]}" "$out"
    echo "Error: bsub 提交失敗:$out" >&2
    return 1
  fi
  # Take the first <digits> token of the bsub output. A bash regex is used
  # rather than awk's three-argument match(), which only gawk provides.
  jobid=""
  if [[ "$out" =~ $id_re ]]; then
    jobid="${BASH_REMATCH[1]}"
  fi
  if [[ -z "$jobid" ]]; then
    log_event "parse_jobid_failed" "$count" "-" "${CMD[*]}" "$out"
    echo "Error: 無法解析 job id:$out" >&2
    return 1
  fi
  # The record carries count+1 because the caller increments its counter
  # only after this function has returned.
  log_event "submitted" "$((count+1))" "$jobid" "${CMD[*]}" "$out"
  printf '%s\n' "$jobid"
}
#######################################
# Ask bjobs whether it still reports the given job.
# Arguments: $1 - job id
# Outputs:   nothing; all bjobs output is discarded
# Returns:   the exit status of `bjobs -noheader <id>`
#######################################
job_exists() {
  bjobs -noheader "$1" &>/dev/null
}
# --- Main flow ---
# Submit the command once, then poll every INTERVAL seconds; whenever the
# current job is no longer reported by bjobs, submit again, until
# MAX_SUBMITS total submissions have been made. A failed submission aborts
# the script: the failing command substitution trips `set -e`.
ensure_log_ready
log_event "start" "0" "-" "${CMD[*]}" "-"

count=0
jobid="$(submit_and_get_id)"
# Plain assignment instead of ((count++)): the post-increment expression
# evaluates to 0 on the first pass, which gives status 1 and would abort the
# script under `set -e` right after the first submission.
count=$((count + 1))

while (( count < MAX_SUBMITS )); do
  sleep "$INTERVAL"
  if job_exists "$jobid"; then
    log_event "still_active" "$count" "$jobid" "${CMD[*]}" "-"
    continue
  fi
  log_event "disappeared" "$count" "$jobid" "${CMD[*]}" "-"
  vlog "Job <$jobid> disappeared; re-submitting… ($((count+1))/$MAX_SUBMITS)"
  jobid="$(submit_and_get_id)"
  count=$((count + 1))
done

log_event "max_reached_exit" "$count" "$jobid" "${CMD[*]}" "-"
vlog "Reached MAX_SUBMITS=$MAX_SUBMITS. Exiting."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment