|
#!/bin/bash |
|
#=============================================================================== |
|
# |
|
# monitor-job.sh - A universal monitoring wrapper for cron/batch jobs. |
|
# |
|
# This script runs a given command, measures its execution time and exit status, |
|
# then pushes metrics to a Prometheus PushGateway. |
|
# |
|
# It reports: |
|
# - job_status (1 for success, 0 for failure) |
|
# - job_duration_seconds (runtime in seconds) |
|
# - job_last_run_timestamp (Unix timestamp when the job started) |
|
# |
|
# Environment variables can be loaded from an .env file via the --env-file flag |
|
# (or from a default .env if it exists). Values loaded from the env file are used |
|
# as defaults, but any flags provided on the command line will override them. |
|
# |
|
# Usage: |
|
# ./monitor-job.sh [options] -- <command> |
|
# |
|
# Options: |
|
# -e, --env-file FILE Path to an env file (default: .env if exists) |
|
# -p, --pushgateway URL Set the PushGateway URL (default: http://localhost:9091) |
|
# -u, --user USER Set the basic auth username (optional) |
|
# -w, --pass PASS Set the basic auth password (optional) |
|
# -n, --job-name NAME Set the job name (default: default_job) |
|
# -i, --instance NAME Set the instance name (default: hostname) |
|
# -t, --timeout SECONDS Set a timeout for the command (optional) |
|
# -h, --help Display this help message |
|
# |
|
#=============================================================================== |
|
|
|
set -euo pipefail |
|
|
|
# --- Function to load environment variables from a file --- |
|
read_env() { |
|
local filePath="${1:-.env}" |
|
|
|
if [ ! -f "$filePath" ]; then |
|
echo "Error: missing ${filePath}" |
|
exit 1 |
|
fi |
|
|
|
echo "Reading ${filePath}" |
|
while read -r LINE; do |
|
# Remove leading/trailing whitespace and carriage returns |
|
CLEANED_LINE=$(echo "$LINE" | awk '{$1=$1};1' | tr -d '\r') |
|
# Skip comments and lines without an '=' sign. |
|
if [[ $CLEANED_LINE != '#'* ]] && [[ $CLEANED_LINE == *'='* ]]; then |
|
export "$CLEANED_LINE" |
|
fi |
|
done < "$filePath" |
|
} |
|
|
|
# --- Manually extract the env-file option --- |
|
ENV_FILE_ARG="" |
|
ARGS=() |
|
SKIP=0 |
|
for arg in "$@"; do |
|
if [ $SKIP -eq 1 ]; then |
|
SKIP=0 |
|
continue |
|
fi |
|
case "$arg" in |
|
-e|--env-file) |
|
# Store the next argument as the env file; skip it. |
|
ENV_FILE_ARG="$2" |
|
SKIP=1 |
|
;; |
|
*) |
|
ARGS+=("$arg") |
|
;; |
|
esac |
|
done |
|
|
|
# Load the env file if provided, or use .env if it exists. |
|
if [ -n "$ENV_FILE_ARG" ]; then |
|
read_env "$ENV_FILE_ARG" |
|
elif [ -f ".env" ]; then |
|
read_env ".env" |
|
fi |
|
|
|
# Reset positional parameters without the env-file option. |
|
set -- "${ARGS[@]}" |
|
|
|
# --- Set default variables using any values loaded from the env file --- |
|
: "${PUSHGATEWAY_URL:=http://localhost:9091}" |
|
: "${PUSHGATEWAY_USER:=}" |
|
: "${PUSHGATEWAY_PASS:=}" |
|
: "${JOB_NAME:=default_job}" |
|
: "${INSTANCE:=$(hostname)}" |
|
: "${TIMEOUT:=}" |
|
: "${METRIC_PREFIX:=job}" |
|
|
|
# --- Now use getopt to parse the remaining flags (override defaults) --- |
|
TEMP=$(getopt -o p:u:w:n:i:t:h -l pushgateway:,user:,pass:,job-name:,instance:,timeout:,help -n "$0" -- "$@") |
|
if [ $? != 0 ]; then |
|
echo "Error in command line arguments." >&2 |
|
exit 1 |
|
fi |
|
eval set -- "$TEMP" |
|
|
|
# Parse remaining command-line options; these override the defaults. |
|
while true; do |
|
case "$1" in |
|
-p|--pushgateway) |
|
PUSHGATEWAY_URL="$2" |
|
shift 2 ;; |
|
-u|--user) |
|
PUSHGATEWAY_USER="$2" |
|
shift 2 ;; |
|
-w|--pass) |
|
PUSHGATEWAY_PASS="$2" |
|
shift 2 ;; |
|
-n|--job-name) |
|
JOB_NAME="$2" |
|
shift 2 ;; |
|
-i|--instance) |
|
INSTANCE="$2" |
|
shift 2 ;; |
|
-t|--timeout) |
|
TIMEOUT="$2" |
|
shift 2 ;; |
|
-h|--help) |
|
cat <<EOF |
|
Usage: $0 [options] -- <command> |
|
Options: |
|
-e, --env-file FILE Path to an env file (default: .env if exists) |
|
-p, --pushgateway URL Set the PushGateway URL (default: ${PUSHGATEWAY_URL}) |
|
-u, --user USER Set basic auth username for PushGateway (optional) |
|
-w, --pass PASS Set basic auth password for PushGateway (optional) |
|
-n, --job-name NAME Set the job name (default: ${JOB_NAME}) |
|
-i, --instance NAME Set the instance name (default: ${INSTANCE}) |
|
-t, --timeout SECONDS Set timeout (in seconds) for the command (optional) |
|
-h, --help Display this help message |
|
EOF |
|
exit 0 ;; |
|
--) |
|
shift |
|
break ;; |
|
*) |
|
echo "Unknown option: $1" >&2 |
|
exit 1 ;; |
|
esac |
|
done |
|
|
|
# Ensure a command is provided after the "--" |
|
if [ "$#" -eq 0 ]; then |
|
echo "Error: No command provided to execute." |
|
exit 1 |
|
fi |
|
|
|
# The command to run is all the remaining arguments. |
|
CMD=("$@") |
|
|
|
# Record the start time (high resolution) and the Unix timestamp. |
|
START_TIME=$(date +%s.%N) |
|
START_TIME_INT=$(date +%s) |
|
|
|
# Execute the command (with timeout if specified) |
|
if [ -n "$TIMEOUT" ]; then |
|
timeout "$TIMEOUT" "${CMD[@]}" |
|
EXIT_CODE=$? |
|
else |
|
"${CMD[@]}" |
|
EXIT_CODE=$? |
|
fi |
|
|
|
# Record the end time and calculate the duration. |
|
END_TIME=$(date +%s.%N) |
|
DURATION=$(echo "$END_TIME - $START_TIME" | bc) |
|
|
|
# Set job_status: 1 if success, 0 otherwise. |
|
if [ "$EXIT_CODE" -eq 0 ]; then |
|
JOB_STATUS=1 |
|
else |
|
JOB_STATUS=0 |
|
fi |
|
|
|
# Prepare the metrics in Prometheus text exposition format. |
|
METRICS=$(cat <<EOF |
|
# TYPE ${METRIC_PREFIX}_status gauge |
|
${METRIC_PREFIX}_status{job="${JOB_NAME}",instance="${INSTANCE}"} ${JOB_STATUS} |
|
# TYPE ${METRIC_PREFIX}_duration_seconds gauge |
|
${METRIC_PREFIX}_duration_seconds{job="${JOB_NAME}",instance="${INSTANCE}"} ${DURATION} |
|
# TYPE ${METRIC_PREFIX}_last_run_timestamp gauge |
|
${METRIC_PREFIX}_last_run_timestamp{job="${JOB_NAME}",instance="${INSTANCE}"} ${START_TIME_INT} |
|
EOF |
|
) |
|
|
|
# Optionally, print the metrics for debugging. |
|
echo "Pushing the following metrics to ${PUSHGATEWAY_URL}/metrics/job/${JOB_NAME}/instance/${INSTANCE}:" |
|
echo "$METRICS" |
|
|
|
# Build the curl command to push metrics. |
|
CURL_CMD=(curl --silent --show-error --fail --data-binary @- "${PUSHGATEWAY_URL}/metrics/job/${JOB_NAME}/instance/${INSTANCE}") |
|
|
|
# Add basic auth if provided. |
|
if [ -n "$PUSHGATEWAY_USER" ] && [ -n "$PUSHGATEWAY_PASS" ]; then |
|
CURL_CMD+=(--user "${PUSHGATEWAY_USER}:${PUSHGATEWAY_PASS}") |
|
fi |
|
|
|
# Push the metrics to the PushGateway. |
|
echo "$METRICS" | "${CURL_CMD[@]}" |
|
|
|
# Exit with the same status as the executed command. |
|
exit $EXIT_CODE |