Skip to content

Instantly share code, notes, and snippets.

@jazzl0ver
Created May 2, 2024 13:48
Show Gist options
  • Save jazzl0ver/625dc440f07413da719528a4e41ab3ce to your computer and use it in GitHub Desktop.
Save jazzl0ver/625dc440f07413da719528a4e41ab3ce to your computer and use it in GitHub Desktop.
#!/bin/bash
################################################################################
# Script: check_es_system.sh #
# Author: Claudio Kuenzler www.claudiokuenzler.com #
# Purpose: Monitor ElasticSearch Store (Disk) Usage #
# Docs: www.claudiokuenzler.com/monitoring-plugins/check_es_system.php #
# License: GPLv2 #
# GNU General Public Licence (GPL) http://www.gnu.org/ #
# This program is free software; you can redistribute it and/or #
# modify it under the terms of the GNU General Public License #
# as published by the Free Software Foundation; either version 2 #
# of the License, or (at your option) any later version. #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with this program; if not, see <https://www.gnu.org/licenses/>. #
# #
# Copyright 2016,2018-2021,2023 Claudio Kuenzler #
# Copyright 2018 Tomas Barton #
# Copyright 2020 NotAProfessionalDeveloper #
# Copyright 2020 tatref #
# Copyright 2020 fbomj #
# Copyright 2021 chicco27 #
# Copyright 2024 jazzl0ver #
# #
# History/Changelog: #
# 20160429: Started programming plugin #
# 20160601: Continued programming. Working now as it should =) #
# 20160906: Added memory usage check, check types option (-t) #
# 20160906: Renamed plugin from check_es_store to check_es_system #
# 20160907: Change internal referenced variable name for available size #
# 20160907: Output now contains both used and available sizes #
# 20161017: Add missing -t in usage output #
# 20180105: Fix if statement for authentication (@deric) #
# 20180105: Fix authentication when wrong credentials were used #
# 20180313: Configure max_time for Elastic to respond (@deric) #
# 20190219: Fix alternative subject name in ssl (issue 4), direct to auth #
# 20190220: Added status check type #
# 20190403: Check for mandatory parameter checktype, adjust help #
# 20190403: Catch connection refused error #
# 20190426: Catch unauthorized (403) error #
# 20190626: Added readonly check type #
# 20190905: Catch empty cluster health status (issue #13) #
# 20190909: Added jthreads and tps (thread pool stats) check types #
# 20190909: Handle correct curl return codes #
# 20190924: Missing 'than' in tps output #
# 20191104: Added master check type #
# 20200401: Fix/handle 503 errors with curl exit code 0 (issue #20) #
# 20200409: Fix 503 error lookup (issue #22) #
# 20200430: Support both jshon and jq as json parsers (issue #18) #
# 20200609: Fix readonly check on ALL indices (issue #26) #
# 20200723: Add cluster name to status output #
# 20200824: Fix typo in readonly check output #
# 20200916: Internal renaming of -i parameter, use for tps check (issue #28) #
# 20201110: Fix thresholds in jthreads check #
# 20201125: Show names of read_only indexes with jq, set jq as default parser #
# 20210616: Fix authentication bug (#38) and non ES URL responding (#39) #
# 20211202: Added local node (-L), SSL settings (-K, -E), cpu check #
# 20230929: Bugfix in readonly check type for missing privileges #
# 20240502: Added max shards check #
################################################################################
#Variables and defaults
STATE_OK=0 # define the exit code if status is OK
STATE_WARNING=1 # define the exit code if status is Warning
STATE_CRITICAL=2 # define the exit code if status is Critical
STATE_UNKNOWN=3 # define the exit code if status is Unknown
export PATH=$PATH:/usr/local/bin:/usr/bin:/bin # Set path
version=1.12.1
port=9200
httpscheme=http
unit=G
include='_all'
max_time=30
parsers=(jq jshon)
################################################################################
#Functions
help () {
echo -e "$0 $version (c) 2016-$(date +%Y) Claudio Kuenzler and contributors (open source rulez!)
Usage: ./check_es_system.sh -H ESNode [-P port] [-S] [-u user -p pass|-E cert -K key] -t checktype [-o unit] [-w int] [-c int] [-m int] [-e string] [-X parser]
Options:
* -H Hostname or ip address of ElasticSearch Node
-L Run check on local node instead of cluster
-P Port (defaults to 9200)
-S Use https
-E Certs for Authentication
-K Key for Authentication
-u Username if authentication is required
-p Password if authentication is required
* -t Type of check (disk, mem, cpu, status, readonly, jthreads, tps, master)
-o Disk space unit (K|M|G) (defaults to G)
-i Space separated list of included object names to be checked (index names on readonly check, pool names on tps check)
-w Warning threshold (see usage notes below)
-c Critical threshold (see usage notes below)
-m Maximum time in seconds to wait for response (default: 30)
-e Expect master node (used with 'master' check)
-X The json parser to be used jshon or jq (default: jq)
-h Help!
*mandatory options
Threshold format for 'disk', 'mem' and 'cpu': int (for percent), defaults to 80 (warn) and 95 (crit)
Threshold format for 'tps': int,int,int (active, queued, rejected), no defaults
Threshold format for all other check types': int, no defaults
Requirements: curl, expr and one of $(IFS=,; echo "${parsers[*]}")"
exit $STATE_UNKNOWN;
}
authlogic () {
if [[ -z $user ]] && [[ -z $pass ]]; then echo "ES SYSTEM UNKNOWN - Authentication required but missing username and password"; exit $STATE_UNKNOWN
elif [[ -n $user ]] && [[ -z $pass ]]; then echo "ES SYSTEM UNKNOWN - Authentication required but missing password"; exit $STATE_UNKNOWN
elif [[ -n $pass ]] && [[ -z $user ]]; then echo "ES SYSTEM UNKNOWN - Missing username"; exit $STATE_UNKNOWN
fi
}
authlogic_cert () {
if [[ -z $cert ]] && [[ -z $key ]]; then echo "ES SYSTEM UNKNOWN - Authentication required but missing cert and key"; exit $STATE_UNKNOWN
elif [[ -n $cert ]] && [[ -z $key ]]; then echo "ES SYSTEM UNKNOWN - Authentication required but missing key"; exit $STATE_UNKNOWN
elif [[ -n $key ]] && [[ -z $cert ]]; then echo "ES SYSTEM UNKNOWN - Missing cert"; exit $STATE_UNKNOWN
fi
}
unitcalc() {
# ES presents the currently used disk space in Bytes
if [[ -n $unit ]]; then
case $unit in
K) availsize=$(expr $available / 1024); outputsize=$(expr ${size} / 1024);;
M) availsize=$(expr $available / 1024 / 1024); outputsize=$(expr ${size} / 1024 / 1024);;
G) availsize=$(expr $available / 1024 / 1024 / 1024); outputsize=$(expr ${size} / 1024 / 1024 / 1024);;
esac
if [[ -n $warning ]] ; then
warningsize=$(expr $warning \* ${available} / 100)
fi
if [[ -n $critical ]] ; then
criticalsize=$(expr $critical \* ${available} / 100)
fi
usedpercent=$(expr $size \* 100 / $available)
else echo "UNKNOWN - Shouldnt exit here. No units given"; exit $STATE_UNKNOWN
fi
}
thresholdlogic () {
if [ -n $warning ] && [ -z $critical ]; then echo "UNKNOWN - Define both warning and critical thresholds"; exit $STATE_UNKNOWN; fi
if [ -n $critical ] && [ -z $warning ]; then echo "UNKNOWN - Define both warning and critical thresholds"; exit $STATE_UNKNOWN; fi
}
default_percentage_thresholds() {
if [ -z $warning ] || [ "${warning}" = "" ]; then warning=80; fi
if [ -z $critical ] || [ "${critical}" = "" ]; then critical=95; fi
}
json_parse() {
json_parse_usage() { echo "$0: [-r] [-q] [-c] [-a] -x arg1 -x arg2 ..." 1>&2; exit; }
local OPTIND opt r q c a x
while getopts ":rqcax:" opt
do
case "${opt}" in
r) raw=1;;
q) quiet=1;; # only required for jshon
c) continue=1;; # only required for jshon
a) across=1;;
x) args+=("$OPTARG");;
*) json_parse_usage;;
esac
done
case ${parser} in
jshon)
cmd=()
for arg in "${args[@]}"; do
cmd+=(-e $arg)
done
jshon ${quiet:+-Q} ${continue:+-C} ${across:+-a} "${cmd[@]}" ${raw:+-u}
;;
jq)
cmd=()
for arg in "${args[@]}"; do
cmd+=(.$arg)
done
jq ${raw:+-r} $(IFS=; echo ${across:+.[]}"${cmd[*]}")
;;
esac
}
################################################################################
# Check for people who need help - aren't we all nice ;-)
if [ "${1}" = "--help" -o "${#}" = "0" ]; then help; exit $STATE_UNKNOWN; fi
################################################################################
# Get user-given variables
while getopts "H:LP:SE:K:u:p:d:o:i:w:c:t:m:e:X:" Input
do
case ${Input} in
H) host=${OPTARG};;
L) local=true;;
P) port=${OPTARG};;
S) httpscheme=https;;
E) cert=${OPTARG};;
K) key=${OPTARG};;
u) user=${OPTARG};;
p) pass=${OPTARG};;
d) oldavailable=${OPTARG};;
o) unit=${OPTARG};;
i) include=${OPTARG};;
w) warning=${OPTARG};;
c) critical=${OPTARG};;
t) checktype=${OPTARG};;
m) max_time=${OPTARG};;
e) expect_master=${OPTARG};;
X) parser=${OPTARG:=jq};;
*) help;;
esac
done
# Check for mandatory opts
if [[ -z ${host} ]]; then help; exit $STATE_UNKNOWN; fi
if [[ -z ${checktype} ]]; then help; exit $STATE_UNKNOWN; fi
# Check for deprecated opts
if [[ -n ${oldavailable} ]]; then
echo "ES SYSTEM UNKNOWN: -d parameter is now invalid. Capacities are now discovered directly from Elasticsearch."
exit ${STATE_UNKNOWN}
fi
# Local checks are only useful for certain check types
if [[ -n ${local} ]] && ( ! [[ ${checktype} =~ ^(cpu|mem|disk|jthreads)$ ]] ); then
echo "ES SYSTEM UNKNOWN: Node local checks (-L) only work with the following check types: cpu, mem, disk, jthreads"
exit ${STATE_UNKNOWN}
fi
################################################################################
# Check requirements
for cmd in curl expr ${parser}; do
if ! `which ${cmd} >/dev/null 2>&1`; then
echo "UNKNOWN: ${cmd} does not exist, please check if command exists and PATH is correct"
exit ${STATE_UNKNOWN}
fi
done
# Find parser
if [ -z ${parser} ]; then
for cmd in ${parsers[@]}; do
if `which ${cmd} >/dev/null 2>&1`; then
parser=${cmd}
break
fi
done
if [ -z "${parser}" ]; then
echo "UNKNOWN: No JSON parser found. Either one of the following is required: $(IFS=,; echo "${parsers[*]}")"
exit ${STATE_UNKNOWN}
fi
fi
################################################################################
# Retrieve information from Elasticsearch cluster
getstatus() {
if [[ ${local} ]]; then
esurl="${httpscheme}://${host}:${port}/_nodes/_local/stats"
else
esurl="${httpscheme}://${host}:${port}/_cluster/stats"
fi
eshealthurl="${httpscheme}://${host}:${port}/_cluster/health"
essettingsurl="${httpscheme}://${host}:${port}/_cluster/settings?include_defaults=true&filter_path=persistent.cluster.max_shards_per_node"
if [[ -z $user ]] && [[ -z $cert ]]; then
# Without authentication
esstatus=$(curl -k -s --max-time ${max_time} $esurl)
esstatusrc=$?
if [[ $esstatusrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $esstatusrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
elif [[ "$esstatus" =~ "503 Service Unavailable" ]]; then
echo "ES SYSTEM CRITICAL - Elasticsearch not available: ${host}:${port} return error 503"
exit $STATE_CRITICAL
elif [[ "$esstatus" =~ "Unknown resource" ]]; then
echo "ES SYSTEM CRITICAL - Elasticsearch not available: ${esstatus}"
exit $STATE_CRITICAL
elif ! [[ "$esstatus" =~ "cluster_name" ]]; then
echo "ES SYSTEM CRITICAL - Elasticsearch not available at this address ${host}:${port}"
exit $STATE_CRITICAL
fi
# Additionally get cluster health infos
if [ $checktype = status ]; then
eshealth=$(curl -k -s --max-time ${max_time} $eshealthurl)
if [[ -z $eshealth ]]; then
echo "ES SYSTEM CRITICAL - unable to get cluster health information"
exit $STATE_CRITICAL
fi
essettings=$(curl -k -s --max-time ${max_time} $essettingsurl)
if [[ -z $essettings ]]; then
echo "ES SYSTEM CRITICAL - unable to get cluster settings"
exit $STATE_CRITICAL
fi
fi
fi
if [[ -n $user ]] || [[ -n $(echo $esstatus | grep -i authentication) ]] ; then
# Authentication required
authlogic
esstatus=$(curl -k -s --max-time ${max_time} --basic -u ${user}:${pass} $esurl)
esstatusrc=$?
if [[ $esstatusrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $esstatusrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
elif [[ "$esstatus" =~ "503 Service Unavailable" ]]; then
echo "ES SYSTEM CRITICAL - Elasticsearch not available: ${host}:${port} return error 503"
exit $STATE_CRITICAL
elif [[ "$esstatus" =~ "Unknown resource" ]]; then
echo "ES SYSTEM CRITICAL - Elasticsearch not available: ${esstatus}"
exit $STATE_CRITICAL
elif [[ -n $(echo "$esstatus" | grep -i "unable to authenticate") ]]; then
echo "ES SYSTEM CRITICAL - Unable to authenticate user $user for REST request"
exit $STATE_CRITICAL
elif [[ -n $(echo "$esstatus" | grep -i "unauthorized") ]]; then
echo "ES SYSTEM CRITICAL - User $user is unauthorized"
exit $STATE_CRITICAL
elif ! [[ "$esstatus" =~ "cluster_name" ]]; then
echo "ES SYSTEM CRITICAL - Elasticsearch not available at this address ${host}:${port}"
exit $STATE_CRITICAL
fi
# Additionally get cluster health infos
if [[ $checktype = status ]]; then
eshealth=$(curl -k -s --max-time ${max_time} --basic -u ${user}:${pass} $eshealthurl)
if [[ -z $eshealth ]]; then
echo "ES SYSTEM CRITICAL - unable to get cluster health information"
exit $STATE_CRITICAL
fi
essettings=$(curl -k -s --max-time ${max_time} --basic -u ${user}:${pass} $essettingsurl)
if [[ -z $essettings ]]; then
echo "ES SYSTEM CRITICAL - unable to get cluster settings"
exit $STATE_CRITICAL
fi
fi
fi
if [[ -n $cert ]] || [[ -n $(echo $esstatus | grep -i authentication) ]] ; then
# Authentication with certificate
authlogic_cert
esstatus=$(curl -k -s --max-time ${max_time} -E ${cert} --key ${key} $esurl)
esstatusrc=$?
if [[ $esstatusrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $esstatusrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
elif [[ "$esstatus" =~ "503 Service Unavailable" ]]; then
echo "ES SYSTEM CRITICAL - Elasticsearch not available: ${host}:${port} return error 503"
exit $STATE_CRITICAL
elif [[ -n $(echo "$esstatus" | grep -i "unable to authenticate") ]]; then
echo "ES SYSTEM CRITICAL - Unable to authenticate user $user for REST request"
exit $STATE_CRITICAL
elif [[ -n $(echo "$esstatus" | grep -i "unauthorized") ]]; then
echo "ES SYSTEM CRITICAL - User $user is unauthorized"
exit $STATE_CRITICAL
fi
# Additionally get cluster health infos
if [[ $checktype = status ]]; then
eshealth=$(curl -k -s --max-time ${max_time} -E ${cert} --key ${key} $eshealthurl)
if [[ -z $eshealth ]]; then
echo "ES SYSTEM CRITICAL - unable to get cluster health information"
exit $STATE_CRITICAL
fi
essettings=$(curl -k -s --max-time ${max_time} -E ${cert} --key ${key} $essettingsurl)
if [[ -z $essettings ]]; then
echo "ES SYSTEM CRITICAL - unable to get cluster settings"
exit $STATE_CRITICAL
fi
fi
fi
# Catch empty reply from server (typically happens when ssl port used with http connection)
if [[ -z $esstatus ]] || [[ $esstatus = '' ]]; then
echo "ES SYSTEM UNKNOWN - Empty reply from server (verify ssl settings)"
exit $STATE_UNKNOWN
fi
}
################################################################################
# Do the checks
case $checktype in
disk) # Check disk usage
getstatus
default_percentage_thresholds
if [[ ${local} ]]; then
size=$(echo $esstatus | json_parse -x 'nodes|' -x '[]' -x indices -x store -x size_in_bytes)
available=$(echo $esstatus | json_parse -x 'nodes|' -x '[]' -x fs -x total -x total_in_bytes)
else
size=$(echo $esstatus | json_parse -x indices -x store -x size_in_bytes)
available=$(echo $esstatus | json_parse -x nodes -x fs -x total_in_bytes)
fi
unitcalc
if [ -n "${warning}" ] || [ -n "${critical}" ]; then
# Handle tresholds
thresholdlogic
if [ $size -ge $criticalsize ]; then
echo "ES SYSTEM CRITICAL - Disk usage is at ${usedpercent}% ($outputsize $unit from $availsize $unit)|es_disk=${size}B;${warningsize};${criticalsize};0;${available}"
exit $STATE_CRITICAL
elif [ $size -ge $warningsize ]; then
echo "ES SYSTEM WARNING - Disk usage is at ${usedpercent}% ($outputsize $unit from $availsize $unit)|es_disk=${size}B;${warningsize};${criticalsize};0;${available}"
exit $STATE_WARNING
else
echo "ES SYSTEM OK - Disk usage is at ${usedpercent}% ($outputsize $unit from $availsize $unit)|es_disk=${size}B;${warningsize};${criticalsize};0;${available}"
exit $STATE_OK
fi
else
# No thresholds
echo "ES SYSTEM OK - Disk usage is at ${usedpercent}% ($outputsize $unit from $availsize $unit)|es_disk=${size}B;;;0;${available}"
exit $STATE_OK
fi
;;
mem) # Check memory usage
getstatus
default_percentage_thresholds
if [[ ${local} ]]; then
size=$(echo $esstatus | json_parse -x 'nodes|' -x '[]' -x jvm -x mem -x heap_used_in_bytes)
available=$(echo $esstatus | json_parse -x 'nodes|' -x '[]' -x jvm -x mem -x heap_max_in_bytes)
else
size=$(echo $esstatus | json_parse -x nodes -x jvm -x mem -x heap_used_in_bytes)
available=$(echo $esstatus | json_parse -x nodes -x jvm -x mem -x heap_max_in_bytes)
fi
unitcalc
if [ -n "${warning}" ] || [ -n "${critical}" ]; then
# Handle tresholds
thresholdlogic
if [ $size -ge $criticalsize ]; then
echo "ES SYSTEM CRITICAL - Memory usage is at ${usedpercent}% ($outputsize $unit) from $availsize $unit|es_memory=${size}B;${warningsize};${criticalsize};0;${available}"
exit $STATE_CRITICAL
elif [ $size -ge $warningsize ]; then
echo "ES SYSTEM WARNING - Memory usage is at ${usedpercent}% ($outputsize $unit from $availsize $unit)|es_memory=${size}B;${warningsize};${criticalsize};0;${available}"
exit $STATE_WARNING
else
echo "ES SYSTEM OK - Memory usage is at ${usedpercent}% ($outputsize $unit from $availsize $unit)|es_memory=${size}B;${warningsize};${criticalsize};0;${available}"
exit $STATE_OK
fi
else
# No thresholds
echo "ES SYSTEM OK - Memory usage is at ${usedpercent}% ($outputsize $unit from $availsize $unit)|es_memory=${size}B;;;0;${available}"
exit $STATE_OK
fi
;;
cpu) # Check memory usage
getstatus
default_percentage_thresholds
if [[ ${local} ]]; then
value=$(echo $esstatus | json_parse -x 'nodes|' -x '[]' -x process -x cpu -x percent)
else
value=$(echo $esstatus | json_parse -x nodes -x process -x cpu -x percent)
fi
if [ -n "${warning}" ] || [ -n "${critical}" ]; then
# Handle tresholds
thresholdlogic
if [ $value -ge $critical ]; then
echo "ES SYSTEM CRITICAL - CPU usage is at ${value}% |es_cpu=${value}%;${warning};${critical};0;100"
exit $STATE_CRITICAL
elif [ $value -ge $warning ]; then
echo "ES SYSTEM WARNING - CPU usage is at ${value}% |es_cpu=${value}%;${warning};${critical};0;100"
exit $STATE_WARNING
else
echo "ES SYSTEM OK - CPU usage is at ${value}% |es_cpu=${value}%;${warning};${critical};0;100"
exit $STATE_OK
fi
else
# No thresholds
echo "ES SYSTEM OK - CPU usage is at ${value}% |es_cpu=${value}%;${warning};${critical};0;100"
exit $STATE_OK
fi
;;
status) # Check Elasticsearch status
getstatus
status=$(echo $esstatus | json_parse -r -x status)
clustername=$(echo $esstatus | json_parse -r -x cluster_name)
shards=$(echo $esstatus | json_parse -r -x indices -x shards -x total)
docs=$(echo $esstatus | json_parse -r -x indices -x docs -x count)
nodest=$(echo $esstatus | json_parse -r -x nodes -x count -x total)
nodesd=$(echo $esstatus | json_parse -r -x nodes -x count -x data)
relocating=$(echo $eshealth | json_parse -r -x relocating_shards)
init=$(echo $eshealth | json_parse -r -x initializing_shards)
unass=$(echo $eshealth | json_parse -r -x unassigned_shards)
max_shards_per_node=$(echo $essettings | json_parse -r -x persistent.cluster.max_shards_per_node)
max_shards=$(($max_shards_per_node*$nodest))
max_shards_warn=$(($max_shards*98/100))
max_shards_crit=$(($max_shards*99/100))
if [ "$status" = "green" -a $shards -lt $max_shards_warn ]; then
echo "ES SYSTEM OK - Elasticsearch Cluster \"$clustername\" is $status (${nodest} nodes, ${nodesd} data nodes, ${shards}/$max_shards shards, ${docs} docs)|total_nodes=${nodest};;;; data_nodes=${nodesd};;;; total_shards=${shards};${max_shards_warn};${max_shards_crit};;${max_shards} relocating_shards=${relocating};;;; initializing_shards=${init};;;; unassigned_shards=${unass};;;; docs=${docs};;;;"
exit $STATE_OK
elif [ "$status" = "yellow" ] || [ $shards -ge $max_shards_warn -a $shards -lt $max_shards_crit ]; then
echo "ES SYSTEM WARNING - Elasticsearch Cluster \"$clustername\" is $status (${nodest} nodes, ${nodesd} data nodes, ${shards}/$max_shards shards, ${relocating} relocating shards, ${init} initializing shards, ${unass} unassigned shards, ${docs} docs)|total_nodes=${nodest};;;; data_nodes=${nodesd};;;; total_shards=${shards};${max_shards_warn};${max_shards_crit};;${max_shards} relocating_shards=${relocating};;;; initializing_shards=${init};;;; unassigned_shards=${unass};;;; docs=${docs};;;;"
exit $STATE_WARNING
elif [ "$status" = "red" -o $shards -ge $max_shards_crit ]; then
echo "ES SYSTEM CRITICAL - Elasticsearch Cluster \"$clustername\" is $status (${nodest} nodes, ${nodesd} data nodes, ${shards}/$max_shards shards, ${relocating} relocating shards, ${init} initializing shards, ${unass} unassigned shards, ${docs} docs)|total_nodes=${nodest};;;; data_nodes=${nodesd};;;; total_shards=${shards};${max_shards_warn};${max_shards_crit};;${max_shards} relocating_shards=${relocating};;;; initializing_shards=${init};;;; unassigned_shards=${unass};;;; docs=${docs};;;;"
exit $STATE_CRITICAL
fi
;;
readonly) # Check Readonly status on given indexes
getstatus
icount=0
for index in $include; do
if [[ -z $user ]]; then
# Without authentication
settings=$(curl -k -s --max-time ${max_time} ${httpscheme}://${host}:${port}/$index/_settings)
if [[ $? -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $? -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
elif [[ "$settings" =~ "is unauthorized" ]]; then
errormsg=$(echo "$settings" | json_parse -r -q -c -x error -x reason)
echo "ES SYSTEM CRITICAL - Access denied ($errormsg)"
exit $STATE_CRITICAL
fi
rocount=$(echo $settings | json_parse -r -q -c -a -x settings -x index -x blocks -x read_only | grep -c true)
roadcount=$(echo $settings | json_parse -r -q -c -a -x settings -x index -x blocks -x read_only_allow_delete | grep -c true)
if [[ $rocount -gt 0 ]]; then
output[${icount}]=" $index is read-only -"
roerror=true
fi
if [[ $roadcount -gt 0 ]]; then
output[${icount}]+=" $index is read-only (allow delete) -"
roerror=true
fi
fi
if [[ -n $user ]] || [[ -n $(echo $esstatus | grep -i authentication) ]] ; then
# Authentication required
authlogic
settings=$(curl -k -s --max-time ${max_time} --basic -u ${user}:${pass} ${httpscheme}://${host}:${port}/$index/_settings)
settingsrc=$?
if [[ $settingsrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $settingsrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
elif [[ -n $(echo "$settings" | grep -i "unable to authenticate") ]]; then
echo "ES SYSTEM CRITICAL - Unable to authenticate user $user for REST request"
exit $STATE_CRITICAL
elif [[ "$settings" =~ "is unauthorized" ]]; then
errormsg=$(echo "$settings" | json_parse -r -q -c -x error -x reason)
echo "ES SYSTEM CRITICAL - Access denied ($errormsg)"
exit $STATE_CRITICAL
fi
rocount=$(echo $settings | json_parse -r -q -c -a -x settings -x index -x blocks -x read_only | grep -c true)
roadcount=$(echo $settings | json_parse -r -q -c -a -x settings -x index -x blocks -x read_only_allow_delete | grep -c true)
if [[ $rocount -gt 0 ]]; then
if [[ "$index" = "_all" ]]; then
if [[ $parser = "jq" ]]; then
roindexes=$(echo $settings | jq -r '.[].settings.index |select(.blocks.read_only == "true").provided_name')
fi
output[${icount}]=" $rocount index(es) found read-only $roindexes -"
else output[${icount}]=" $index is read-only -"
fi
roerror=true
fi
if [[ $roadcount -gt 0 ]]; then
if [[ "$index" = "_all" ]]; then
if [[ $parser = "jq" ]]; then
roadindexes=$(echo $settings | jq -r '.[].settings.index |select(.blocks.read_only_allow_delete == "true").provided_name' | tr '\n' ' ')
fi
output[${icount}]+=" $roadcount index(es) found read-only (allow delete) $roadindexes"
else output[${icount}]+=" $index is read-only (allow delete) -"
fi
roerror=true
fi
fi
let icount++
done
if [[ $roerror ]]; then
echo "ES SYSTEM CRITICAL - ${output[*]}"
exit $STATE_CRITICAL
else
echo "ES SYSTEM OK - Elasticsearch Indexes ($include) are writeable"
exit $STATE_OK
fi
;;
jthreads) # Check JVM threads
getstatus
if [[ ${local} ]]; then
threads=$(echo $esstatus | json_parse -x 'nodes|' -x '[]' -x jvm -x threads -x count)
else
threads=$(echo $esstatus | json_parse -r -x nodes -x jvm -x "threads")
fi
if [ -n "${warning}" ] || [ -n "${critical}" ]; then
# Handle tresholds
thresholdlogic
if [[ $threads -ge $critical ]]; then
echo "ES SYSTEM CRITICAL - Number of JVM threads is ${threads}|es_jvm_threads=${threads};${warning};${critical};;"
exit $STATE_CRITICAL
elif [[ $threads -ge $warning ]]; then
echo "ES SYSTEM WARNING - Number of JVM threads is ${threads}|es_jvm_threads=${threads};${warning};${critical};;"
exit $STATE_WARNING
else
echo "ES SYSTEM OK - Number of JVM threads is ${threads}|es_jvm_threads=${threads};${warning};${critical};;"
exit $STATE_OK
fi
else
# No thresholds
echo "ES SYSTEM OK - Number of JVM threads is ${threads}|es_jvm_threads=${threads};${warning};${critical};;"
exit $STATE_OK
fi
;;
tps) # Check Thread Pool Statistics
getstatus
if [[ -z $user ]]; then
# Without authentication
threadpools=$(curl -k -s --max-time ${max_time} ${httpscheme}://${host}:${port}/_cat/thread_pool)
threadpoolrc=$?
if [[ $threadpoolrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $threadpoolrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
fi
fi
if [[ -n $user ]] || [[ -n $(echo $esstatus | grep -i authentication) ]] ; then
# Authentication required
authlogic
threadpools=$(curl -k -s --max-time ${max_time} --basic -u ${user}:${pass} ${httpscheme}://${host}:${port}/_cat/thread_pool)
threadpoolrc=$?
if [[ $threadpoolrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $threadpoolrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
elif [[ -n $(echo $esstatus | grep -i "unable to authenticate") ]]; then
echo "ES SYSTEM CRITICAL - Unable to authenticate user $user for REST request"
exit $STATE_CRITICAL
elif [[ -n $(echo $esstatus | grep -i "unauthorized") ]]; then
echo "ES SYSTEM CRITICAL - User $user is unauthorized"
exit $STATE_CRITICAL
fi
fi
if ! [[ $include = "_all" ]]; then
tpsgrep=$(echo "$include" | sed "s/ /|/g")
threadpools=$(echo "$threadpools" | egrep -i "(${tpsgrep})")
if [[ $(echo ${threadpools[*]}) = "" ]]; then
echo "Thread Pool check is critical: No thread pools found with given name(s): ${include}."
exit $STATE_CRITICAL
fi
fi
tpname=($(echo "$threadpools" | awk '{print $1"-"$2}' | sed "s/\n//g"))
tpactive=($(echo "$threadpools" | awk '{print $3}' | sed "s/\n//g"))
tpqueue=($(echo "$threadpools" | awk '{print $4}' | sed "s/\n//g"))
tprejected=($(echo "$threadpools" | awk '{print $5}' | sed "s/\n//g"))
if [ -n "${warning}" ] || [ -n "${critical}" ]; then
# Handle thresholds. They have to come in a special format: n,n,n (active, queue, rejected)
thresholdlogic
wactive=$(echo ${warning} | awk -F',' '{print $1}')
wqueue=$(echo ${warning} | awk -F',' '{print $2}')
wrejected=$(echo ${warning} | awk -F',' '{print $3}')
cactive=$(echo ${critical} | awk -F',' '{print $1}')
cqueue=$(echo ${critical} | awk -F',' '{print $2}')
crejected=$(echo ${critical} | awk -F',' '{print $3}')
i=0; for tp in ${tpname[*]}; do
perfdata[$i]="tp_${tp}_active=${tpactive[$i]};${wactive};${cactive};; tp_${tp}_queue=${tpqueue[$i]};${wqueue};${cqueue};; tp_${tp}_rejected=${tprejected[$i]};${wrejected};${crejected};; "
let i++
done
i=0
for tpa in $(echo ${tpactive[*]}); do
if [[ $tpa -ge $cactive ]]; then
echo "Thread Pool ${tpname[$i]} is critical: Active ($tpa) is equal or higher than threshold ($cactive)|${perfdata[*]}"
exit $STATE_CRITICAL
elif [[ $tpa -ge $wactive ]]; then
echo "Thread Pool ${tpname[$i]} is warning: Active ($tpa) is equal or higher than threshold ($wactive)|${perfdata[*]}"
exit $STATE_WARNING
fi
let i++
done
i=0
for tpq in $(echo ${tpqueue[*]}); do
if [[ $tpq -ge $cqueue ]]; then
echo "Thread Pool ${tpname[$i]} is critical: Queue ($tpq) is equal or higher than threshold ($cqueue)|${perfdata[*]}"
exit $STATE_CRITICAL
elif [[ $tpq -ge $wqueue ]]; then
echo "Thread Pool ${tpname[$i]} is warning: Queue ($tpq) is equal or higher than threshold ($wqueue)|${perfdata[*]}"
exit $STATE_WARNING
fi
let i++
done
i=0
for tpr in $(echo ${tprejected[*]}); do
if [[ $tpr -ge $crejected ]]; then
echo "Thread Pool ${tpname[$i]} is critical: Rejected ($tpr) is equal or higher than threshold ($crejected)|${perfdata[*]}"
exit $STATE_CRITICAL
elif [[ $tpr -ge $wrejected ]]; then
echo "Thread Pool ${tpname[$i]} is warning: Rejected ($tpr) is equal or higher than threshold ($wrejected)|${perfdata[*]}"
exit $STATE_WARNING
fi
let i++
done
echo "ES SYSTEM OK - Found ${#tpname[*]} thread pools in cluster|${perfdata[*]}"
exit $STATE_OK
fi
# No Thresholds
i=0; for tp in ${tpname[*]}; do
perfdata[$i]="tp_${tp}_active=${tpactive[$i]};;;; tp_${tp}_queue=${tpqueue[$i]};;;; tp_${tp}_rejected=${tprejected[$i]};;;; "
let i++
done
echo "ES SYSTEM OK - Found ${#tpname[*]} thread pools in cluster|${perfdata[*]}"
exit $STATE_OK
;;
master) # Check Cluster Master
getstatus
if [[ -z $user ]]; then
# Without authentication
master=$(curl -k -s --max-time ${max_time} ${httpscheme}://${host}:${port}/_cat/master)
masterrc=$?
if [[ $masterrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $masterrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
fi
fi
if [[ -n $user ]] || [[ -n $(echo $esstatus | grep -i authentication) ]] ; then
# Authentication required
authlogic
master=$(curl -k -s --max-time ${max_time} --basic -u ${user}:${pass} ${httpscheme}://${host}:${port}/_cat/master)
masterrc=$?
if [[ $threadpoolrc -eq 7 ]]; then
echo "ES SYSTEM CRITICAL - Failed to connect to ${host} port ${port}: Connection refused"
exit $STATE_CRITICAL
elif [[ $threadpoolrc -eq 28 ]]; then
echo "ES SYSTEM CRITICAL - server did not respond within ${max_time} seconds"
exit $STATE_CRITICAL
elif [[ -n $(echo $esstatus | grep -i "unable to authenticate") ]]; then
echo "ES SYSTEM CRITICAL - Unable to authenticate user $user for REST request"
exit $STATE_CRITICAL
elif [[ -n $(echo $esstatus | grep -i "unauthorized") ]]; then
echo "ES SYSTEM CRITICAL - User $user is unauthorized"
exit $STATE_CRITICAL
fi
fi
masternode=$(echo "$master" | awk '{print $NF}')
if [[ -n ${expect_master} ]]; then
if [[ "${expect_master}" = "${masternode}" ]]; then
echo "ES SYSTEM OK - Master node is $masternode"
exit $STATE_OK
else
echo "ES SYSTEM WARNING - Master node is $masternode but expected ${expect_master}"
exit $STATE_WARNING
fi
else
echo "ES SYSTEM OK - Master node is $masternode"
exit $STATE_OK
fi
;;
*) help
esac
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment