-
-
Save Fedalto/5658639 to your computer and use it in GitHub Desktop.
The MIT License (MIT) | |
Copyright (c) 2015 Leonardo Fedalto | |
Permission is hereby granted, free of charge, to any person obtaining a copy | |
of this software and associated documentation files (the "Software"), to deal | |
in the Software without restriction, including without limitation the rights | |
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the Software is | |
furnished to do so, subject to the following conditions: | |
The above copyright notice and this permission notice shall be included in all | |
copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
SOFTWARE. |
#!/bin/bash
# This script gets the list of active cores and issues a ping request to them.
# If a single request fails, the check will return ERROR.
# All requests must return OK.

# Local variables
# Exit status reported at the end of the run; checks raise it to 2 on failure.
EXIT_CODE=0
SOLR_HOST="localhost"
SOLR_HOST_PORT="8983"
# The last successful replication must be at most this many seconds old.
REPLICATION_TIME_TOLERANCE=900 # = 15 min
# Accumulated report text; entries end with a literal "\n" that is expanded
# when the report is printed.
OUT=""

# Treat any use of an unset variable as an error.
set -u
# Abort with an UNKNOWN message when an option is missing its value.
# Arguments: the remaining command line, starting at the option itself.
# Exits:     2 when no value follows the option.
_require_option_value() {
  if [ "$#" -lt 2 ]; then
    echo "UNKNOWN: option \"$1\" requires an argument"
    exit 2
  fi
}

# Parse the command line into the global settings.
# Globals:   SOLR_HOST, SOLR_HOST_PORT, REPLICATION_TIME_TOLERANCE (written)
# Arguments: the script's command-line arguments
# Exits:     1 after printing usage/manual (no arguments, --help, --man),
#            2 when an option lacks its value or the host ends up empty.
parse_args() {
  # If no arguments are given, show the usage extracted from this script's POD.
  if [ "$#" -eq 0 ]; then
    pod2usage "$0"
    exit 1
  fi

  # Parse arguments
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --help|-h|-\?)
        pod2usage -verbose 1 "$0"
        exit 1
        ;;
      --man)
        pod2usage -verbose 2 "$0"
        exit 1
        ;;
      --host|-H)
        # Without this guard "$2" would trip `set -u` with a cryptic
        # "unbound variable" error instead of a usable message.
        _require_option_value "$@"
        SOLR_HOST="$2"
        shift 2
        ;;
      --port|-P)
        _require_option_value "$@"
        SOLR_HOST_PORT="$2"
        shift 2
        ;;
      --diff|-D)
        _require_option_value "$@"
        REPLICATION_TIME_TOLERANCE="$2"
        shift 2
        ;;
      *)
        # Non option argument: stop parsing.
        break
        ;;
    esac
  done

  # An explicitly empty host (e.g. --host "") cannot be queried.
  [ -n "$SOLR_HOST" ] || {
    echo "UNKNOWN: --host argument is not set"
    exit 2
  }
}
# Perform a GET request against the Solr HTTP API and print the response body.
# Globals:   SOLR_HOST, SOLR_HOST_PORT (read)
# Arguments: $1 - path below /solr/ (may include a query string)
# Outputs:   response body on stdout; a CRITICAL message on stderr on failure
# Exits:     2 when curl fails. When this function runs inside a command
#            substitution only that subshell exits, so callers must check the
#            substitution's exit status themselves.
call_solr_api() {
  if ! curl --retry 3 --max-time 15 --fail --silent \
      "http://${SOLR_HOST}:${SOLR_HOST_PORT}/solr/${1}"; then
    echo "CRITICAL: server \"$SOLR_HOST\" is not responding or returned incorrect data." >&2
    exit 2
  fi
}
# Fetch the core administration status document ("admin/cores") from Solr.
# Outputs: whatever call_solr_api prints for that path.
get_cores_status() {
  call_solr_api "admin/cores"
}
# Fetch the replication details document of one core.
# Arguments: $1 - core name
# Outputs:   whatever call_solr_api prints for "<core>/replication?command=details".
get_replication_details() {
  call_solr_api "${1}/replication?command=details"
}
# Send a ping request to one core.
# Arguments: $1 - core name
# Outputs:   whatever call_solr_api prints for "<core>/admin/ping".
ping_solr_core() {
  call_solr_api "${1}/admin/ping"
}
# Extract the core names from a cores-status XML document.
# Globals:   SOLR_HOST (read, for the error message)
# Arguments: $1 - XML text of the cores status response
# Outputs:   one core name per line on stdout; on failure the CRITICAL
#            message is written to stdout instead.
# Exits:     2 when no core name is found. Inside a command substitution only
#            the subshell exits, so the caller must check the exit status and
#            treat the captured text as the error message.
get_solr_cores() {
  local core_names
  # Pretty-print so every <str name="name"> element sits on its own line,
  # then strip the indentation and the surrounding tags.
  core_names=$(xmllint --format - <<< "$1" \
    | grep '<str name="name"' \
    | sed -e 's/[ ]*<str name="name">//g' -e 's/<\/str>//g')

  if [ -z "$core_names" ]; then
    echo "CRITICAL: server \"$SOLR_HOST\" returned an empty list of cores."
    exit 2
  fi

  echo "$core_names"
}
# Tell whether a replication details document describes a master core.
# Arguments: $1 - XML text of the replication details response
# Returns:   0 when the isMaster element's value is exactly "true",
#            non-zero otherwise (including when the element is absent).
is_master() {
  local master_flag
  master_flag=$(xmlstarlet sel -t -v "/response/lst[@name='details']/str[@name='isMaster']" <<< "$1")
  [[ "$master_flag" == "true" ]]
}
# Report every core listed under "initFailures" in the cores status document.
# Globals:   OUT (appended), EXIT_CODE (set to 2 when a failure is found)
# Arguments: $1 - XML text of the cores status response
check_init_failures() {
  local cores_status="$1"
  local failed_cores bad_core problem

  # One failed core name per line; blank lines are dropped.
  failed_cores=$(xmlstarlet sel -t \
    -m "/response/lst[@name='initFailures']/str" -m "@name" -v . -n \
    <<< "$cores_status" | grep -v '^$')

  while IFS= read -r bad_core; do
    # The here-string yields one empty line when nothing failed.
    [ -n "$bad_core" ] || continue
    problem=$(xmlstarlet sel -t \
      -v "/response/lst[@name='initFailures']/str[@name='$bad_core']" \
      <<< "$cores_status")
    OUT="${OUT}Core \"$bad_core\" could not initialize: ${problem}\n"
    EXIT_CODE=2
  done <<< "$failed_cores"
}
# Ping a master core and record the status it reports.
# Globals:   OUT (appended), EXIT_CODE (set to 2 unless the status is "OK")
# Arguments: $1 - core name
check_master_core() {
  local core="$1"
  local ping_response ping_status

  ping_response=$(ping_solr_core "$core")
  ping_status=$(xmlstarlet sel -t -v "/response/str[@name='status']" <<< "$ping_response")

  OUT="${OUT}Core \"${core}\" returned \"${ping_status}\".\n"
  # A failed or unparsable ping leaves the status empty, which also counts
  # as a failure here.
  if [ "$ping_status" != "OK" ]; then
    EXIT_CODE=2
  fi
}
# Verify the replication health of a slave core.
#
# Checks, in order:
#   1) the master URL is defined;
#   2) the slave knows the master's index version;
#   3) the core has replicated at least once (a first replication that is
#      still in progress is accepted);
#   4) the last successful replication is newer than
#      REPLICATION_TIME_TOLERANCE seconds, or the master and slave index
#      versions match.
#
# Globals:   REPLICATION_TIME_TOLERANCE (read), OUT (appended),
#            EXIT_CODE (set to 2 on any failed check)
# Arguments: $1 - core name
#            $2 - XML text of the core's replication details response
check_slave_core() {
  local core="$1"
  local replication_details="$2"
  local slave_xpath="/response/lst[@name='details']/lst[@name='slave']"
  local master_url master_index_version slave_index_version
  local replicated_at failed_at is_replicating
  local last_replication_epoch tolerance_epoch

  # URL of the corresponding master core.
  master_url=$(xmlstarlet sel -t \
    -v "${slave_xpath}/str[@name='masterUrl']" <<< "$replication_details")
  # Index version of the master as seen by the slave.
  master_index_version=$(xmlstarlet sel -t \
    -v "${slave_xpath}/lst[@name='masterDetails']/long[@name='indexVersion']" \
    <<< "$replication_details")
  # Index version of the slave itself.
  slave_index_version=$(xmlstarlet sel -t \
    -v "/response/lst[@name='details']/long[@name='indexVersion']" \
    <<< "$replication_details")
  # Last time the core replicated correctly.
  replicated_at=$(xmlstarlet sel -t \
    -v "${slave_xpath}/arr[@name='indexReplicatedAtList']/str[1]" \
    <<< "$replication_details")
  # Last time the core failed to replicate.
  failed_at=$(xmlstarlet sel -t \
    -v "${slave_xpath}/arr[@name='replicationFailedAtList']/str[1]" \
    <<< "$replication_details")
  # Is this core pulling the index from the master right now?
  is_replicating=$(xmlstarlet sel -t \
    -v "${slave_xpath}/str[@name='isReplicating']" <<< "$replication_details")

  # Each failed check returns immediately. `return` (not `continue`) is
  # required: there is no loop in this function, so `continue` would only
  # print an error and fall through to the next check.

  # masterUrl is not set, replication is broken.
  if [[ -z "$master_url" ]]; then
    OUT="${OUT}Core \"${core}\" has no masterUrl set, solr.xml misconfiguration or orphaned core.\n"
    EXIT_CODE=2
    return
  fi

  # Slave could not get the master index version. Maybe the master is down
  # or there are network issues.
  if [[ -z "$master_index_version" ]]; then
    OUT="${OUT}Core \"$core\" could not get master index version.\n"
    EXIT_CODE=2
    return
  fi

  # No success and no failure recorded yet, but a replication is running:
  # this is the very first replication, which is not an error.
  if [[ -z "$failed_at" && -z "$replicated_at" && "$is_replicating" == "true" ]]; then
    OUT="${OUT}Core \"${core}\" is replicating for the first time.\n"
    return
  fi

  # The core has never replicated successfully, this is an error.
  if [[ -z "$replicated_at" ]]; then
    OUT="${OUT}Core \"${core}\" has problems replicating.\n"
    EXIT_CODE=2
    return
  fi

  # Convert the last success and the oldest acceptable moment to epoch
  # seconds. `date -d` is GNU syntax; if either conversion fails the value
  # stays empty and only the index versions are compared below.
  last_replication_epoch=$(date -d "$replicated_at" +%s)
  tolerance_epoch=$(date -d "now - $REPLICATION_TIME_TOLERANCE seconds" +%s)

  # Healthy when the last replication is recent enough, or when the slave
  # already has the same index version as the master.
  if [[ -n "$last_replication_epoch" && -n "$tolerance_epoch" \
        && "$last_replication_epoch" -gt "$tolerance_epoch" ]] \
      || [[ "$master_index_version" == "$slave_index_version" ]]; then
    OUT="${OUT}Core \"${core}\" is up to date.\n"
  else
    # Slave is outdated.
    OUT="${OUT}Core \"${core}\" is behind master more then $REPLICATION_TIME_TOLERANCE seconds.\n"
    EXIT_CODE=2
  fi
}
# Run the whole check: parse options, list the cores, check each one, then
# print the accumulated report and exit with the resulting status.
# Globals:   OUT, EXIT_CODE (read; written by the check_* helpers)
# Arguments: the script's command-line arguments
# Exits:     EXIT_CODE, or 2 as soon as the core list cannot be obtained.
main() {
  parse_args "$@"

  # Declarations are kept separate from the command substitutions below:
  # `local var=$(cmd)` would report the status of `local` and hide failures.
  local cores_status solr_cores replication_details core

  # The helper reports the reason on stderr; its `exit` only ends the
  # substitution's subshell, so the status has to be acted on here.
  cores_status=$(get_cores_status) || exit 2

  # On failure the captured text is the helper's error message, not a list
  # of cores, so relay it instead of iterating over its words.
  if ! solr_cores=$(get_solr_cores "$cores_status"); then
    [ -z "$solr_cores" ] || echo "$solr_cores"
    exit 2
  fi

  check_init_failures "$cores_status"

  # Core names arrive one per line. They are read from fd 3 so commands in
  # the loop body cannot consume the list through stdin.
  while IFS= read -r core <&3; do
    [ -n "$core" ] || continue
    if ! replication_details=$(get_replication_details "$core"); then
      OUT="${OUT}Core \"${core}\" did not return replication details.\n"
      EXIT_CODE=2
      continue
    fi
    if is_master "$replication_details"; then
      check_master_core "$core"
    else
      check_slave_core "$core" "$replication_details"
    fi
  done 3<<< "$solr_cores"

  # Report entries end with a literal "\n"; %b expands those escapes.
  printf '%b' "$OUT"
  exit "$EXIT_CODE"
}
# Entry point: pass all command-line arguments to main, which ends the script
# with `exit`, so the text after this line is never executed by the shell.
main "$@"
__END__

=pod

=head1 NAME

solr-monitor - Simple script to check the status of all defined cores on a solr server

=head1 SYNOPSIS

solr-monitor [OPTIONS]

=head1 OPTIONS

=over 4

=item B<--help> | B<-h>

Print the brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--host> | B<-H> HOST

Check this host instead of localhost.

=item B<--port> | B<-P> Port

Use this port instead of the default (8983) to connect.

=item B<--diff> | B<-D> Time difference between now and when solr last replicated

Use this option to set the maximum difference in seconds between the time when the solr slave replicated and now.

=back

Use '--' to separate options and argument if it starts with '-'.

=head1 DESCRIPTION

Simple script to run that will query the solr server for a list of defined cores
and then verify that all of them respond to ping requests.

=head1 EXAMPLES

  solr-monitor --host qa-c1-solrmst1
  solr-monitor -H qa-c1-solrmst1 -P 80 -D 900

=head1 AUTHOR

Alexander V. Chykysh <[email protected]>

Leonardo Fedalto <[email protected]>

=cut
Hi, I can help you improve this.
Let me know if you are interested.
skype : pal.rajesh
Hi, I am trying to use this script in dev server. I have only edited Line 68 "http://${SOLR_HOST}:${SOLR_HOST_PORT}/solr/${1}" to http://172.35.43.10:9001/solr/. The other local host and port detail configurations are same. While executing the script with ./checksolr.sh --host localhost, i am getting this error:
-:162.1: Premature end of data in tag link line 43
-:162.1: Premature end of data in tag link line 42
-:162.1: Premature end of data in tag link line 41
-:162.1: Premature end of data in tag link line 40
-:162.1: Premature end of data in tag link line 39
-:162.1: Premature end of data in tag link line 38
-:162.1: Premature end of data in tag link line 37
-:162.1: Premature end of data in tag link line 36
-:162.1: Premature end of data in tag link line 35
-:162.1: Premature end of data in tag link line 34
-:162.1: Premature end of data in tag link line 33
-:162.1: Premature end of data in tag link line 32
-:162.1: Premature end of data in tag link line 31
-:162.1: Premature end of data in tag link line 30
-:162.1: Premature end of data in tag link line 29
-:162.1: Premature end of data in tag link line 28
-:162.1: Premature end of data in tag link line 27
-:162.1: Premature end of data in tag link line 25
-:162.1: Premature end of data in tag head line 21
-:162.1: Premature end of data in tag html line 2
-:56.8: Opening and ending tag mismatch: meta line 45 and head
^ -:82.17: Entity 'nbsp' not defined ^ -:111.59: Opening and ending tag mismatch: p line 111 and li operties" class="global">
Java Properties ^ -:130.15: Opening and ending tag mismatch: ul line 90 and div ^ -:160.8: Opening and ending tag mismatch: div line 59 and body ^ -:161.8: Opening and ending tag mismatch: body line 57 and html ^ -:162.1: Premature end of data in tag link line 43 -:162.1: Premature end of data in tag link line 42 -:162.1: Premature end of data in tag link line 41 -:162.1: Premature end of data in tag link line 40 -:162.1: Premature end of data in tag link line 39 -:162.1: Premature end of data in tag link line 38 -:162.1: Premature end of data in tag link line 37 -:162.1: Premature end of data in tag link line 36 -:162.1: Premature end of data in tag link line 35 -:162.1: Premature end of data in tag link line 34 -:162.1: Premature end of data in tag link line 33 -:162.1: Premature end of data in tag link line 32 -:162.1: Premature end of data in tag link line 31 -:162.1: Premature end of data in tag link line 30 -:162.1: Premature end of data in tag link line 29 -:162.1: Premature end of data in tag link line 28 -:162.1: Premature end of data in tag link line 27 -:162.1: Premature end of data in tag link line 25 -:162.1: Premature end of data in tag head line 21 -:162.1: Premature end of data in tag html line 2 -:56.8: Opening and ending tag mismatch: meta line 45 and head ^ -:65.19: Entity 'nbsp' not defined
^ -:82.17: Entity 'nbsp' not defined ^ -:111.59: Opening and ending tag mismatch: p line 111 and li operties" class="global">
Java Properties ^ -:130.15: Opening and ending tag mismatch: ul line 90 and div ^ -:160.8: Opening and ending tag mismatch: div line 59 and body ^ -:161.8: Opening and ending tag mismatch: body line 57 and html ^ -:162.1: Premature end of data in tag link line 43 -:162.1: Premature end of data in tag link line 42 -:162.1: Premature end of data in tag link line 41 -:162.1: Premature end of data in tag link line 40 -:162.1: Premature end of data in tag link line 39 -:162.1: Premature end of data in tag link line 38 -:162.1: Premature end of data in tag link line 37 -:162.1: Premature end of data in tag link line 36 -:162.1: Premature end of data in tag link line 35 -:162.1: Premature end of data in tag link line 34 -:162.1: Premature end of data in tag link line 33 -:162.1: Premature end of data in tag link line 32 -:162.1: Premature end of data in tag link line 31 -:162.1: Premature end of data in tag link line 30 -:162.1: Premature end of data in tag link line 29 -:162.1: Premature end of data in tag link line 28 -:162.1: Premature end of data in tag link line 27 -:162.1: Premature end of data in tag link line 25 -:162.1: Premature end of data in tag head line 21 -:162.1: Premature end of data in tag html line 2 -:56.8: Opening and ending tag mismatch: meta line 45 and head ^ -:65.19: Entity 'nbsp' not defined
^ -:82.17: Entity 'nbsp' not defined ^ -:111.59: Opening and ending tag mismatch: p line 111 and li operties" class="global">
Java Properties ^ -:130.15: Opening and ending tag mismatch: ul line 90 and div ^ -:160.8: Opening and ending tag mismatch: div line 59 and body ^ -:161.8: Opening and ending tag mismatch: body line 57 and html ^ -:162.1: Premature end of data in tag link line 43 -:162.1: Premature end of data in tag link line 42 -:162.1: Premature end of data in tag link line 41 -:162.1: Premature end of data in tag link line 40 -:162.1: Premature end of data in tag link line 39 -:162.1: Premature end of data in tag link line 38 -:162.1: Premature end of data in tag link line 37 -:162.1: Premature end of data in tag link line 36 -:162.1: Premature end of data in tag link line 35 -:162.1: Premature end of data in tag link line 34 -:162.1: Premature end of data in tag link line 33 -:162.1: Premature end of data in tag link line 32 -:162.1: Premature end of data in tag link line 31 -:162.1: Premature end of data in tag link line 30 -:162.1: Premature end of data in tag link line 29 -:162.1: Premature end of data in tag link line 28 -:162.1: Premature end of data in tag link line 27 -:162.1: Premature end of data in tag link line 25 -:162.1: Premature end of data in tag head line 21 -:162.1: Premature end of data in tag html line 2
.Core "CRITICAL:" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "server" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core ""localhost"" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "returned" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "an" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "empty" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "list" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "of" has no masterUrl set, solr.xml misconfiguration or orphaned core.
.Core "cores." has no masterUrl set, solr.xml misconfiguration or orphaned core.[ecommadm@k1c20050so101vl solr_script]$ ls -ltr
total 8
-rwxr-xr-x 1 ecommadm sorianahyb 8027 Apr 7 00:44 checksolr.sh
[ecommadm@k1c20050so101vl solr_script]$
Can someone please help me in resolving this?
Dependencies: curl, xmllint, xmlstarlet.
Tested on Linux and OS X, Solr 4.1 API.
By default, it will check Solr cores in localhost on port 8983 and will have a replication tolerance of 15 minutes.
Meaning that if a slave core is behind the master and didn't replicate in 15 minutes, the script will return 2.
Return codes:
Example usage: