Skip to content

Instantly share code, notes, and snippets.

@JPablomr
Created May 24, 2016 15:46
Show Gist options
  • Save JPablomr/d0769ebdec58e8a67308513ab2ff822a to your computer and use it in GitHub Desktop.
Save JPablomr/d0769ebdec58e8a67308513ab2ff822a to your computer and use it in GitHub Desktop.
Scout's MySQL Replication Monitor, allowing for more than one replication failure before reporting it
require 'time'
require 'date'
# Scout plugin that checks MySQL replication by running `show slave status`
# on the configured slave host.
#
# Every run reports two fields:
#   "Seconds Behind Master" - the value read from the slave status row, or -1
#                             when replication was not seen running this poll
#   "Replication Running"   - 1 while replication is treated as up, 0 otherwise
#
# A down poll is only reported as 0 once the number of consecutive down polls
# reaches the `fail_count` option. Failures first detected inside the
# configured ignore window are reported as up.
#
# NOTE(review): option, memory, remember, report, error and needs come from
# Scout::Plugin, which is not visible in this file. This class assumes that
# memory returns nil for a key that was never remembered and that option
# values may arrive as strings -- confirm against the Scout plugin API.
class MysqlReplicationMonitor < Scout::Plugin
  needs 'mysql2'

  OPTIONS=<<-EOS
  host:
    name: Host
    notes: The slave host to monitor
    default: 127.0.0.1
  port:
    name: Port
    notes: The port number on the slave host
    default: 3306
  username:
    name: Username
    notes: The MySQL username to use
    default: root
  password:
    name: Password
    notes: The password for the mysql user
    default:
    attributes: password
  socket:
    name: MySQL socket
    notes: Specify the location of the MySQL socket
  ignore_window_start:
    name: Ignore Window Start
    notes: Time to start ignoring replication failures. Useful for disabling replication for backups. For Example, 7:00pm
    default:
  ignore_window_end:
    name: Ignore Window End
    notes: Time to resume notifications on replication failure. For Example, 2:00am
    default:
  default_file:
    name: Mysql Default File
    notes: Optional path to the MySQL default file. For Example, /home/scout/.my.cnf
    default:
  fail_count:
    name: Maximum failed polls
    notes: Number of failed polls that have to happen before alerting
    default: 1
  EOS

  attr_accessor :connection

  # Connects to the slave, inspects the first row of `show slave status` and
  # reports the two fields described on the class.
  #
  # The :down_at memory key holds the time at which the current outage was
  # first detected; it is cleared again once both replication threads report
  # "Yes". A Mysql2::Error is reported through error() unless the current time
  # is inside the ignore window.
  def build_report
    res = { "Seconds Behind Master" => -1, "Replication Running" => 0 }
    begin
      port = option(:port)
      default_file = option(:default_file)
      # A blank default file must not be handed to the client.
      default_file = nil if default_file.nil? || default_file.empty?

      self.connection = Mysql2::Client.new(
        :host => option(:host),
        :username => option(:username),
        :password => option(:password),
        :port => (port.nil? ? nil : port.to_i),
        :socket => option(:socket),
        :default_file => default_file
      )
      slave_status = connection.query("show slave status").first
      down_at = memory(:down_at)

      if slave_status.nil?
        # An empty result set means there is no replication on this host.
        error("Replication not configured")
      else
        lag = slave_status["Seconds_Behind_Master"]
        threads_running = slave_status["Slave_IO_Running"] == "Yes" &&
                          slave_status["Slave_SQL_Running"] == "Yes"

        if !down_at && (lag.nil? || !threads_running)
          # Outage detected for the first time.
          if in_ignore_window?
            res["Replication Running"] = replication_check("up")
          else
            res["Replication Running"] = replication_check("down")
            down_at = Time.now
          end
        elsif threads_running
          res["Seconds Behind Master"] = lag
          res["Replication Running"] = replication_check("up")
          down_at = nil
        else
          # Outage already known from an earlier poll: keep counting so the
          # fail_count threshold is honoured and the counter is carried over.
          res["Replication Running"] = replication_check("down")
        end
      end
      remember(:down_at, down_at)
    rescue Mysql2::Error => e
      if in_ignore_window?
        res["Replication Running"] = replication_check("up")
      else
        error("Unable to connect to MySQL", e.to_s)
      end
    ensure
      # Do not leave the client socket open after the poll.
      connection.close if connection.respond_to?(:close)
    end
    report(res)
  end

  # Translates a poll result into the value reported as "Replication Running"
  # and maintains the consecutive-failure counter in plugin memory.
  #
  # status - "up" resets the counter; any other value counts as a failed poll.
  #
  # Returns 1 while replication is up or the number of failed polls is still
  # below the fail_count option, 0 once the threshold has been reached.
  def replication_check(status)
    if status == "up"
      failed_poll_count = 0
      return_code = 1
    else
      # to_i covers the first run, when nothing has been remembered yet.
      failed_poll_count = memory(:failed_poll_count).to_i + 1
      # to_i because the option may be supplied as a string.
      max_failed_polls = option(:fail_count).to_i
      return_code = failed_poll_count >= max_failed_polls ? 0 : 1
    end
    # Remember the failures for next run
    remember(:failed_poll_count, failed_poll_count)
    return_code
  end

  # Tells whether the current time falls inside the window configured through
  # the ignore_window_start / ignore_window_end options. Both values are parsed
  # as times of day on today's date; when the start is not earlier than the
  # end, the window is treated as wrapping around midnight.
  #
  # Returns false when either option is missing or blank.
  def in_ignore_window?
    window_start = option(:ignore_window_start).to_s.strip
    window_end = option(:ignore_window_end).to_s.strip
    return false if window_start.empty? || window_end.empty?

    today = Date.today
    now = Time.now
    start_time = Time.parse("#{today} #{window_start}")
    end_time = Time.parse("#{today} #{window_end}")

    if start_time < end_time
      now > start_time && now < end_time
    else
      now > start_time || now < end_time
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment