Skip to content

Instantly share code, notes, and snippets.

@majormoses
Last active September 1, 2018 00:32
Show Gist options
  • Save majormoses/6b670730551bf59c60165b19a3afdefe to your computer and use it in GitHub Desktop.
Save majormoses/6b670730551bf59c60165b19a3afdefe to your computer and use it in GitHub Desktop.
How to manage checks with Chef, along with a collection of various Sensu configuration artifacts such as filters, handler configs, etc.
# generic to the service
# Shared lookups used by the per-service check definitions that follow.
namespace = node['MY_CUSTOM_NAMESPACE']
dependencies = namespace['service']['common']['vars']['dependencies']
monitoring = namespace['sensu']['monitoring']

consul_url = "http://#{monitoring['consul']['host']}:8500"
# `service` keeps the dashed host name; `role` is the same value with underscores.
service = dependencies['APPLICATION_ADMIN_PORTAL_HOST']
role = service.tr('-', '_')
port = dependencies['APPLICATION_ADMIN_PORTAL_ADMIN_PORT']
blacklist = monitoring['healthcheck_alert']['blacklist']

expected_healthy = namespace['sensu']['consul'][role]['expected_healthy_instances']
healthy_instances_warning = expected_healthy['warning']
healthy_instances_critical = expected_healthy['critical']
# process checks
# "<role>_process": runs check-process.rb with a pattern for the service's
# `java ... -jar` command line and routes results to pagerduty and remediator.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']["#{role}_process"].tap do |check|
  jar_pattern = "java .* -jar .*/MY-CUSTOM-NAMESPACE/(\\b#{role})/(\\b#{role.tr('_', '.')}).*.jar"

  check['command'] = "check-process.rb -p '#{jar_pattern}'"
  check['handlers'] = %w[pagerduty remediator]
  check['subscribers'] = [role]
  check['interval'] = 15
  check['additional'] = {
    'pager_team' => 'non_urgent',
    'notification' => "No #{role} java process is running",
    'occurrences' => 4,
    # keyed by the name of the remediation check to trigger
    # NOTE(review): presumably consumed by the 'remediator' handler - confirm
    'remediation' => {
      "#{role}_process_remediate" => {
        'occurrences' => ['1-5'],
        'severities' => [2]
      }
    },
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# remediation un-published check, which does the actual restart
# The command restarts the service via `service <role> restart`, run through
# sudo as the configured service user. 'publish' is false for this check.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']["#{role}_process_remediate"].tap do |check|
  service_user = node['MY_CUSTOM_NAMESPACE']['service']['common']['USER']

  check['command'] = "sudo -u #{service_user} service #{role} restart"
  check['handlers'] = ['pagerduty']
  # subscribed by role and by this node's own client subscription
  check['subscribers'] = [role, "client:#{node.name}"]
  check['publish'] = false
  check['standalone'] = false
  check['interval'] = 10
  check['additional'] = {
    'pager_team' => 'non_urgent',
    'notification' => "Remediate failed: Can not start #{role} service",
    'occurrences' => 3
  }
end
# multiple processes
# "<role>_multi": runs check-process.rb with count thresholds (-c 1 -W 0 -C 0)
# against the service's `java ... -jar` command line.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']["#{role}_multi"].tap do |check|
  jar_pattern = "java .* -jar .*/STRING_MATCHER/(\\b#{role})/(\\b#{role.tr('_', '.')}).*.jar"

  check['command'] = "check-process.rb -p '#{jar_pattern}' -c 1 -W 0 -C 0"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = [role]
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'urgent',
    'notification' => "Multiple #{role} java process is running",
    'occurrences' => 2,
    # path uses a single "/d/" segment, matching every other runbook link in this file
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# healthchecks
# "<role>_healthcheck": runs the custom check-healthcheck.rb plugin against the
# service's admin port, adding `-b <blacklist>` only when a blacklist is set.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']["#{role}_healthcheck"].tap do |check|
  # boolean, consistent with the "<role>_online" check; the recipe only tests truthiness
  check['custom'] = true

  healthcheck_command = "MY_CUSTOM_NAMESPACE/check-healthcheck.rb -p #{port}"
  healthcheck_command += " -b #{blacklist}" unless blacklist.nil?
  check['command'] = healthcheck_command

  check['handlers'] = %w[pagerduty remediator]
  check['subscribers'] = [role]
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'urgent',
    'notification' => "HealthCheck is failing for #{role} service",
    'occurrences' => 2,
    # depends on this node's own "<role>_process" check
    'dependencies' => [
      "client:#{node.name}/#{role}_process"
    ],
    # keyed by the name of the remediation check to trigger
    # NOTE(review): presumably consumed by the 'remediator' handler - confirm
    'remediation' => {
      "#{role}_process_remediate" => {
        'occurrences' => ['2-5'],
        'severities' => [2]
      }
    },
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# consul healthcheck
# "consul_health_<role>": runs check-consul-service-health.rb against the
# consul URL for the dashed form of the role name.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']["consul_health_#{role}"].tap do |check|
  consul_service = role.tr('_', '-')

  check['command'] = "check-consul-service-health.rb -c #{consul_url} -s #{consul_service} --fail-if-not-found"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['sensu_ext_monitor']
  check['interval'] = 5
  check['additional'] = {
    'pager_team' => 'non_urgent',
    'notification' => "Consul health checks are failing for #{role} service",
    'occurrences' => 12,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# heap
# "<role>_heap": runs the custom check-cc-heap-limit.rb plugin against the
# service's admin port with `-w 70`.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']["#{role}_heap"].tap do |check|
  # boolean, consistent with the "<role>_online" check; the recipe only tests truthiness
  check['custom'] = true
  check['command'] = "MY_CUSTOM_NAMESPACE/check-cc-heap-limit.rb -p #{port} -w 70"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = [role]
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'app_non_urgent',
    'notification' => "Java heap usage for #{role} is approaching its limit",
    'occurrences' => 6,
    # depends on this node's process-level checks
    'dependencies' => [
      "client:#{node.name}/no_java_process",
      "client:#{node.name}/#{role}_process"
    ],
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# online
# "<role>_online": runs the custom check-cc-consul-services-online.rb plugin,
# passing the expected healthy instance counts as -c / -w.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']["#{role}_online"].tap do |check|
  thresholds = "-c #{healthy_instances_critical} -w #{healthy_instances_warning}"

  check['custom'] = true
  check['command'] = "MY_CUSTOM_NAMESPACE/check-cc-consul-services-online.rb -u #{consul_url} -s #{service} #{thresholds}"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['sensu_ext_monitor']
  check['interval'] = 60
  check['additional'] = {
    'pager_team' => 'urgent',
    'notification' => "Not enough service instances for #{service}",
    'occurrences' => 8,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
  check['standalone'] = false
end
# "cpu": runs check-cpu.rb for the 'base' subscription, with a hook that
# captures the busiest processes on a non-zero result.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['cpu'].tap do |check|
  check['command'] = 'check-cpu.rb -w 90 -c 100 --sleep 5'
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['base']
  check['interval'] = 15
  check['additional'] = {
    'pager_team' => 'non_urgent',
    'notification' => 'CPU is running hot!',
    'occurrences' => 60,
    # one subdue window applied to all days
    'subdue' => {
      'days' => {
        'all' => [
          { 'begin' => '6PM PST', 'end' => '10AM PST' }
        ]
      }
    },
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>',
    'hooks' => {
      'non-zero' => {
        # show the top 10 processes by highest cpu utilization
        'command' => 'ps aux | sort -nrk 3,3 | head -n 10',
        'timeout' => 10
      }
    }
  }
  check['low_flap_threshold'] = 25
  check['high_flap_threshold'] = 50
  check['handle'] = false
end
# "disk": runs check-disk-usage.rb for the 'base' subscription, with a hook
# that lists the largest directories on a non-zero result.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['disk'].tap do |check|
  check['command'] = 'check-disk-usage.rb -c 85 -w 80 -K 95 -W 80'
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['base']
  check['interval'] = 15
  check['additional'] = {
    'pager_team' => 'non_urgent',
    'notification' => 'Disk is nearing full. Do something!',
    'occurrences' => 2,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>',
    'hooks' => {
      'non-zero' => {
        # show the 5 largest directories going 3 dirs deep, then the same for inodes;
        # ionice is meant to reduce the impact of this disk IO on the applications/databases
        'command' => 'ionice --class 3 du -ah -d3 / 2>/dev/null | sort -h | tail -n 5 && ionice --class 3 du -ah --inodes -d3 / 2>/dev/null | sort -h | tail -n 5',
        # depending on how large your filesystems are you might need a larger value; as disk
        # cache comes into play, timing out the first few times may be OK depending on your philosophy
        'timeout' => 20
      }
    }
  }
end
# "dyn_throttle": runs check-dynamodb-throttle.rb in us-west-2 over the
# comma-joined list of configured DynamoDB tables.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['dyn_throttle'].tap do |check|
  tables = node['MY_CUSTOM_NAMESPACE']['service']['common']['DYNAMODB_TABLES'].join(',')

  check['command'] = "check-dynamodb-throttle.rb -r us-west-2 -t #{tables} --warning-over 5 --critical-over 10"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['sensu_ext_monitor']
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'app_non_urgent',
    'notification' => "AWS Dynamo Throttle limit in #{node.chef_environment} is being approached.",
    'occurrences' => 2,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# "dyn_throughput": runs check-dynamodb-capacity.rb in us-west-2 over the
# comma-joined list of configured DynamoDB tables.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['dyn_throughput'].tap do |check|
  tables = node['MY_CUSTOM_NAMESPACE']['service']['common']['DYNAMODB_TABLES'].join(',')

  check['command'] = "check-dynamodb-capacity.rb -r us-west-2 -t #{tables} --warning-over 80 --critical-over 90"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['sensu_ext_monitor']
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'app_non_urgent',
    'notification' => "AWS Dynamo Read/Write limit in #{node.chef_environment} is being approached.",
    'occurrences' => 2,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# "es_cluster": runs check-es-cluster-status.rb on the 'roundrobin:es_app'
# subscription. The command and 'source' contain :::token::: placeholders.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['es_cluster'].tap do |check|
  check['command'] = 'check-es-cluster-status.rb -h :::address::: -T 30'
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['roundrobin:es_app']
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'urgent',
    'notification' => 'ES cluster health is red.',
    'occurrences' => 2,
    'dependencies' => %w[es_service es_node_status],
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
  check['source'] = ':::es.app.cluster.name:::'
end
# "es_node_metrics": runs metrics-es-node-graphite.rb with a scheme built from
# the node's first role and hostname; output goes to the 'relay' handler.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['es_node_metrics'].tap do |check|
  metric_scheme = "#{node.roles.first}.#{node.hostname}.elasticsearch"

  check['command'] = "metrics-es-node-graphite.rb -s #{metric_scheme} -h #{node.ipaddress}"
  check['handlers'] = ['relay']
  check['subscribers'] = ['es_app']
  check['interval'] = 60
  # as this is a metric gatherer there is no run book
  check['additional'] = { 'type' => 'metric' }
end
# "ping_four8s": runs check-ping.rb against 8.8.8.8 for the 'base'
# subscription, with a hook that pings the default gateway on a non-zero result.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['ping_four8s'].tap do |check|
  check['command'] = 'check-ping.rb -h 8.8.8.8 -T 5'
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['base']
  check['interval'] = 5
  check['additional'] = {
    'pager_team' => 'urgent',
    'notification' => 'cant reach 8.8.8.8',
    'occurrences' => 13,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>',
    'hooks' => {
      'non-zero' => {
        # make sure we can talk to our gateway
        'command' => 'ping -c 1 `route -n | awk \'$1 == "0.0.0.0" { print $2 }\'`',
        'timeout' => 10
      }
    }
  }
end
# "rabbitmq_clusterhealth": runs check-rabbitmq-cluster-health.rb over SSL on
# port 443; the password is left as a :::rabbitmq.password::: placeholder.
rmq_user = 'sensu_monitoring'
rmq_host = node['MY_CUSTOM_NAMESPACE']['sensu']['monitoring']['rabbitmq']['host']

default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['rabbitmq_clusterhealth'].tap do |check|
  check['command'] = "check-rabbitmq-cluster-health.rb -w #{rmq_host} -u #{rmq_user} -p :::rabbitmq.password::: -P 443 --ssl --verify_ssl_off"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['sensu_ext_monitor']
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'urgent',
    'notification' => 'rabbitmq cluster is not healthy!',
    'occurrences' => 4,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# "some_portal_monitoring": runs check-http.rb against the configured portal
# URL with `-q SOME_STRING`.
default['MY_CUSTOM_NAMESPACE']['sensu']['checks']['some_portal_monitoring'].tap do |check|
  portal_url = node['MY_CUSTOM_NAMESPACE']['sensu']['monitoring']['SOME_portal_monitoring']['url']

  check['command'] = "check-http.rb -u #{portal_url} -q SOME_STRING"
  check['handlers'] = ['pagerduty']
  check['subscribers'] = ['MY_SUBSCRIPTION']
  check['interval'] = 30
  check['additional'] = {
    'pager_team' => 'urgent',
    'notification' => 'Some_PORTAL is not reachable',
    'occurrences' => 4,
    'runbook' => 'https://docs.google.com/document/d/<REDACTED>'
  }
end
# Filter 'actions': matches events whose action is `create` or `resolve`, so
# pagerduty only fires for those; this effectively excludes flapping events.
# see: https://docs.sensu.io/sensu-core/1.4/reference/events/#event-actions for documentation on event actions
sensu_filter 'actions' do
  attributes(
    action: 'eval: %w[create resolve].include? value.to_s'
  )
end
# Filter 'dokken': matches events from the client named 'dokken', with negate set.
# NOTE(review): negate presumably makes matching events be dropped - confirm
# against the Sensu filter reference.
sensu_filter 'dokken' do
  negate true
  attributes(
    client: { name: 'dokken' }
  )
end
# One negated filter per listed environment, each named after the environment
# and matching on the client's `environment` attribute.
%w[ci test staging development qa].each do |environment_name|
  sensu_filter environment_name do
    negate true
    attributes(
      client: { environment: environment_name }
    )
  end
end
# Single negated filter matching clients whose environment is qa, staging or
# test; a client with no environment is substituted as 'unknown'.
# NOTE(review): the resource name used to be the bare identifier `env`, which is
# not defined at this scope. The literal 'env' is a best guess at the intended
# name - confirm against the handler `filters` lists that reference it.
sensu_filter 'env' do
  attributes(client: { environment: "eval: ['qa','staging','test'].include? ':::client.environment|unknown:::'" })
  negate true
end
# Filter 'state_change': matches an event on its first occurrence
# (occurrences == 1) or when its action is 'resolve'. Not negated.
sensu_filter 'state_change' do
  attributes(
    occurrences: "eval: value == 1 || ':::action:::' == 'resolve'"
  )
  negate false
end
# Filter 'ten_to_ten_eastern': only send alerts 10am-10pm eastern.
# The expression matches event timestamps whose hour is 10..21, i.e. 10:00 up to
# (but not including) 22:00; an upper bound of 22 would also pass 22:00-22:59.
# NOTE: the expression assigns ENV['TZ'] as a side effect wherever it is evaluated.
sensu_filter 'ten_to_ten_eastern' do
  attributes(timestamp: "eval: ENV['TZ'] = 'America/New_York'; Time.at(value).hour.between?(10,21)")
end
# Declare one sensu_check resource per entry under
# node['MY_CUSTOM_NAMESPACE']['sensu']['checks'].
#
# Keys forwarded from each entry: command (prefixed with the sensu plugins
# directory when 'custom' is set), handlers, subscribers, interval, publish,
# additional, standalone and the two flap thresholds.
# NOTE(review): other per-check keys present in the attributes (e.g. 'handle',
# 'source') are not read by this loop - confirm whether that is intended.
node['MY_CUSTOM_NAMESPACE']['sensu']['checks'].each do |name, check|
  sensu_check name do
    if check['command']
      if check['custom']
        # custom plugins are referenced relative to the sensu plugins directory
        command "#{node['sensu']['directory']}/plugins/#{check['command']}"
      else
        command check['command']
      end
    else
      Chef::Log.error "Unable to find a command for check: #{name}"
    end

    if check['handlers']
      handlers check['handlers']
    else
      Chef::Log.info "Warning there are no handlers defined for check: #{name}"
    end

    if check['subscribers']
      subscribers check['subscribers']
    else
      Chef::Log.error "Unable to find a subscriber for check: #{name}"
    end

    if check['interval']
      interval check['interval']
    else
      Chef::Log.info "No defined interval for check: #{name}, using default of #{node['MY_CUSTOM_NAMESPACE']['sensu']['check_interval']}"
      interval node['MY_CUSTOM_NAMESPACE']['sensu']['check_interval']
    end

    # use unless nil instead of if because its a bool value
    publish check['publish'] unless check['publish'].nil?

    if check['additional']
      additional check['additional']
    else
      Chef::Log.error "No defined addtional for check: #{name}, this includes occurances, notification message, etc."
    end

    # use unless nil instead of if because its a bool value
    standalone check['standalone'] unless check['standalone'].nil?

    # flap thresholds are only applied as a pair, and only when high > low
    low_flap = check['low_flap_threshold']
    high_flap = check['high_flap_threshold']
    if low_flap && high_flap
      if high_flap > low_flap
        low_flap_threshold low_flap
        high_flap_threshold high_flap
      else
        Chef::Log.warn "Ignoring flap thresholds for check: #{name}, high_flap_threshold must be greater than low_flap_threshold"
      end
    end

    only_if do
      # Generate all checks on sensu server, base checks, or checks that are in the intersection of roles and subscribers.
      # Array() turns a missing subscribers/roles list into [] so a nil cannot raise here.
      subscriptions = Array(check['subscribers'])
      sensu_roles = Array(node['MY_CUSTOM_NAMESPACE']['sensu']['roles'])

      node['roles'].include?('sensu_server') ||
        subscriptions.include?('base') ||
        !(subscriptions & sensu_roles).empty?
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment