WebPlatformDocs · January 11, 2020 22:55
diff --git a/00-monit-configuration.md b/00-monit-configuration.md
diff --git a/accounts.conf b/accounts.conf
 # Firefox Accounts (fxa) services. Each "check host" entry below sends an
 # HTTP request for / to a port on localhost and restarts the matching
 # service when that request fails or exceeds the timeout.
 #
 # The "depends on" lines are indented under the "if failed" test, but they
 # are statements of the enclosing check entry, not part of the test.
 #
 # Auth server: port 9000, 10 second timeout.
 check host fxa-auth-server with address "localhost"
  start program = "/usr/sbin/service fxa-auth-server start"
  stop program = "/usr/sbin/service fxa-auth-server stop"
  if failed port 9000 protocol HTTP
    request /
    with timeout 10 seconds
    then restart
    depends on nginx
    depends on fxa-content-server

 # Content server: port 3030, 10 second timeout.
 check host fxa-content-server with address "localhost"
  start program = "/usr/sbin/service fxa-content-server start"
  stop program = "/usr/sbin/service fxa-content-server stop"
  if failed port 3030 protocol HTTP
    request /
    with timeout 10 seconds
    then restart
    depends on nginx

 # OAuth server: port 9010. This is the only fxa check with a 15 second timeout.
 check host fxa-oauth-server with address "localhost"
  start program = "/usr/sbin/service fxa-oauth-server start"
  stop program = "/usr/sbin/service fxa-oauth-server stop"
  if failed port 9010 protocol HTTP
    request /
    with timeout 15 seconds
    then restart
    depends on nginx

 # Profile server: port 8081, 10 second timeout.
 check host fxa-profile-server with address "localhost"
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  if failed port 8081 protocol HTTP
    request /
    with timeout 10 seconds
    then restart
    depends on nginx

 # Script-based check for the profile server: runs profile-check.sh with a
 # 20 second timeout and, on a non-zero exit status, restarts using the
 # fxa-profile-server service commands declared below.
 # The script and its deployment are described in:
 #   salt/fxa/checks.sls
 #   salt/fxa/files/profile-check.sh
 check program fxa-profile-server-check with path "/srv/webplatform/auth/profile-check.sh"
  with timeout 20 seconds
  start program = "/usr/sbin/service fxa-profile-server start"
  stop program = "/usr/sbin/service fxa-profile-server stop"
  if status != 0
    then restart
    depends on fxa-profile-server
diff --git a/apache.conf b/apache.conf
 # ref: http://mmonit.com/wiki/Monit/MonitorApacheStatus
 #
 # Watches the apache2 process through its pidfile. It is restarted when the
 # apache-status thresholds are crossed, when a plain HTTP probe on port 80
 # fails, or when the process has been missing for 3 cycles.
 check process apache2
  with pidfile "/var/run/apache2/apache2.pid"
  group www
  start = "/usr/sbin/service apache2 start"
  stop  = "/usr/sbin/service apache2 stop"
  # Status-page test against 127.0.0.1:80; the three limits are OR-ed together.
  if failed host 127.0.0.1 port 80
    protocol apache-status loglimit > 80% or
                           dnslimit > 25% or
                           waitlimit < 20%
    then restart
  # HTTP probe on the first entry of ip4_interfaces (127.0.0.1 when that entry is undefined).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80
    protocol HTTP then restart
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert
diff --git a/elasticsearch.conf b/elasticsearch.conf
 # Ref: https://github.com/elasticsearch/cookbook-elasticsearch/blob/master/templates/default/elasticsearch.monitrc.conf.erb
 # TODO: improve the alert email; the template referenced above has a good format example.
 #
 # Process-level check: alerts on sustained CPU, memory and load, and
 # restarts the service once the process has been missing for 3 cycles.
 check process elasticsearch
  with pidfile "/var/run/elasticsearch.pid"
  group elasticsearch
  start = "/usr/sbin/service elasticsearch start" with timeout 60 seconds
  stop  = "/usr/sbin/service elasticsearch stop"
  if cpu > 90% for 5 cycles then alert
  if totalmem > 90% for 15 cycles then alert
  if loadavg(15min) greater than 10 for 50 cycles then alert
  if not exist for 3 cycles then restart
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert

 # Alert-only probe of the HTTP root URL. Host is the first entry of
 # ip4_interfaces (default 0.0.0.0), port is elastic_port (default 9200).
 check host elasticsearch_connection with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/
  with timeout 15 seconds
  then alert
  group elasticsearch

 # Alert-only probe of /_cluster/health: the test fails, and an alert is
 # raised, unless the response content matches 'green'.
 check host elasticsearch_cluster_health with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/_cluster/health
    and content == 'green'
    with timeout 60 seconds
    then alert
  group elasticsearch
diff --git a/exim4.conf b/exim4.conf
 # Watches the exim4 mail daemon through its pidfile. It is restarted when
 # the SMTP test on port 25 fails or the process is missing for 3 cycles.
 check process exim4
  with pidfile "/var/run/exim4/exim.pid"
  group mail
  start = "/usr/sbin/service exim4 start"
  stop  = "/usr/sbin/service exim4 stop"
  if failed port 25 protocol SMTP then restart
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
diff --git a/gdnsd.conf b/gdnsd.conf
 # Watches the gdnsd DNS server through its pidfile. It is restarted when
 # the DNS test on port 53 fails or the process is missing for 3 cycles.
 # Unlike the other process checks in this set, no group is assigned here.
 check process gdnsd
  with pidfile "/var/run/gdnsd/gdnsd.pid"
  start = "/usr/sbin/service gdnsd start"
  stop  = "/usr/sbin/service gdnsd stop"
  if failed port 53 protocol DNS then restart
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert

diff --git a/memcached.conf b/memcached.conf
 # Ref: http://www.alphadevx.com/a/392-Monitoring-Memcache-with-Monit
 #
 # Watches memcached through its pidfile. It is restarted when the MEMCACHE
 # protocol test on 127.0.0.1:11211 fails or the process is missing for 3 cycles.
 check process memcached
  with pidfile "/var/run/memcached.pid"
  group keystore
  start = "/usr/sbin/service memcached start"
  stop  = "/usr/sbin/service memcached stop"
  if failed host 127.0.0.1 port 11211 protocol MEMCACHE then restart
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout

diff --git a/monit.conf b/monit.conf
 # Managed by Salt Stack, please DO NOT TOUCH, or ALL CHANGES WILL be LOST!

 #
 # This file should contain only what is common to EVERY node
 #
 #
 # Ref:
 #   - http://mmonit.com/monit/documentation/monit.html
 #



 # System-wide resource thresholds for this node; every rule only alerts.
 check system {{ nodename }}
  if loadavg (1min) > 4 then alert
  if loadavg (5min) > 2 then alert
  if memory usage > 75% then alert
  if swap usage > 25% then alert
  if cpu usage (user) > 70% then alert
  if cpu usage (system) > 30% then alert
  if cpu usage (wait) > 20% then alert

 # Keep the Salt minion running: restart it once it has been missing for
 # 3 cycles. No alert-on-restart rule is defined for this service.
 check process salt-minion
  with pidfile "/var/run/salt-minion.pid"
  group salt
  start = "/usr/sbin/service salt-minion start"
  stop  = "/usr/sbin/service salt-minion stop"
  if not exist for 3 cycles then restart
  if 5 restarts within 5 cycles then timeout

 # Monit's built-in HTTP interface. The port comes from monit_port (default 2812).
 # NOTE(review): the listener is bound to localhost, so the 10.10.10.0/24
 # allow rule can only take effect if the bind address is changed. Confirm
 # which of the two is intended.
 set httpd port {{ monit_port|default(2812) }} and
  use address localhost      # only accept connection from localhost
  allow localhost            # allow localhost to connect to the server
  allow 10.10.10.0/24
  allow admin:{{ monit_pw }} # require user 'admin' with password that is defined in Salt Stack
  allow @monit               # allow users of group 'monit' to connect (rw)
  allow @users readonly      # allow users of group 'users' to connect readonly

 # Outgoing mail: relay through mail.<tld> with the sslauto option.
 set mailserver mail.{{ tld }}
  using sslauto

 # Alerts are sent from monit@<nodename> to hostmaster@<tld>; pid and ppid
 # events are excluded from that recipient.
 set mail-format { from: monit@{{ nodename }} }
 set alert hostmaster@{{ tld }} not on { pid ppid }
diff --git a/mysql.conf b/mysql.conf
 # To list the defaults mysqld starts with, run:
 #
 #       /usr/sbin/mysqld --print-defaults
 #
 # Watches mysqld through its pidfile. It is restarted when the MYSQL
 # protocol test on port 3306 fails or the process is missing for 3 cycles.
 check process mysql
  with pidfile "/var/run/mysqld/mysqld.pid"
  group database
  start = "/usr/sbin/service mysql start"
  stop  = "/usr/sbin/service mysql stop"
  # Host is the first entry of ip4_interfaces (127.0.0.1 when that entry is undefined).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 3306
    protocol MYSQL then restart
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert

diff --git a/nginx.conf b/nginx.conf
 # Watches nginx through its pidfile and restarts it when the HTTP probe on
 # port 80 fails. The process check depends on the two file checks below, so
 # a problem with the binary or the init script is handled there first.
 check process nginx
  with pidfile /var/run/nginx.pid
  group www
  group nginx
  start program = "/usr/sbin/service nginx start"
  stop program =  "/usr/sbin/service nginx stop"
  # Host is the first entry of ip4_interfaces (127.0.0.1 when that entry is undefined).
  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80
    protocol HTTP then restart
  # Spelled "within" to match the restart-limit rules of the other services.
  if 5 restarts within 5 cycles then timeout
  depend nginx_bin
  depend nginx_rc

 # The nginx binary; its tests come from the shared rootbin template
 # (contents not shown here).
 check file nginx_bin with path /usr/sbin/nginx
  group nginx
  include /etc/monit/templates/rootbin

 # The nginx init script; same shared rootbin tests.
 check file nginx_rc with path /etc/init.d/nginx
  group nginx
  include /etc/monit/templates/rootbin
diff --git a/notes-server.conf b/notes-server.conf
 {#
 # Expected variables and values:
 #   - hypothesis_host: '127.0.0.1'
 #   - hypothesis_port: 8000
 #   - elastic_host: '10.10.10.2'
 #   - elastic_port: 9002
 #
 # NOTE(review): the elasticsearch checks default elastic_port to 9200;
 # confirm that 9002 above is intended and not a transposition.
 #}
 # Process check for the notes server (service name "hypothesis"), found by
 # matching the process list against "notes-server". Restarted when its TCP
 # port stops accepting connections within 10 seconds.
 check process hypothesis
  matching "notes-server"
  start program = "/usr/sbin/service hypothesis start"
  stop program =  "/usr/sbin/service hypothesis stop"
  if failed port {{ hypothesis_port }}
    type TCP with timeout 10 seconds
    then restart
  # Spelled "within" to match the restart-limit rules of the other services.
  if 5 restarts within 5 cycles then timeout

 # Alert-only HTTP probe of /_aliases on the remote Elasticsearch node.
 check host elasticsearch-remote with address "{{ elastic_host }}"
  if failed port {{ elastic_port }} protocol HTTP
    request "/_aliases"
    with timeout 10 seconds
    then alert

 # Application-level probe: HTTP request for /ruok on the notes server;
 # the hypothesis service is restarted when it fails.
 check host hypothesis-available with address "{{ hypothesis_host }}"
  start program = "/usr/sbin/service hypothesis start"
  stop program =  "/usr/sbin/service hypothesis stop"
  if failed port {{ hypothesis_port }}
    protocol HTTP request "/ruok" with timeout 10 seconds
    then restart
 
diff --git a/nutcracker.conf b/nutcracker.conf
 #
 # Ref:
 #   - https://github.com/twitter/twemproxy
 #
 # Statistics are served on port 22222. To inspect them:
 #
 #    curl http://localhost:22222 | python -m json.tool
 #
 # The process is found by matching the process list against "nutcracker"
 # (no pidfile). It is restarted when the stats port does not accept a TCP
 # connection within 2 seconds, or when the process is missing for 3 cycles.
 check process nutcracker
  matching "nutcracker"
  group keystore
  start = "/usr/sbin/service nutcracker start"
  stop  = "/usr/sbin/service nutcracker stop"
  if failed host 127.0.0.1 port 22222 type TCP
    with timeout 2 seconds
    then restart
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout

diff --git a/php-fpm.conf b/php-fpm.conf
 # Ref: http://tobias.is/blog/to-boldly-monitor-what-no-one-has-monitored-before/
 #
 # Watches php5-fpm through its pidfile. It is restarted when its TCP port
 # (fpm_port, default 9000) stops accepting connections or the process is
 # missing for 3 cycles. Start is allowed up to 60 seconds.
 check process php5-fpm
  with pidfile "/var/run/php5-fpm.pid"
  group php5-fpm
  start = "/usr/sbin/service php5-fpm start" with timeout 60 seconds
  stop  = "/usr/sbin/service php5-fpm stop"
  # NOTE(review): this line reads ip4_interface (singular), while the other
  # templates in this set read ip4_interfaces[0]. Confirm the singular
  # variable is really passed to this template; if it is not, the check
  # always targets the 0.0.0.0 fallback.
  if failed host {{ ip4_interface|default('0.0.0.0') }} port {{ fpm_port|default(9000) }} type TCP then restart
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 5 restarts within 5 cycles then timeout
  if 3 restarts within 5 cycles then alert

diff --git a/salt-master.conf b/salt-master.conf
 # Keep the Salt master running: restart it once the process has been
 # missing for 3 cycles. No network test is defined for this service.
 check process salt-master
  with pidfile "/var/run/salt-master.pid"
  group salt
  start = "/usr/sbin/service salt-master start"
  stop  = "/usr/sbin/service salt-master stop"
  if not exist for 3 cycles then restart
  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
  if 3 restarts within 5 cycles then alert
  if 5 restarts within 5 cycles then timeout
	# Firefox Accounts (fxa) services: one HTTP probe per service on localhost,
	# restarting the service when the probe fails. Dependencies are listed
	# after the test because they apply to the whole check entry.

	# Auth server, port 9000.
	check host fxa-auth-server with address "localhost"
	  start program = "/usr/sbin/service fxa-auth-server start"
	  stop program = "/usr/sbin/service fxa-auth-server stop"
	  if failed port 9000 protocol HTTP request / with timeout 10 seconds then restart
	  depends on nginx
	  depends on fxa-content-server

	# Content server, port 3030.
	check host fxa-content-server with address "localhost"
	  start program = "/usr/sbin/service fxa-content-server start"
	  stop program = "/usr/sbin/service fxa-content-server stop"
	  if failed port 3030 protocol HTTP request / with timeout 10 seconds then restart
	  depends on nginx

	# OAuth server, port 9010 (15 second timeout).
	check host fxa-oauth-server with address "localhost"
	  start program = "/usr/sbin/service fxa-oauth-server start"
	  stop program = "/usr/sbin/service fxa-oauth-server stop"
	  if failed port 9010 protocol HTTP request / with timeout 15 seconds then restart
	  depends on nginx

	# Profile server, port 8081.
	check host fxa-profile-server with address "localhost"
	  start program = "/usr/sbin/service fxa-profile-server start"
	  stop program = "/usr/sbin/service fxa-profile-server stop"
	  if failed port 8081 protocol HTTP request / with timeout 10 seconds then restart
	  depends on nginx

	# Script-based profile server check; a non-zero exit status triggers a
	# restart through the fxa-profile-server service commands.
	# See salt/fxa/checks.sls and salt/fxa/files/profile-check.sh
	check program fxa-profile-server-check with path "/srv/webplatform/auth/profile-check.sh"
	  with timeout 20 seconds
	  start program = "/usr/sbin/service fxa-profile-server start"
	  stop program = "/usr/sbin/service fxa-profile-server stop"
	  if status != 0 then restart
	  depends on fxa-profile-server
	# ref: http://mmonit.com/wiki/Monit/MonitorApacheStatus
	#
	# Watches apache2 through its pidfile; restarted on status-page threshold
	# violations, HTTP failure on port 80, or a missing process.
	check process apache2
	  with pidfile "/var/run/apache2/apache2.pid"
	  group www
	  start = "/usr/sbin/service apache2 start"
	  stop = "/usr/sbin/service apache2 stop"
	  # Status-page test against 127.0.0.1:80; the three limits are OR-ed together.
	  if failed host 127.0.0.1 port 80
	    protocol apache-status loglimit > 80% or
	                           dnslimit > 25% or
	                           waitlimit < 20%
	    then restart
	  # The template filter needs a bare pipe; the escaped form does not render.
	  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80
	    protocol HTTP then restart
	  if not exist for 3 cycles then restart
	  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
	  if 5 restarts within 5 cycles then timeout
	  if 3 restarts within 5 cycles then alert
	# Ref: https://github.com/elasticsearch/cookbook-elasticsearch/blob/master/templates/default/elasticsearch.monitrc.conf.erb
	# TODO: improve the alert email; the template referenced above has a good format example.
	#
	# Process-level check: alerts on sustained CPU, memory and load, and
	# restarts the service once the process has been missing for 3 cycles.
	check process elasticsearch
	  with pidfile "/var/run/elasticsearch.pid"
	  group elasticsearch
	  start = "/usr/sbin/service elasticsearch start" with timeout 60 seconds
	  stop = "/usr/sbin/service elasticsearch stop"
	  if cpu > 90% for 5 cycles then alert
	  if totalmem > 90% for 15 cycles then alert
	  if loadavg(15min) greater than 10 for 50 cycles then alert
	  if not exist for 3 cycles then restart
	  if 5 restarts within 5 cycles then timeout
	  if 3 restarts within 5 cycles then alert

	# Alert-only probe of the HTTP root URL. The template filters below use a
	# bare pipe; the escaped form does not render.
	check host elasticsearch_connection with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
	  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/
	    with timeout 15 seconds
	    then alert
	  group elasticsearch

	# Alert-only probe of /_cluster/health: an alert is raised unless the
	# response content matches 'green'.
	check host elasticsearch_cluster_health with address {{ ip4_interfaces[0]|default('0.0.0.0') }}
	  if failed url http://{{ ip4_interfaces[0]|default('0.0.0.0') }}:{{ elastic_port|default(9200) }}/_cluster/health
	    and content == 'green'
	    with timeout 60 seconds
	    then alert
	  group elasticsearch
	# exim4 mail daemon: restart on SMTP failure (port 25) or when the process
	# has been missing for 3 cycles.
	check process exim4
	  with pidfile "/var/run/exim4/exim.pid"
	  group mail
	  start = "/usr/sbin/service exim4 start"
	  stop = "/usr/sbin/service exim4 stop"
	  if failed port 25 protocol SMTP then restart
	  if not exist for 3 cycles then restart
	  # Restart limits: alert at 3, "timeout" action at 5 (both within 5 cycles).
	  if 3 restarts within 5 cycles then alert
	  if 5 restarts within 5 cycles then timeout
	# gdnsd DNS server: restart on DNS failure (port 53) or when the process
	# has been missing for 3 cycles.
	check process gdnsd
	  with pidfile "/var/run/gdnsd/gdnsd.pid"
	  start = "/usr/sbin/service gdnsd start"
	  stop = "/usr/sbin/service gdnsd stop"
	  if failed port 53 protocol DNS then restart
	  if not exist for 3 cycles then restart
	  # Restart limits: alert at 3, "timeout" action at 5 (both within 5 cycles).
	  if 5 restarts within 5 cycles then timeout
	  if 3 restarts within 5 cycles then alert
	# Ref: http://www.alphadevx.com/a/392-Monitoring-Memcache-with-Monit
	#
	# memcached: restart when the MEMCACHE protocol test on 127.0.0.1:11211
	# fails or when the process has been missing for 3 cycles.
	check process memcached
	  with pidfile "/var/run/memcached.pid"
	  group keystore
	  start = "/usr/sbin/service memcached start"
	  stop = "/usr/sbin/service memcached stop"
	  if failed host 127.0.0.1 port 11211 protocol MEMCACHE then restart
	  if not exist for 3 cycles then restart
	  # Restart limits: alert at 3, "timeout" action at 5 (both within 5 cycles).
	  if 3 restarts within 5 cycles then alert
	  if 5 restarts within 5 cycles then timeout
	# Managed by Salt Stack, please DO NOT TOUCH, or ALL CHANGES WILL be LOST!

	#
	# This file should contain only what is common to EVERY node
	#
	#
	# Ref:
	# - http://mmonit.com/monit/documentation/monit.html
	#



	# System-wide resource thresholds for this node; every rule only alerts.
	check system {{ nodename }}
	  if loadavg (1min) > 4 then alert
	  if loadavg (5min) > 2 then alert
	  if memory usage > 75% then alert
	  if swap usage > 25% then alert
	  if cpu usage (user) > 70% then alert
	  if cpu usage (system) > 30% then alert
	  if cpu usage (wait) > 20% then alert

	# Keep the Salt minion running: restart it once it has been missing for 3 cycles.
	check process salt-minion
	  with pidfile "/var/run/salt-minion.pid"
	  group salt
	  start = "/usr/sbin/service salt-minion start"
	  stop = "/usr/sbin/service salt-minion stop"
	  if not exist for 3 cycles then restart
	  if 5 restarts within 5 cycles then timeout

	# Monit's built-in HTTP interface. The port filter uses a bare pipe; the
	# escaped form does not render.
	# NOTE(review): the listener is bound to localhost, so the 10.10.10.0/24
	# allow rule can only take effect if the bind address is changed.
	set httpd port {{ monit_port|default(2812) }} and
	  use address localhost      # only accept connection from localhost
	  allow localhost            # allow localhost to connect to the server
	  allow 10.10.10.0/24
	  allow admin:{{ monit_pw }} # require user 'admin' with password that is defined in Salt Stack
	  allow @monit               # allow users of group 'monit' to connect (rw)
	  allow @users readonly      # allow users of group 'users' to connect readonly

	# Outgoing mail: relay through mail.<tld> with the sslauto option.
	set mailserver mail.{{ tld }}
	  using sslauto

	# Alerts are sent from monit@<nodename> to hostmaster@<tld>; pid and ppid
	# events are excluded from that recipient.
	set mail-format { from: monit@{{ nodename }} }
	set alert hostmaster@{{ tld }} not on { pid ppid }
	# To list the defaults mysqld starts with, run:
	#
	#       /usr/sbin/mysqld --print-defaults
	#
	# Watches mysqld through its pidfile; restarted when the MYSQL protocol
	# test on port 3306 fails or the process is missing for 3 cycles.
	check process mysql
	  with pidfile "/var/run/mysqld/mysqld.pid"
	  group database
	  start = "/usr/sbin/service mysql start"
	  stop = "/usr/sbin/service mysql stop"
	  # The template filter needs a bare pipe; the escaped form does not render.
	  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 3306
	    protocol MYSQL then restart
	  if not exist for 3 cycles then restart
	  # Restart limits: alert at 3 restarts within 5 cycles, "timeout" action at 5.
	  if 5 restarts within 5 cycles then timeout
	  if 3 restarts within 5 cycles then alert
	# Watches nginx through its pidfile and restarts it when the HTTP probe on
	# port 80 fails. Depends on the two file checks declared below.
	check process nginx
	  with pidfile /var/run/nginx.pid
	  group www
	  group nginx
	  start program = "/usr/sbin/service nginx start"
	  stop program = "/usr/sbin/service nginx stop"
	  # The template filter needs a bare pipe; the escaped form does not render.
	  if failed host {{ ip4_interfaces[0]|default('127.0.0.1') }} port 80
	    protocol HTTP then restart
	  # Spelled "within" to match the restart-limit rules of the other services.
	  if 5 restarts within 5 cycles then timeout
	  depend nginx_bin
	  depend nginx_rc

	# The nginx binary; its tests come from the shared rootbin template
	# (contents not shown here).
	check file nginx_bin with path /usr/sbin/nginx
	  group nginx
	  include /etc/monit/templates/rootbin

	# The nginx init script; same shared rootbin tests.
	check file nginx_rc with path /etc/init.d/nginx
	  group nginx
	  include /etc/monit/templates/rootbin
	{#
	# Expected variables and values:
	#   - hypothesis_host: '127.0.0.1'
	#   - hypothesis_port: 8000
	#   - elastic_host: '10.10.10.2'
	#   - elastic_port: 9002
	#
	# NOTE(review): the elasticsearch checks default elastic_port to 9200;
	# confirm that 9002 above is intended and not a transposition.
	#}
	# Process check for the notes server (service name "hypothesis"), found by
	# matching the process list against "notes-server". Restarted when its TCP
	# port stops accepting connections within 10 seconds.
	check process hypothesis
	  matching "notes-server"
	  start program = "/usr/sbin/service hypothesis start"
	  stop program = "/usr/sbin/service hypothesis stop"
	  if failed port {{ hypothesis_port }}
	    type TCP with timeout 10 seconds
	    then restart
	  # Spelled "within" to match the restart-limit rules of the other services.
	  if 5 restarts within 5 cycles then timeout

	# Alert-only HTTP probe of /_aliases on the remote Elasticsearch node.
	check host elasticsearch-remote with address "{{ elastic_host }}"
	  if failed port {{ elastic_port }} protocol HTTP
	    request "/_aliases"
	    with timeout 10 seconds
	    then alert

	# Application-level probe: HTTP request for /ruok on the notes server;
	# the hypothesis service is restarted when it fails.
	check host hypothesis-available with address "{{ hypothesis_host }}"
	  start program = "/usr/sbin/service hypothesis start"
	  stop program = "/usr/sbin/service hypothesis stop"
	  if failed port {{ hypothesis_port }}
	    protocol HTTP request "/ruok" with timeout 10 seconds
	    then restart