Clone this repo:
git clone https://gist.github.com/08be6d6e7605a43fe52d1f201c2b47d8.git
cd 08be6d6e7605a43fe52d1f201c2b47d8
Start the docker stack:
docker-compose up -d
And visit http://localhost:8080 to check current alerts.
To simulate a downtime case, you can stop the monitored service (rabbitmq):
$ docker-compose scale rabbitmq=0
Then wait a few seconds and check for a new alert:
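To bring RabbitMQ back and let the alert clear on its own, scale the service up again:
$ docker-compose scale rabbitmq=1
After a few seconds the alert should disappear from the UI at http://localhost:8080.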
This is a small experiment to get familiar with alerting tools for Prometheus. AlertManager allows us to trigger alerts based on Prometheus metric values and deliver them to several destinations, such as Slack, email, PagerDuty, OpsGenie, or custom webhooks.
As shown in the diagram, each of our apps is monitored by Prometheus, which scrapes relevant metrics about the health of each service (such as uptime, CPU usage, number of requests, ...). These metrics can be displayed in a fancy Grafana dashboard, and they are also evaluated against alerting rules; when a rule matches, Prometheus fires an alert and hands it to AlertManager, which decides where to send it. These alerts can trigger email and Slack notifications, and can be concentrated in a single UI to track our incidents in real time.
The first file to look at is docker-compose.yml. It contains a small set of services, matching the previous diagram:
I chose RabbitMQ as the monitored service for no particular reason, other than it being easy to run and easy to monitor with Prometheus.
As we can see in the first two services, Prometheus fetches metrics from an exporter container, in this case called rabbitmq-exporter. This container pulls metrics from RabbitMQ (via the RABBIT_URL variable) and publishes them in a format that Prometheus understands:
# docker-compose.yml
rabbitmq:
  image: rabbitmq:3.7.8-management-alpine
  restart: always

rabbitmq-exporter:
  image: kbudde/rabbitmq-exporter:v0.29.0
  restart: always
  environment:
    RABBIT_URL: "http://rabbitmq:15672"
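If you are curious about what the exporter actually publishes, you can hit its /metrics endpoint from inside the Compose network, for example through the Prometheus container, whose busybox base image includes wget. This is just a quick sanity check, assuming the exporter listens on port 9090 as the scrape config below expects:
$ docker-compose exec prometheus wget -qO- http://rabbitmq-exporter:9090/metrics | grep rabbitmq_up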
Then, we have a prometheus container that reads the metrics exposed by rabbitmq-exporter:
# docker-compose.yml
prometheus:
  image: prom/prometheus:v2.6.0
  restart: always
  volumes:
    - ./prometheus.yml:/etc/prometheus/prometheus.yml
    - ./alerting_rules.yml:/etc/prometheus/alerting_rules.yml
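Once the stack is up, both mounted files can be validated with promtool, which ships inside the Prometheus image. This is an optional sanity check, using the container paths from the volumes above:
$ docker-compose exec prometheus promtool check config /etc/prometheus/prometheus.yml
$ docker-compose exec prometheus promtool check rules /etc/prometheus/alerting_rules.yml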
The link between prometheus and rabbitmq-exporter is configured in the prometheus.yml file, which is mounted into the Prometheus container as a volume:
# prometheus.yml
scrape_configs:
  - job_name: 'rabbitmq-test'
    scrape_interval: 1s
    metrics_path: /metrics
    static_configs:
      - targets: ['rabbitmq-exporter:9090']
This tells Prometheus to fetch metrics from the RabbitMQ exporter every second.
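The scrape job is only part of prometheus.yml. For the alerting pieces described next, the same file also has to load the rules and point Prometheus at AlertManager. That part is not shown in the post, but with the container names used in this stack it would look roughly like this (a sketch of the standard rule_files and alerting sections, not a copy of the repo's file):
# prometheus.yml (remaining sections, sketched)
rule_files:
  - /etc/prometheus/alerting_rules.yml

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']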
The next step is to configure the alerting rules. These are part of the Prometheus configuration: they are defined in the alerting_rules.yml file and mounted as a volume into the Prometheus container:
# alerting_rules.yml
groups:
  - name: alerting_rules
    interval: 1s
    rules:
      - alert: rabbitmqDown
        expr: rabbitmq_up == 0
        for: 10s
        labels:
          severity: critical
        annotations:
          summary: "RabbitMQ is Down"
          description: "RabbitMQ is so dead"
This file defines a rule that checks whether the rabbitmq_up metric (exposed by rabbitmq-exporter) has stayed at 0 for 10 seconds. If so, a new critical alert called rabbitmqDown is fired, carrying the messages defined in the annotations section.
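If you want to watch the rule with your own eyes, one option (not part of the repo's compose file, just a debugging tweak) is to add a ports mapping to the prometheus service and browse http://localhost:9090/alerts, where rabbitmqDown shows up as inactive, pending or firing:
# docker-compose.yml (optional, for debugging only)
prometheus:
  ports:
    - 9090:9090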
Now that we have Prometheus generating alerts, it is time for AlertManager to come in:
# docker-compose.yml
alertmanager:
  image: prom/alertmanager:v0.15.3
  restart: always
  volumes:
    - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
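The official image also bundles amtool, so the configuration we are about to write can be validated from the running container. This is an optional check, using the path from the volume above:
$ docker-compose exec alertmanager amtool check-config /etc/alertmanager/alertmanager.yml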
Using the official AlertManager Docker image, we can quickly configure a container to receive alerts from Prometheus and trigger Slack notifications:
# alertmanager.yml
route:
  # default receiver, required on the root route
  receiver: 'tranque-slack-hook'
  group_by: ['instance', 'severity']
  routes:
    - match:
        alertname: rabbitmqDown
      receiver: 'tranque-slack-hook'

receivers:
  - name: 'tranque-slack-hook'
    slack_configs:
      - api_url: "https://hooks.slack.com/services/your-slack-hook"
        title: "{{ .CommonAnnotations.summary }}"
        title_link: ""
        text: "RabbitMQ server has been down for 10 seconds"
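The api_url above is just a placeholder. If you want to confirm that your real Slack webhook works before wiring it into AlertManager, a plain curl call against it is enough (this uses Slack's standard incoming-webhook API, with the same placeholder URL):
$ curl -X POST -H 'Content-type: application/json' \
    --data '{"text": "Test message from the alerting stack"}' \
    https://hooks.slack.com/services/your-slack-hook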
When the rabbitmqDown alert (defined in the alerting_rules.yml file) fires, AlertManager routes it to the receiver declared as tranque-slack-hook, which triggers a Slack notification. That receiver configuration requires a Slack webhook URL, as well as the title and text of the message to be sent to our Slack channel. The result will look like this:
Finally, we can set up an Unsee container as a real-time alert UI that aggregates all current alerts received by AlertManager:
# docker-compose.yml
unsee:
  image: cloudflare/unsee:latest
  restart: always
  environment:
    ALERTMANAGER_URI: http://alertmanager:9093
  ports:
    - 8080:8080
With Unsee's port 8080 forwarded to our localhost, we can open an interface like this one, which shows all active alerts along with their descriptions and labels.
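Unsee gets this data from AlertManager's HTTP API; if you also publish port 9093 of the alertmanager container (not done in the compose file above), you can query the same alerts directly:
$ curl http://localhost:9093/api/v1/alerts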