Docker Compose file with Prometheus, Alertmanager, Node Exporter and cAdvisor. Sample configs included.
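
Four files follow, separated by --- markers: alertmanager.yml, docker-compose.yml, prometheus.yml and rules.yml. To bring the stack up (a sketch; it assumes alertmanager.yml is saved under ./alertmanager/ and prometheus.yml plus rules.yml under ./prometheus/, matching the volume mounts in the compose file):

    export HOSTNAME       # referenced by --web.external-url in the compose file
    docker-compose up -d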
---
# alertmanager.yml
global:
  resolve_timeout: 3m
  slack_api_url: 'https://hooks.slack.com/services/<TOKEN>'
  smtp_smarthost: 'mail.example.com:25'
  smtp_from: '[email protected]'
  smtp_require_tls: false
  smtp_hello: 'alertmanager-dev'

# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/templates/*.tmpl'

route:
  receiver: default-slack
  group_by: [alertname, job, instance]
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 12h
  routes:
    - receiver: email
      group_by: [alertname, job, instance]
      group_wait: 10s
      group_interval: 5m
      repeat_interval: 12h
      continue: true
      matchers:
        - noc = true
    - receiver: slack
      group_by: [alertname, job, instance]
      group_wait: 10s
      group_interval: 5m
      repeat_interval: 12h
      continue: true
      matchers:
        - severity = critical
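
# With continue: true on both child routes, an alert that matches a route is
# also evaluated against the following sibling routes, so an alert labeled
# both noc=true and severity=critical notifies the email AND slack receivers.
# If no child route matches, the alert falls back to the root receiver
# (default-slack).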
receivers:
  - name: default-slack
    slack_configs:
      - channel: '#prometheus-dev'
        send_resolved: true
  - name: email
    email_configs:
      - to: '[email protected]'
        send_resolved: true
  - name: slack
    slack_configs:
      - channel: '#prometheus-critical'
        send_resolved: true
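
# This file can be validated before (re)starting Alertmanager with amtool
# (a sketch; it assumes the file is saved as ./alertmanager/alertmanager.yml
# and uses the amtool binary bundled in the prom/alertmanager image):
#
#   docker run --rm -v "$(pwd)/alertmanager:/etc/alertmanager" \
#     --entrypoint amtool prom/alertmanager:v0.24.0 \
#     check-config /etc/alertmanager/alertmanager.yml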
---
# docker-compose.yml
services:
  prometheus:
    image: prom/prometheus:v2.34.0
    container_name: prometheus
    command:
      - --web.external-url=http://${HOSTNAME}:9090
      - --web.page-title=Prometheus DEV
      - --web.enable-lifecycle
      - --web.enable-admin-api
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.size=1GB
      - --web.console.libraries=/usr/share/prometheus/console_libraries
      - --web.console.templates=/usr/share/prometheus/consoles
    ports:
      - "9090:9090"
    networks:
      - monitoring
    extra_hosts:
      - "node-exporter:172.28.0.1"
    volumes:
      - prometheus-data:/prometheus
      - ./prometheus:/etc/prometheus
  alertmanager:
    image: prom/alertmanager:v0.24.0
    container_name: alertmanager
    command:
      - --web.external-url=http://${HOSTNAME}:9093
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
    ports:
      - "9093:9093"
    networks:
      - monitoring
    volumes:
      - alertmanager-data:/alertmanager
      - ./alertmanager:/etc/alertmanager
  node-exporter:
    image: prom/node-exporter:v1.3.1
    container_name: node-exporter
    restart: unless-stopped
    command:
      - --path.rootfs=/host
      - --path.procfs=/host/proc
      - --path.sysfs=/host/sys
      - --web.listen-address=0.0.0.0:9100
      - --no-collector.infiniband
      - --no-collector.zfs
      # $$ escapes $ for Compose variable interpolation; YAML single quotes
      # pass backslashes through unchanged, so \d (not \\d) is the correct
      # regex escape here
      - '--collector.diskstats.ignored-devices=^(ram|loop|fd)\d+$$'
      - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+)($$|/)'
      - '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$$'
      # RAPL disabled due to stricter permissions on
      # /sys/class/powercap/intel-rapl:0/energy_uj in newer kernels
      - --no-collector.rapl
    network_mode: host
    pid: host
    volumes:
      - /:/host:ro,rslave
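
    # Note: node-exporter uses network_mode: host, so it is not attached to
    # the "monitoring" bridge network. Prometheus reaches it through the
    # network gateway (172.28.0.1), which the extra_hosts entry in the
    # prometheus service maps to the hostname "node-exporter".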
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.44.0
    container_name: cadvisor
    ports:
      - "8080:8080"
    networks:
      - monitoring
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
volumes:
  prometheus-data:
  alertmanager-data:

networks:
  monitoring:
    ipam:
      config:
        - subnet: 172.28.0.0/16
          gateway: 172.28.0.1
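
# Because --web.enable-lifecycle is set on the prometheus service above,
# config changes can be applied without a container restart (assumes the
# stack is running with port 9090 published as configured):
#
#   curl -X POST http://localhost:9090/-/reload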
---
# prometheus.yml
global:
  # How frequently to scrape targets by default
  scrape_interval: 30s # default = 1m
  # How frequently to evaluate rules
  evaluation_interval: 15s # default = 1m
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: prometheus-dev

# Rules and alerts are read from the specified file(s)
rule_files:
  - rules.yml

# Alerting specifies settings related to the Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager's default port is 9093
            - alertmanager:9093

# A scrape configuration covering the stack's own endpoints
scrape_configs:
  - job_name: prometheus-dev
    static_configs:
      - targets:
          - localhost:9090
  - job_name: alertmanager-dev
    static_configs:
      - targets:
          - alertmanager:9093
  - job_name: node-exporter-dev
    scrape_interval: 10s
    static_configs:
      - targets:
          - node-exporter:9100
  - job_name: cadvisor
    scrape_interval: 10s
    static_configs:
      - targets:
          - cadvisor:8080
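
# This file (including the rule_files it references) can be sanity-checked
# with promtool (a sketch; assumes the files are saved as
# ./prometheus/prometheus.yml and ./prometheus/rules.yml and uses the
# promtool binary bundled in the prom/prometheus image):
#
#   docker run --rm -v "$(pwd)/prometheus:/etc/prometheus" \
#     --entrypoint promtool prom/prometheus:v2.34.0 \
#     check config /etc/prometheus/prometheus.yml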
---
# rules.yml
groups:
  - name: AllInstances
    rules:
      - alert: Watchdog
        expr: vector(1)
        annotations:
          description: >-
            This is an alert meant to ensure that the entire alerting pipeline is functional.
            This alert is always firing, therefore it should always be firing in Alertmanager
            and always fire against a receiver. There are integrations with various notification
            mechanisms that send a notification when this alert is not firing. For example the
            "DeadMansSnitch" integration in PagerDuty.
          runbook_url: TBD
          summary: An alert that should always be firing to certify that Alertmanager is working properly.
          message: Prometheus Watchdog
        labels:
          severity: none
      - alert: InstanceDown
        # Condition for alerting
        expr: up == 0
        for: 1m
        # Annotations - additional informational labels to store more information
        annotations:
          title: 'Instance {{ $labels.instance }} down.'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
          runbook: https://instancedown-runbook-url.example.com
        # Labels - additional labels to be attached to the alert
        labels:
          severity: critical
          # label values must be strings, so quote the boolean-looking value
          noc: 'true'
      - alert: NodeCPUUsageHigh
        expr: >
          (
            sum by (nodename) (
              irate(node_cpu_seconds_total{job="node-exporter-dev", mode!="idle"}[5m])
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            ) * 100
            /
            count by (nodename) (
              irate(node_cpu_seconds_total{job="node-exporter-dev", mode="idle"}[5m])
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            )
          ) > 90
        for: 1m
        annotations:
          title: 'CPU usage high on {{ $labels.nodename }}.'
          description: 'CPU usage has exceeded 90% on {{ $labels.nodename }} for more than 1 minute'
          runbook: https://cpu-runbook-url.example.com
        labels:
          severity: warning
      - alert: NodeLoadAvg5High
        expr: >
          (
            sum by (nodename) (
              node_load5{job="node-exporter-dev"}
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            ) * 100
            /
            count by (nodename) (
              irate(node_cpu_seconds_total{job="node-exporter-dev", mode="idle"}[5m])
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            )
          ) > 90
        for: 1m
        annotations:
          title: 'CPU 5min Load Average high on {{ $labels.nodename }}.'
          description: 'CPU 5min Load Average exceeds 90% of available cores on {{ $labels.nodename }}'
          runbook: https://loadavg-runbook-url.example.com
        labels:
          severity: warning
      - alert: NodeMemUsageHigh
        expr: >
          (
            sum by (nodename) (
              (node_memory_MemTotal_bytes{job="node-exporter-dev"} - node_memory_MemAvailable_bytes{job="node-exporter-dev"})
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            ) * 100
            /
            avg by (nodename) (
              node_memory_MemTotal_bytes{job="node-exporter-dev"}
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            )
          ) > 90
        for: 1m
        annotations:
          title: 'Memory usage high on {{ $labels.nodename }}.'
          description: 'RAM usage has exceeded 90% on {{ $labels.nodename }} for more than 1 minute'
          runbook: https://mem-runbook-url.example.com
        labels:
          severity: critical
          noc: 'true'
      - alert: NodeDiskUsageHigh
        expr: >
          (
            min by (nodename, device) (
              node_filesystem_avail_bytes{job="node-exporter-dev", device!~"rootfs|tmpfs"}
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            ) * 100
          )
          /
          (
            min by (nodename, device) (
              node_filesystem_size_bytes{job="node-exporter-dev", device!~"rootfs|tmpfs"}
              * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}
            )
          ) < 10
        for: 1m
        annotations:
          title: 'Disk usage high on {{ $labels.nodename }}.'
          description: '{{ $labels.device }} on {{ $labels.nodename }} has less than 10% space left'
          runbook: https://disk-runbook-url.example.com
        labels:
          severity: critical
          noc: 'true'
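
# The rules above can be unit-tested with promtool (a sketch; it assumes the
# rules are saved as ./prometheus/rules.yml and the hypothetical test file
# below is saved next to them as ./prometheus/tests.yml):
#
#   docker run --rm -v "$(pwd)/prometheus:/etc/prometheus" \
#     --entrypoint promtool prom/prometheus:v2.34.0 \
#     test rules /etc/prometheus/tests.yml
#
# tests.yml:
#
#   rule_files:
#     - rules.yml
#   evaluation_interval: 1m
#   tests:
#     - interval: 1m
#       # a target that is down for the whole test window
#       input_series:
#         - series: 'up{job="node-exporter-dev", instance="node-exporter:9100"}'
#           values: '0 0 0'
#       alert_rule_test:
#         # after 2m the 1m "for" clause has elapsed, so InstanceDown fires
#         - eval_time: 2m
#           alertname: InstanceDown
#           exp_alerts:
#             - exp_labels:
#                 severity: critical
#                 noc: 'true'
#                 job: node-exporter-dev
#                 instance: node-exporter:9100
#               exp_annotations:
#                 title: 'Instance node-exporter:9100 down.'
#                 description: 'node-exporter:9100 of job node-exporter-dev has been down for more than 1 minute.'
#                 runbook: https://instancedown-runbook-url.example.com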