Docker Compose file with Prometheus, Alertmanager, Node Exporter and cAdvisor. Sample configs included.
alertmanager.yml
---
global:
  resolve_timeout: 3m
  slack_api_url: 'https://hooks.slack.com/services/<TOKEN>'
  smtp_smarthost: 'mail.example.com:25'
  smtp_from: 'alertmanager@example.com'  # placeholder address
  smtp_require_tls: false
  smtp_hello: 'alertmanager-dev'
# Files from which custom notification templates are read.
templates:
  - '/etc/alertmanager/templates/*.tmpl'
route:
  receiver: default-slack
  group_by: [alertname, job, instance]
  group_wait: 10s
  group_interval: 5m
  repeat_interval: 12h
  routes:
    # Alerts labelled noc=true also go to the NOC mailbox
    - receiver: email
      group_by: [alertname, job, instance]
      group_wait: 10s
      group_interval: 5m
      repeat_interval: 12h
      continue: true
      matchers:
        - noc = true
    # Critical alerts also go to the dedicated Slack channel
    - receiver: slack
      group_by: [alertname, job, instance]
      group_wait: 10s
      group_interval: 5m
      repeat_interval: 12h
      continue: true
      matchers:
        - severity = critical
receivers:
  - name: default-slack
    slack_configs:
      - channel: '#prometheus-dev'
        send_resolved: true
  - name: email
    email_configs:
      - to: 'noc@example.com'  # placeholder address
        send_resolved: true
  - name: slack
    slack_configs:
      - channel: '#prometheus-critical'
        send_resolved: true
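To sanity-check the routing tree before wiring it into the stack, the amtool binary shipped in the prom/alertmanager image can parse the file and show which receivers a given label set would hit. A minimal sketch, assuming the file is saved as ./alertmanager/alertmanager.yml:

# Validate the syntax and print the parsed routing tree
amtool check-config alertmanager/alertmanager.yml
amtool config routes show --config.file=alertmanager/alertmanager.yml

# Which receivers would a critical NOC alert reach?
amtool config routes test --config.file=alertmanager/alertmanager.yml severity=critical noc=true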
docker-compose.yml
services:
  prometheus:
    image: prom/prometheus:v2.34.0
    container_name: prometheus
    command:
      - --web.external-url=http://${HOSTNAME}:9090
      - --web.page-title=Prometheus DEV
      - --web.enable-lifecycle
      - --web.enable-admin-api
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.size=1GB
      - --web.console.libraries=/usr/share/prometheus/console_libraries
      - --web.console.templates=/usr/share/prometheus/consoles
    ports:
      - "9090:9090"
    networks:
      - monitoring
    # node-exporter uses host networking, so Prometheus reaches it via the bridge gateway IP
    extra_hosts:
      - "node-exporter:172.28.0.1"
    volumes:
      - prometheus-data:/prometheus
      - ./prometheus:/etc/prometheus
  alertmanager:
    image: prom/alertmanager:v0.24.0
    container_name: alertmanager
    command:
      - --web.external-url=http://${HOSTNAME}:9093
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
    ports:
      - "9093:9093"
    networks:
      - monitoring
    volumes:
      - alertmanager-data:/alertmanager
      - ./alertmanager:/etc/alertmanager
  node-exporter:
    image: prom/node-exporter:v1.3.1
    container_name: node-exporter
    restart: unless-stopped
    command:
      - --path.rootfs=/host
      - --path.procfs=/host/proc
      - --path.sysfs=/host/sys
      - --web.listen-address=0.0.0.0:9100
      - --no-collector.infiniband
      - --no-collector.zfs
      - '--collector.diskstats.ignored-devices=^(ram|loop|fd)\d+$$'
      - '--collector.filesystem.mount-points-exclude=^/(dev|proc|sys|var/lib/docker/.+)($$|/)'
      - '--collector.filesystem.fs-types-exclude=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$$'
      # RAPL collector disabled due to stricter permissions on `/sys/class/powercap/intel-rapl:0/energy_uj` in newer kernels
      - --no-collector.rapl
    network_mode: host
    pid: host
    volumes:
      - /:/host:ro,rslave
  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.44.0
    container_name: cadvisor
    ports:
      - 8080:8080
    networks:
      - monitoring
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
volumes:
  prometheus-data:
  alertmanager-data:
networks:
  monitoring:
    ipam:
      config:
        - subnet: 172.28.0.0/16
          gateway: 172.28.0.1
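A typical bring-up, assuming the two bind-mounted directories ./prometheus and ./alertmanager contain the configs from this gist. ${HOSTNAME} is a shell variable that Compose does not always see, so the sketch below sets it explicitly; the reload endpoints work because of --web.enable-lifecycle (Prometheus) and are served by Alertmanager out of the box:

# Start the stack (HOSTNAME feeds the --web.external-url flags)
HOSTNAME=$(hostname) docker compose up -d

# Apply config changes without restarting the containers
curl -X POST http://localhost:9090/-/reload   # Prometheus
curl -X POST http://localhost:9093/-/reload   # Alertmanager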
prometheus.yml
---
global:
  # How frequently to scrape targets by default
  scrape_interval: 30s # default = 1m
  # How frequently to evaluate rules
  evaluation_interval: 15s # default = 1m
  # Attach these labels to any time series or alerts when communicating with
  # external systems (federation, remote storage, Alertmanager).
  external_labels:
    monitor: prometheus-dev
# Rules and alerts are read from the specified file(s)
rule_files:
  - rules.yml
# Alerting specifies settings related to the Alertmanager
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager's default port is 9093
            - alertmanager:9093
# Scrape configurations for the stack's own components:
scrape_configs:
  - job_name: prometheus-dev
    static_configs:
      - targets:
          - localhost:9090
  - job_name: alertmanager-dev
    static_configs:
      - targets:
          - alertmanager:9093
  - job_name: node-exporter-dev
    scrape_interval: 10s
    static_configs:
      - targets:
          - node-exporter:9100
  - job_name: cadvisor
    scrape_interval: 10s
    static_configs:
      - targets:
          - cadvisor:8080
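Before reloading, the whole configuration can be validated offline with promtool from the Prometheus image; `check config` also follows rule_files and validates rules.yml, which it resolves relative to the config file. Assuming both files sit in ./prometheus/:

promtool check config prometheus/prometheus.yml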
rules.yml
---
groups:
  - name: AllInstances
    rules:
      - alert: Watchdog
        expr: vector(1)
        annotations:
          description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
            This alert is always firing, therefore it should always be firing in Alertmanager
            and always fire against a receiver. There are integrations with various notification
            mechanisms that send a notification when this alert is not firing, for example the
            "DeadMansSnitch" integration in PagerDuty.'
          runbook_url: TBD
          summary: An alert that should always be firing to certify that Alertmanager is working properly.
          message: Prometheus Watchdog
        labels:
          severity: none
      - alert: InstanceDown
        # Condition for alerting
        expr: up == 0
        for: 1m
        # Annotations - additional informational fields to store more information
        annotations:
          title: 'Instance {{ $labels.instance }} down.'
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
          runbook: https://instancedown-runbook-url.example.com
        # Labels - additional labels to be attached to the alert.
        # Label values must be strings, hence the quoted "true".
        labels:
          severity: critical
          noc: "true"
      - alert: NodeCPUUsageHigh
        expr: (sum by (nodename) (irate(node_cpu_seconds_total{job="node-exporter-dev", mode!="idle"}[5m]) * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}) * 100 / count by (nodename) (irate(node_cpu_seconds_total{job="node-exporter-dev", mode="idle"}[5m]) * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"})) > 90
        for: 1m
        annotations:
          title: 'CPU usage high on {{ $labels.nodename }}.'
          description: 'CPU usage (5m rate) has exceeded 90% on {{ $labels.nodename }} for more than 1 minute'
          runbook: https://cpu-runbook-url.example.com
        labels:
          severity: warning
      - alert: NodeLoadAvg5High
        expr: (sum by (nodename) (node_load5{job="node-exporter-dev"} * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}) * 100 / count by (nodename) (irate(node_cpu_seconds_total{job="node-exporter-dev", mode="idle"}[5m]) * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"})) > 90
        for: 1m
        annotations:
          title: 'CPU 5min Load Average high on {{ $labels.nodename }}.'
          description: 'CPU 5min Load Average exceeds 90% of available cores on {{ $labels.nodename }}'
          runbook: https://loadavg-runbook-url.example.com
        labels:
          severity: warning
      - alert: NodeMemUsageHigh
        expr: (sum by (nodename) ((node_memory_MemTotal_bytes{job="node-exporter-dev"} - node_memory_MemAvailable_bytes{job="node-exporter-dev"}) * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}) * 100 / avg by (nodename) (node_memory_MemTotal_bytes{job="node-exporter-dev"} * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"})) > 90
        for: 1m
        annotations:
          title: 'Memory usage high on {{ $labels.nodename }}.'
          description: 'RAM usage has exceeded 90% on {{ $labels.nodename }} for more than 1 minute'
          runbook: https://mem-runbook-url.example.com
        labels:
          severity: critical
          noc: "true"
      - alert: NodeDiskUsageHigh
        expr: (min by (nodename,device) (node_filesystem_avail_bytes{job="node-exporter-dev",device!~"rootfs|tmpfs"} * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"}) * 100) / (min by (nodename,device) (node_filesystem_size_bytes{job="node-exporter-dev",device!~"rootfs|tmpfs"} * on(instance) group_left(nodename) node_uname_info{job="node-exporter-dev"})) < 10
        for: 1m
        annotations:
          title: 'Disk usage high on {{ $labels.nodename }}.'
          description: '{{ $labels.device }} on {{ $labels.nodename }} has less than 10% space left'
          runbook: https://disk-runbook-url.example.com
        labels:
          severity: critical
          noc: "true"
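The InstanceDown rule can be exercised without a live target via promtool's rule unit tests (promtool test rules). Everything below is a hypothetical sketch: the file name rules_test.yml and the input series are assumptions, not part of this gist. Run it with `promtool test rules rules_test.yml` from the directory containing rules.yml:

# rules_test.yml - hypothetical unit test for the InstanceDown alert
rule_files:
  - rules.yml
evaluation_interval: 15s
tests:
  - interval: 1m
    input_series:
      # the node-exporter target reports down for three consecutive scrapes
      - series: 'up{job="node-exporter-dev", instance="node-exporter:9100"}'
        values: '0 0 0'
    alert_rule_test:
      # after the 1m pending period the alert should be firing at t=2m
      - eval_time: 2m
        alertname: InstanceDown
        exp_alerts:
          - exp_labels:
              severity: critical
              noc: "true"
              job: node-exporter-dev
              instance: node-exporter:9100
            exp_annotations:
              title: 'Instance node-exporter:9100 down.'
              description: 'node-exporter:9100 of job node-exporter-dev has been down for more than 1 minute.'
              runbook: https://instancedown-runbook-url.example.com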