Skip to content

Instantly share code, notes, and snippets.

@theinhumaneme
Last active December 13, 2024 15:15
Show Gist options
  • Save theinhumaneme/ac5264620086b8c5722e13230ef5f808 to your computer and use it in GitHub Desktop.
Save theinhumaneme/ac5264620086b8c5722e13230ef5f808 to your computer and use it in GitHub Desktop.
NODE-MONITORING
# Docker Compose stack: Alertmanager plus the calert webhook receiver.
# Both services join the shared `monitoring` network.
services:
  alertmanager:
    container_name: alertmanager
    image: prom/alertmanager:v0.27.0
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - '9093:9093'
    volumes:
      # Alertmanager configuration, bind-mounted from the project directory.
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml

  calert:
    container_name: calert
    image: ghcr.io/mr-karan/calert:latest
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - '6000:6000'
    volumes:
      # NOTE(review): the config is mounted over /app/config.sample.toml;
      # presumably that is the file calert loads by default - confirm.
      - ./ca-alert-config.toml:/app/config.sample.toml
      # Custom message template.
      - ./ca-alert-message.tmpl:/etc/calert/message.tmpl

networks:
  # Declared external: Compose expects this network to exist already.
  monitoring:
    external: true
# alertmanager.yml - routes every alert to the single `alerts` receiver,
# which posts to the calert webhook.
global:
  resolve_timeout: 1m

route:
  receiver: alerts
  group_by: ['alertname']  # Group alerts by the 'alertname' label
  group_wait: 0s           # Send alerts immediately without waiting
  group_interval: 2m       # Wait time between groups of alerts
  repeat_interval: 2m      # Repeat notifications every 2 minutes if the alert is still firing

receivers:
  - name: 'alerts'
    webhook_configs:
      # `calert` is the Compose service name; port 6000 is its HTTP listener.
      - url: 'http://calert:6000/dispatch'

Nodes Being Monitored

Each Prometheus instance remote-writes to its own Mimir instance. All Prometheus instances send alerts to the common Alertmanager.

monitoring excalidraw

Reverse Proxy for all mimir instances

path-mimir excalidraw

# calert configuration: HTTP server settings plus one chat provider.

[app]
# Address of the HTTP Server.
address = "0.0.0.0:6000"
# Server timeout for HTTP requests.
server_timeout = "60s"
# Whether to log incoming HTTP requests or not.
enable_request_logs = true
# Use `debug` to enable verbose logging. Can be set to `info` otherwise.
log = "info"

# The table name `alerts` is the same as the Alertmanager receiver name in this
# setup; presumably calert uses it to pick the provider - confirm.
[providers.alerts]
# Type of provider. Currently supported value is `google_chat`.
type = "google_chat"
# Google Chat Webhook URL
endpoint = "..."
# Max idle connections in the HTTP Client.
max_idle_conns = 50
# Timeout for making requests to Provider.
timeout = "30s"
# Specify `proxy_url` as your proxy endpoint to route all HTTP requests to the provider via a proxy.
# proxy_url = "http://internal-squid-proxy.com:3128"
# Path to specify the message template path.
# NOTE(review): this is a relative path, while the Compose file mounts the
# custom template at /etc/calert/message.tmpl - confirm which template is
# actually loaded.
template = "static/message.tmpl"
# Timeout to keep active alerts in memory. Once this TTL expires, a new thread will be created.
thread_ttl = "2h"
# Whether to send threaded replies or not.
#threaded_replies = true
dry_run = false
{{- /*
  Chat message for a single alert: a bold severity line, a bold
  "alertname - STATUS" line, then every annotation value on its own line.
  Annotations is a key/value map, so it is iterated through SortedPairs to
  get entries that expose .Name and .Value.
*/ -}}
*{{ .Labels.severity | toUpper }}*
*{{ .Labels.alertname }} - {{ .Status | toUpper }}*
{{ range .Annotations.SortedPairs -}}
{{ .Value }}
{{ end -}}
# Host alert rules evaluated directly against node-exporter metrics.
# Every rule must hold for 2 minutes before firing and is labelled critical.
groups:
  - name: host_alerts
    rules:
      # CPU: 100 minus the idle share, averaged per instance over 5 minutes.
      - alert: High Host CPU Usage
        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100 > 40
        for: 2m
        labels:
          severity: critical
        annotations:
          # summary: "High CPU usage on host {{ $labels.instance }}"
          description: "*Host* - {{ $labels.instance }}\nHost CPU usage has exceeded 40% for the last 2 minutes."

      # Memory: available / total falls below 20%.
      - alert: Low Host Memory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.2
        for: 2m
        labels:
          severity: critical
        annotations:
          # summary: "Low memory on host {{ $labels.instance }}"
          description: "*Host* - {{ $labels.instance }}\nAvailable memory is below 20% on the host."

      # Disk: used / size above 80%, tmpfs filesystems excluded.
      - alert: High Host Disk Usage
        expr: >-
          (node_filesystem_size_bytes{fstype!="tmpfs"}
          - node_filesystem_avail_bytes{fstype!="tmpfs"})
          / node_filesystem_size_bytes{fstype!="tmpfs"} > 0.8
        for: 2m
        labels:
          severity: critical
        annotations:
          # summary: "High disk usage on host {{ $labels.instance }}"
          description: "*Host* - {{ $labels.instance }}\nDisk usage on {{ $labels.mountpoint }} has exceeded 80%."
# Recording rules that precompute per-container series.
groups:
  - name: container_recording_rules
    rules:
      # Per-second CPU usage over a 5-minute window, for series with a
      # non-empty `name` label.
      - record: 'container:cpu_usage:rate'
        expr: 'rate(container_cpu_usage_seconds_total{name!=""}[5m])'

      # Working-set memory, for series with a non-empty `image` label.
      - record: 'container:memory_working_set'
        expr: 'container_memory_working_set_bytes{image!=""}'

      # Seconds since the newest `container_last_seen` sample, aggregated
      # with the `id` label dropped.
      - record: 'container:last_seen:time_diff'
        expr: 'time() - max(container_last_seen) without (id)'

      # Per-second rate of `container_restart_count` over 5 minutes.
      - record: 'container:restart_rate'
        expr: 'rate(container_restart_count[5m])'
# Docker Compose stack for a monitored node: Prometheus and the two exporters
# it scrapes (node-exporter and cAdvisor), all on the `monitoring` network.
services:
  prometheus:
    # uid:gid 0:0, i.e. root; quoted so it is always parsed as a string.
    # Presumably needed to write to the bind-mounted ./temp-prometheus - confirm.
    user: "0:0"
    container_name: prometheus
    image: prom/prometheus:v3.0.1
    restart: always
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Local TSDB retention is only 6 hours.
      - '--storage.tsdb.retention.time=6h'
      # NOTE(review): confirm this image version still ships the console
      # directories referenced by the next two flags.
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'
    volumes:
      - ./temp-prometheus:/prometheus
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./host_recording_rules.yml:/etc/prometheus/host_recording_rules.yml
      - ./container_recording_rules.yml:/etc/prometheus/container_recording_rules.yml
      - ./host_alert_rules.yml:/etc/prometheus/host_alert_rules.yml
      - ./container_alert_rules.yml:/etc/prometheus/container_alert_rules.yml
    depends_on:
      - node-exporter
      - cadvisor
    ports:
      - "9090:9090"
    networks:
      - monitoring

  node-exporter:
    image: prom/node-exporter:v1.8.2
    container_name: node-exporter
    restart: unless-stopped
    volumes:
      # Host filesystems are mounted read-only.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # `$$` is Compose escaping for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    # Reachable only by other containers on the network; not published on the host.
    expose:
      - "9100"
    networks:
      - monitoring

  cadvisor:
    container_name: cadvisor
    image: gcr.io/cadvisor/cadvisor:v0.49.1
    # Restart policy added for consistency with the other exporter, so
    # container metrics come back after a daemon or host restart.
    restart: unless-stopped
    expose:
      - "8080"
    volumes:
      # Read-only mounts, as in the upstream cAdvisor run instructions: the
      # container is privileged and only needs to read these host paths.
      - "/:/rootfs:ro"
      - "/var/run:/var/run:ro"
      - "/sys:/sys:ro"
      - "/var/lib/docker/:/var/lib/docker:ro"
      - "/dev/disk/:/dev/disk:ro"
    privileged: true
    command:
      - "--docker_only=true"
      - "--allow_dynamic_housekeeping=false"
      - "--global_housekeeping_interval=5s"
      - "--housekeeping_interval=5s"
    devices:
      - "/dev/kmsg"
    networks:
      - monitoring

networks:
  # Declared external: Compose expects this network to exist already.
  monitoring:
    external: true
# Host alert rules built on the precomputed `host:*` recorded series.
# Every rule must hold for 2 minutes before firing and is labelled critical.
groups:
  - name: host_alerts
    rules:
      # CPU usage (percent) above 40.
      - alert: High Host CPU Usage
        expr: 'host:cpu_usage:avg > 40'
        for: 2m
        labels:
          severity: critical
        annotations:
          description: "*Host* - {{ $labels.instance }}\nHost CPU usage has exceeded 40% for the last 2 minutes."

      # Available-memory ratio below 0.2.
      - alert: Low Host Memory
        expr: 'host:memory_available:ratio < 0.2'
        for: 2m
        labels:
          severity: critical
        annotations:
          description: "*Host* - {{ $labels.instance }}\nAvailable memory is below 20% on the host."

      # Disk-usage ratio above 0.8.
      - alert: High Host Disk Usage
        expr: 'host:disk_usage:ratio > 0.8'
        for: 2m
        labels:
          severity: critical
        annotations:
          description: "*Host* - {{ $labels.instance }}\nDisk usage on {{ $labels.mountpoint }} has exceeded 80%."
# Recording rules that precompute the host-level series used for alerting.
groups:
  - name: host_recording_rules
    rules:
      # CPU usage in percent: 100 minus the idle share, averaged per
      # instance over a 5-minute window.
      - record: 'host:cpu_usage:avg'
        expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) * 100

      # Available memory as a fraction of total memory.
      - record: 'host:memory_available:ratio'
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes

      # Used disk space as a fraction of filesystem size, tmpfs excluded.
      - record: 'host:disk_usage:ratio'
        expr: >-
          (node_filesystem_size_bytes{fstype!="tmpfs"}
          - node_filesystem_avail_bytes{fstype!="tmpfs"})
          / node_filesystem_size_bytes{fstype!="tmpfs"}
# Mimir configuration: multitenancy disabled, blocks stored in an
# S3-compatible bucket, and memberlist-backed rings with replication factor 1.
multitenancy_enabled: false

api:
  # Empty prefix for the Prometheus-compatible HTTP API.
  prometheus_http_prefix: ''

blocks_storage:
  backend: s3
  s3:
    endpoint: ...
    bucket_name: ...
    access_key_id: ...
    secret_access_key: ...
    # NOTE(review): presumably selects plain HTTP to the S3 endpoint - confirm.
    insecure: true

limits:
  max_label_names_per_series: 100
  # Enable TSDB block upload
  compactor_block_upload_enabled: true

compactor:
  data_dir: /tmp/mimir/compactor
  sharding_ring:
    kvstore:
      store: memberlist

distributor:
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: memberlist

ingester:
  ring:
    instance_addr: 127.0.0.1
    kvstore:
      store: memberlist
    replication_factor: 1

server:
  http_listen_port: 9009

store_gateway:
  sharding_ring:
    replication_factor: 1
# Docker Compose stack: a single Mimir container on the `monitoring` network.
services:
  mimir:
    container_name: mimir
    image: grafana/mimir:2.13.1
    restart: unless-stopped
    command:
      - '-config.file=/etc/mimir/mimir-config.yaml'
    volumes:
      # The mount target matches the -config.file path above.
      - ./mimir-config.yaml:/etc/mimir/mimir-config.yaml
    ports:
      - '9009:9009'  # API and query endpoint
    networks:
      - monitoring

networks:
  # Declared external: Compose expects this network to exist already.
  monitoring:
    external: true
# Docker Compose stack: MinIO object storage on the `monitoring` network.
services:
  minio:
    image: quay.io/minio/minio:RELEASE.2024-11-07T00-52-20Z
    container_name: minio
    # Restart policy added for consistency with the other long-running
    # services in this setup, which all declare one.
    restart: unless-stopped
    environment:
      # Placeholders; supply real credentials outside version control.
      MINIO_ROOT_USER: ...
      MINIO_ROOT_PASSWORD: ...
      MINIO_BROWSER_REDIRECT_URL: https://...
    # Serve /data and bind the web console to port 9001.
    command: server /data --console-address ":9001"
    ports:
      - "9000:9000"  # MinIO API
      - "9001:9001"  # MinIO Console
    volumes:
      # Object data is kept in the project directory on the host.
      - ./minio/data:/data
    networks:
      - monitoring

networks:
  # Declared external: Compose expects this network to exist already.
  monitoring:
    external: true
# prometheus.yml - scrapes the local exporters, loads the rule files, sends
# alerts to Alertmanager and remote-writes samples to a push endpoint.
global:
  scrape_interval: 10s
  evaluation_interval: 10s

scrape_configs:
  # Host metrics from node-exporter, scraped every 5s (overrides the global 10s).
  - job_name: node
    scrape_interval: 5s
    static_configs:
      - targets:
          - 'node-exporter:9100'
        labels:
          # Sets the `instance` label to a fixed, human-readable name.
          instance: 'One O One - Kalyan Mudumby'

  # Container metrics from cAdvisor, scraped every 5s.
  - job_name: cadvisor
    scrape_interval: 5s
    static_configs:
      - targets:
          - 'cadvisor:8080'

# Rule files, given as relative paths.
rule_files:
  - host_recording_rules.yml
  - container_recording_rules.yml
  - host_alert_rules.yml
  - container_alert_rules.yml

alerting:
  alertmanagers:
    - scheme: https
      static_configs:
        - targets:
            - ...

remote_write:
  - url: 'https://.../api/v1/push'
    headers:
      X-Scope-OrgID: 'anonymous'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment