Last active
          November 4, 2025 04:29 
        
      - 
      
 - 
        
Save krisek/62a98e2645af5dce169a7b506e999cd8 to your computer and use it in GitHub Desktop.  
    Prometheus alert rules for node exporter
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | groups: | |
| - name: node_exporter_alerts | |
| rules: | |
| - alert: Node down | |
| expr: up{job="monitoring-pi"} == 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| title: Node {{ $labels.instance }} is down | |
| description: Failed to scrape {{ $labels.job }} on {{ $labels.instance }} for more than 2 minutes. Node seems down. | |
| - alert: HostOutOfMemory | |
| expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host out of memory (instance {{ $labels.instance }}) | |
| description: Node memory is filling up (< 10% left)\n VALUE = {{ $value }} | |
| - alert: HostMemoryUnderMemoryPressure | |
| expr: rate(node_vmstat_pgmajfault[1m]) > 1000 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host memory under memory pressure (instance {{ $labels.instance }}) | |
| description: The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }} | |
| - alert: HostUnusualNetworkThroughputIn | |
| expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 | |
| for: 5m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host unusual network throughput in (instance {{ $labels.instance }}) | |
| description: Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }} | |
| - alert: HostUnusualNetworkThroughputOut | |
| expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 | |
| for: 5m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host unusual network throughput out (instance {{ $labels.instance }}) | |
| description: Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }} | |
| - alert: HostUnusualDiskReadRate | |
| expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 | |
| for: 5m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host unusual disk read rate (instance {{ $labels.instance }}) | |
| description: Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }} | |
| - alert: HostUnusualDiskWriteRate | |
| expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host unusual disk write rate (instance {{ $labels.instance }}) | |
| description: Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }} | |
| # Please add ignored mountpoints in node_exporter parameters like | |
| # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". | |
| # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. | |
| - alert: HostOutOfDiskSpace | |
| expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host out of disk space (instance {{ $labels.instance }}) | |
| description: Disk is almost full (< 10% left)\n VALUE = {{ $value }} | |
| # Please add ignored mountpoints in node_exporter parameters like | |
| # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". | |
| # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. | |
| - alert: HostDiskWillFillIn24Hours | |
| expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) | |
| description: Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }} | |
| - alert: HostOutOfInodes | |
| expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host out of inodes (instance {{ $labels.instance }}) | |
| description: Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }} | |
| - alert: HostInodesWillFillIn24Hours | |
| expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) | |
| description: Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }} | |
| - alert: HostUnusualDiskReadLatency | |
| expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host unusual disk read latency (instance {{ $labels.instance }}) | |
| description: Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }} | |
| - alert: HostUnusualDiskWriteLatency | |
| expr: rate(node_disk_write_time_seconds_totali{device!~"mmcblk.+"}[1m]) / rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device!~"mmcblk.+"}[1m]) > 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host unusual disk write latency (instance {{ $labels.instance }}) | |
| description: Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }} | |
| - alert: HostHighCpuLoad | |
| expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80 | |
| for: 0m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host high CPU load (instance {{ $labels.instance }}) | |
| description: CPU load is > 80%\n VALUE = {{ $value }} | |
| - alert: HostCpuStealNoisyNeighbor | |
| expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 | |
| for: 0m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) | |
| description: CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }} | |
| # 1000 context switches is an arbitrary number. | |
| # Alert threshold depends on nature of application. | |
| # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 | |
| - alert: HostContextSwitching | |
| expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000 | |
| for: 0m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host context switching (instance {{ $labels.instance }}) | |
| description: Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }} | |
| - alert: HostSwapIsFillingUp | |
| expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host swap is filling up (instance {{ $labels.instance }}) | |
| description: Swap is filling up (>80%)\n VALUE = {{ $value }} | |
| - alert: HostSystemdServiceCrashed | |
| expr: node_systemd_unit_state{state="failed"} == 1 | |
| for: 0m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host SystemD service crashed (instance {{ $labels.instance }}) | |
| description: SystemD service crashed\n VALUE = {{ $value }} | |
| - alert: HostPhysicalComponentTooHot | |
| expr: node_hwmon_temp_celsius > 75 | |
| for: 5m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host physical component too hot (instance {{ $labels.instance }}) | |
| description: Physical hardware component too hot\n VALUE = {{ $value }} | |
| - alert: HostNodeOvertemperatureAlarm | |
| expr: node_hwmon_temp_crit_alarm_celsius == 1 | |
| for: 0m | |
| labels: | |
| severity: critical | |
| annotations: | |
| summary: Host node overtemperature alarm (instance {{ $labels.instance }}) | |
| description: Physical node temperature alarm triggered\n VALUE = {{ $value }} | |
| - alert: HostRaidArrayGotInactive | |
| expr: node_md_state{state="inactive"} > 0 | |
| for: 0m | |
| labels: | |
| severity: critical | |
| annotations: | |
| summary: Host RAID array got inactive (instance {{ $labels.instance }}) | |
| description: RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }} | |
| - alert: HostRaidDiskFailure | |
| expr: node_md_disks{state="failed"} > 0 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host RAID disk failure (instance {{ $labels.instance }}) | |
| description: At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }} | |
| - alert: HostKernelVersionDeviations | |
| expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 | |
| for: 6h | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host kernel version deviations (instance {{ $labels.instance }}) | |
| description: Different kernel versions are running\n VALUE = {{ $value }} | |
| - alert: HostOomKillDetected | |
| expr: increase(node_vmstat_oom_kill[1m]) > 0 | |
| for: 0m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host OOM kill detected (instance {{ $labels.instance }}) | |
| description: OOM kill detected\n VALUE = {{ $value }} | |
| - alert: HostEdacCorrectableErrorsDetected | |
| expr: increase(node_edac_correctable_errors_total[1m]) > 0 | |
| for: 0m | |
| labels: | |
| severity: info | |
| annotations: | |
| summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) | |
| description: Instance has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }} | |
| - alert: HostEdacUncorrectableErrorsDetected | |
| expr: node_edac_uncorrectable_errors_total > 0 | |
| for: 0m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) | |
| description: Instance has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }} | |
| - alert: HostNetworkReceiveErrors | |
| expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host Network Receive Errors (instance {{ $labels.instance }}:{{ $labels.device }}) | |
| description: Instance interface has encountered {{ printf "%.0f" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }} | |
| - alert: HostNetworkTransmitErrors | |
| expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host Network Transmit Errors (instance {{ $labels.instance }}:{{ $labels.device }}) | |
| description: Instance has encountered {{ printf "%.0f" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }} | |
| - alert: HostNetworkInterfaceSaturated | |
| expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 | |
| for: 1m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host Network Interface Saturated (instance {{ $labels.instance }}:{{ $labels.interface }}) | |
| description: The network interface is getting overloaded.\n VALUE = {{ $value }} | |
| - alert: HostConntrackLimit | |
| expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 | |
| for: 5m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host conntrack limit (instance {{ $labels.instance }}) | |
| description: The number of conntrack is approching limit\n VALUE = {{ $value }} | |
| - alert: HostClockSkew | |
| expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host clock skew (instance {{ $labels.instance }}) | |
| description: Clock skew detected. Clock is out of sync.\n VALUE = {{ $value }} | |
| - alert: HostClockNotSynchronising | |
| expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 | |
| for: 2m | |
| labels: | |
| severity: warning | |
| annotations: | |
| summary: Host clock not synchronising (instance {{ $labels.instance }}) | |
| description: Clock not synchronising.\n VALUE = {{ $value }} | |
This is helpful, thanks!
thanks !
thanks!
Thank you so much!
thanks a lot
rules1m.yaml
🛠️ Load Average 1m > 2
Apache Server Load > 1
📡 Mucho tráfico OUT > 600MB/s
Apache Response Time
Mucho tráfico IN > 600MB/s
🔥 CPU Temperature Router +64º
⚡NF Conntrack > 16.000
MySQL Too much connections 60%
MySQL Slow Queries
MySQL QPS > 400s
# Grafana alert-rule provisioning file (file-provisioning schema, apiVersion 1).
apiVersion: 1
groups:
    # All rules in this group live in folder "rules" of org 1 and are
    # evaluated every 30 seconds.
    - orgId: 1
      name: rules30s
      folder: rules
      interval: 30s
      rules:
        # Rule: 1-minute CPU load average. Query A reads node_load1 (instant)
        # from Prometheus; expression C is a threshold (A > 2) that is the
        # alert condition. Fires after the condition holds for 1m.
        - uid: ce0xw3r63atq8c
          title: "\U0001F6E0️ CPU Load Average 1m > 2"
          condition: C
          data:
            # A: raw Prometheus query.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                disableTextWrap: false
                editorMode: builder
                expr: node_load1
                fullMetaSearch: false
                includeNullMetadata: true
                instant: true
                intervalMs: 1000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
                useBackend: false
            # C: server-side expression — threshold "is above 2" applied to A.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 2
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations: {}
          labels: {}
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Rule: Apache server load. Query A reads apache_load{interval="1min"};
        # threshold C fires when the last value exceeds 1.4 for 1m.
        - uid: be0xx1u22eccge
          title: Apache Server Load > 1.40
          condition: C
          data:
            # A: raw Prometheus query.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                disableTextWrap: false
                editorMode: builder
                expr: apache_load{interval="1min"}
                fullMetaSearch: false
                includeNullMetadata: true
                instant: true
                intervalMs: 10000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
                useBackend: false
            # C: threshold expression "A > 1.4" — the alert condition.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 1.4
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations: {}
          labels: {}
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Rule: outbound network throughput in MB/s (per instance).
        # NOTE(review): the title says "> 600MB/s" but the threshold below is
        # 60 — one of the two is stale; confirm the intended limit.
        - uid: be0yeu47va1a8e
          title: "\U0001F4E1 Mucho tráfico OUT > 600MB/s"
          condition: C
          data:
            # A: transmit rate summed per instance, converted bytes -> MB.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                editorMode: code
                expr: sum by (instance) (rate(node_network_transmit_bytes_total[1m])) / 1024 / 1024
                instant: true
                intervalMs: 10000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
            # C: threshold expression "A > 60" — the alert condition.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 60
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Rule: Apache mean response time — total request duration divided by
        # request count over 1m. Threshold C fires when it exceeds 0.09 (the
        # apache_duration metric units follow the exporter; presumably ms per
        # request — confirm against the exporter). Linked to dashboard panel
        # Jc1gx9hVk/1 via annotations.
        - uid: ae0yfkh2q8ohsf
          title: Apache Response Time
          condition: C
          data:
            # A: duration-per-access ratio from the Apache exporter.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                editorMode: code
                exemplar: true
                expr: sum(rate(apache_duration_ms_total{instance=~"localhost:9117"}[1m])) / sum(rate(apache_accesses_total{instance=~"localhost:9117"}[1m]))
                format: time_series
                instant: true
                interval: ""
                intervalFactor: 1
                intervalMs: 15000
                legendFormat: Time
                maxDataPoints: 43200
                range: false
                refId: A
                step: 240
            # C: threshold expression "A > 0.09" — the alert condition.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 0.09
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          dashboardUid: Jc1gx9hVk
          panelId: 1
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations:
            __dashboardUid__: Jc1gx9hVk
            __panelId__: "1"
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Rule: inbound network throughput in MB/s (per instance) — mirror of
        # the OUT rule above.
        # NOTE(review): the title says "> 600MB/s" but the threshold below is
        # 60 — one of the two is stale; confirm the intended limit.
        - uid: fe0zcct15wp34f
          title: Mucho tráfico IN > 600MB/s
          condition: C
          data:
            # A: receive rate summed per instance, converted bytes -> MB.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                editorMode: code
                expr: sum by (instance) (rate(node_network_receive_bytes_total[1m])) / 1024 / 1024
                instant: true
                intervalMs: 10000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
            # C: threshold expression "A > 60" — the alert condition.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 60
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Rule: router CPU temperature from InfluxDB (Flux). Three-stage chain:
        # CPU (Flux query) -> A (reduce: last) -> B (threshold > 65), with B as
        # the alert condition. Linked to dashboard panel jY_JZIlGz/37.
        - uid: ae0zgn59v0ruoe
          title: "\U0001F525 CPU Temperature Router +65º"
          condition: B
          data:
            # CPU: Flux query for measurement "router_asus", field "temp_CPU",
            # aggregated by the dashboard window period (mean).
            - refId: CPU
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: adzvde9z38cg0a
              model:
                datasource:
                    type: influxdb
                    uid: adzvde9z38cg0a
                # groupBy/select/tags below are InfluxQL-builder leftovers; the
                # effective query is the Flux string in "query".
                groupBy:
                    - params:
                        - $__interval
                      type: time
                    - params:
                        - "null"
                      type: fill
                intervalMs: 10000
                maxDataPoints: 43200
                orderByTime: ASC
                policy: default
                query: "from(bucket: \"RedesZoneBUCKET\")\r\n    |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\r\n    |> filter(fn: (r) =>\r\n        r._measurement == \"router_asus\" and\r\n        r._field == \"temp_CPU\"        \r\n        )\r\n//    |> drop(columns: [\"container_version\", \"engine_host\", \"host\", \"server_version\"])\r\n    //|> aggregateWindow(every: 5m, fn: mean)\r\n    |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\r\n    |> yield(name: \"mean\")"
                refId: CPU
                resultFormat: time_series
                select:
                    - - params:
                            - value
                        type: field
                      - params: []
                        type: mean
                tags: []
            # A: reduce expression — collapses the CPU series to its last value.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 0
                            - 0
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                      reducer:
                        params: []
                        type: avg
                      type: query
                datasource:
                    name: Expression
                    type: __expr__
                    uid: __expr__
                expression: CPU
                intervalMs: 1000
                maxDataPoints: 43200
                reducer: last
                refId: A
                type: reduce
            # B: threshold expression "A > 65" — the alert condition.
            - refId: B
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 65
                            - 0
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                      reducer:
                        params: []
                        type: avg
                      type: query
                datasource:
                    name: Expression
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: B
                type: threshold
          dashboardUid: jY_JZIlGz
          panelId: 37
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations:
            __dashboardUid__: jY_JZIlGz
            __panelId__: "37"
          labels: {}
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Rule: netfilter conntrack table size. Chain: A (Prometheus query) ->
        # B (reduce: last) -> C (threshold > 20000), with C as the alert
        # condition. Linked to dashboard panel publicok/61.
        # NOTE(review): the "conditions" blocks inside the reduce/threshold
        # models look like leftover classic-condition editor state; the
        # effective evaluation is driven by "type"/"expression"/"reducer" —
        # confirm before pruning.
        - uid: ce0zkl6b8piwwf
          title: ⚡NF Conntrack > 20k
          condition: C
          data:
            # A: current conntrack entry count on localhost.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                expr: node_nf_conntrack_entries{instance="localhost:9100",job="node_exporter"}
                format: time_series
                interval: ""
                intervalFactor: 1
                intervalMs: 15000
                legendFormat: NF conntrack entries
                maxDataPoints: 43200
                refId: A
                step: 240
            # B: reduce expression — last value of A.
            - refId: B
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params: []
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - B
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                reducer: last
                refId: B
                type: reduce
            # C: threshold expression "B > 20000" — the alert condition.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 20000
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: B
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          dashboardUid: publicok
          panelId: 61
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations:
            __dashboardUid__: publicok
            __panelId__: "61"
          labels: {}
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Rule: MySQL connection usage — fires when the peak number of
        # connected threads over the last 1m exceeds 60% of max_connections.
        - uid: be11xgopyuvb4d
          title: MySQL Too much connections 60%
          condition: C
          data:
            # A: connection usage as a percentage of max_connections.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                editorMode: code
                # FIX: the original expression divided by
                # (mysql_global_variables_max_connections * 100), producing a
                # tiny fraction that could never exceed the threshold of 60.
                # Percent usage is (connected / max_connections) * 100.
                expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100
                instant: true
                intervalMs: 1000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
            # C: threshold expression "A > 60" — the alert condition.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 60
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Alert: more than 50 new slow queries within one minute.
        - uid: fe1235gx41czke
          title: MySQL Slow Queries
          condition: C
          data:
            # A: slow queries added in the last minute.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                editorMode: code
                # Fixed: was the bare range selector "mysql_global_status_slow_queries[1m]".
                # slow_queries is a monotonically increasing counter, so its raw value
                # eventually exceeds any fixed threshold forever; increase() gives the
                # per-minute delta the 50 threshold below is meant to compare against.
                expr: increase(mysql_global_status_slow_queries[1m])
                instant: true
                intervalMs: 1000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
            # C: threshold condition — fires when B (reduced A) is greater than 50.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 50
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: B
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
            # B: reduce step — collapses A to its last value for the threshold in C.
            - refId: B
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 0
                            - 0
                        type: gt
                      operator:
                        type: and
                      query:
                        params: []
                      reducer:
                        params: []
                        type: avg
                      type: query
                datasource:
                    name: Expression
                    type: __expr__
                    uid: __expr__
                expression: A
                hide: false
                intervalMs: 1000
                maxDataPoints: 43200
                reducer: last
                refId: B
                type: reduce
          noDataState: NoData
          execErrState: Error
          for: 1m
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Alert: sustained query rate above 400 queries/second for 1m.
        - uid: de128sontu29sc
          title: MySQL QPS > 400s
          condition: C
          data:
            # A: per-second query rate averaged over the last minute.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                editorMode: code
                expr: rate(mysql_global_status_questions[1m])
                instant: true
                intervalMs: 1000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
            # C: threshold condition — fires when A is greater than 400.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 400
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations: {}
          labels: {}
          isPaused: false
          notification_settings:
            receiver: grafana-default-email
        # Alert: more than 10 Cloudflare HTTP 5xx responses within 5 minutes.
        # Note: currently paused (isPaused: true) — it will not evaluate.
        - uid: ce12wohr1m51cd
          title: Cloudflare http 5xx error rate
          condition: C
          data:
            # A: count of 5xx-status zone requests added over the last 5 minutes.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                disableTextWrap: false
                editorMode: code
                expr: increase(cloudflare_zone_requests_status{status=~"^5.."}[5m])
                fullMetaSearch: false
                includeNullMetadata: true
                instant: true
                intervalMs: 15000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
                useBackend: false
            # C: threshold condition — fires when A is greater than 10.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 10
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          annotations: {}
          labels: {}
          isPaused: true
          notification_settings:
            receiver: grafana-default-email
        # Alert: more than 15 Cloudflare 429 (rate-limited) responses within 2 minutes.
        # Note: currently paused (isPaused: true) — it will not evaluate.
        - uid: be12xj80cqmf4c
          title: Rate limiting rules 429 CloudFLare
          condition: C
          data:
            # A: count of 429-status zone requests added over the last 2 minutes.
            - refId: A
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: de05xuoi6cav4b
              model:
                datasource:
                    type: prometheus
                    uid: de05xuoi6cav4b
                disableTextWrap: false
                editorMode: code
                expr: increase(cloudflare_zone_requests_status{status="429"}[2m])
                fullMetaSearch: false
                includeNullMetadata: true
                instant: true
                intervalMs: 60000
                legendFormat: __auto
                maxDataPoints: 43200
                range: false
                refId: A
                useBackend: false
            # C: threshold condition — fires when A is greater than 15.
            - refId: C
              relativeTimeRange:
                from: 60
                to: 0
              datasourceUid: __expr__
              model:
                conditions:
                    - evaluator:
                        params:
                            - 15
                        type: gt
                      operator:
                        type: and
                      query:
                        params:
                            - C
                      reducer:
                        params: []
                        type: last
                      type: query
                datasource:
                    type: __expr__
                    uid: __expr__
                expression: A
                intervalMs: 1000
                maxDataPoints: 43200
                refId: C
                type: threshold
          noDataState: NoData
          execErrState: Error
          for: 1m
          isPaused: true
          notification_settings:
            receiver: grafana-default-email
    rules1m.yaml
Apologies — I don't see how this relates to the gist. I think Grafana has a way of sharing/publishing dashboards; maybe this belongs there.
nice work! Thank you
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
Thanks, this was a great help