Last active
November 12, 2024 04:19
-
-
Save jiang-wei/ab8b258344854cf01da23a5631d95abc to your computer and use it in GitHub Desktop.
PromQL examples & explanation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
这是从 GKE Marketplace 里安装的 Prometheus server 的 rules 配置
当做 rules 样例学习
$ cat rules.yaml
"groups":
- "name": "k8s.rules"
  "rules":
  - "expr": |
      sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])) by (namespace)
    "record": "namespace:container_cpu_usage_seconds_total:sum_rate"
    # job="cadvisor": series from the cadvisor scrape job
    # rate(...[5m]): per-second rate of change over a 5-minute window
    # sum(...) by (namespace): aggregate per namespace
  - "expr": |
      sum(container_memory_usage_bytes{job="cadvisor", image!=""}) by (namespace)
    "record": "namespace:container_memory_usage_bytes:sum"
    # same aggregation pattern as above, for memory usage
  - "expr": |
      sum by (namespace, label_name) (
         sum(rate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])) by (namespace, pod_name)
       * on (namespace, pod_name) group_left(label_name)
         label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
      )
    "record": "namespace_name:container_cpu_usage_seconds_total:sum_rate"
    # label_replace(vector, dst_label, replacement, src_label, src_regex): copies the
    # value of the "pod" label into a new "pod_name" label
    # v1 * on (namespace, pod_name) group_left(label_name) v2: multiply series matching
    # on (namespace, pod_name) and pull label_name from v2 into the result
    # sum(...) by (namespace, pod_name): aggregate per namespace/pod_name first
    # kube_pod_labels values are always 1, so from the result's point of view the
    # "* on () group_left(label)" join only attaches v2's label_name to each series
    # net effect: attach label_name to container_cpu_usage_seconds_total, then
    # aggregate by (namespace, label_name)
  - "expr": |
      sum by (namespace, label_name) (
         sum(container_memory_usage_bytes{job="cadvisor",image!=""}) by (pod_name, namespace)
       * on (namespace, pod_name) group_left(label_name)
         label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
      )
    "record": "namespace_name:container_memory_usage_bytes:sum"
    # same join pattern as above, for memory usage
  - "expr": |
      sum by (namespace, label_name) (
         sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod)
       * on (namespace, pod) group_left(label_name)
         label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
      )
    "record": "namespace_name:kube_pod_container_resource_requests_memory_bytes:sum"
    # same join pattern as above, for memory requests
  - "expr": |
      sum by (namespace, label_name) (
         sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
       * on (namespace, pod) group_left(label_name)
         label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
      )
    "record": "namespace_name:kube_pod_container_resource_requests_cpu_cores:sum"
    # v1 and on (pod) v2: keep v1's samples only where v2 has a series with the same
    # "pod" label; here that filters requests_cpu_cores to pods whose
    # status_scheduled="true", excluding pods that have not been scheduled yet
- "name": "kube-scheduler.rules"
  "rules":
  - "expr": |
      histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.99"
    "record": "cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile"
    # histogram_quantile(0.99, ...): estimate the 99th-percentile value from the buckets
    # the recorded series carries an explicit quantile="0.99" label
    # sum(...) without(l1, l2): aggregate while dropping l1/l2 and keeping all other labels
    # / 1e+06: the source buckets are in microseconds; convert to seconds
  - "expr": |
      histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.99"
    "record": "cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile"
  - "expr": |
      histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.99"
    "record": "cluster_quantile:scheduler_binding_latency:histogram_quantile"
  - "expr": |
      histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.9"
    "record": "cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile"
  - "expr": |
      histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.9"
    "record": "cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile"
  - "expr": |
      histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.9"
    "record": "cluster_quantile:scheduler_binding_latency:histogram_quantile"
  - "expr": |
      histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.5"
    "record": "cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile"
  - "expr": |
      histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.5"
    "record": "cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile"
  - "expr": |
      histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.5"
    "record": "cluster_quantile:scheduler_binding_latency:histogram_quantile"
- "name": "kube-apiserver.rules"
  "rules":
  # Same pattern as the scheduler group: per-quantile API-server request latency,
  # converted from microseconds to seconds via / 1e+06.
  - "expr": |
      histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.99"
    "record": "cluster_quantile:apiserver_request_latencies:histogram_quantile"
  - "expr": |
      histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.9"
    "record": "cluster_quantile:apiserver_request_latencies:histogram_quantile"
  - "expr": |
      histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
    "labels":
      "quantile": "0.5"
    "record": "cluster_quantile:apiserver_request_latencies:histogram_quantile"
- "name": "node.rules"
  "rules":
  - "expr": "sum(min(kube_pod_info) by (node))"
    "record": ":kube_pod_info_node_count:"
    # min(kube_pod_info) by (node): collapse to one series per node
    # sum(): add them up to get the node count as a single value
    # in practice sum(max(...)) would give the same result
  - "expr": |
      max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
    "record": "node_namespace_pod:kube_pod_info:"
    # the label_replace here looks redundant (it copies "pod" onto itself)
    # result: a vector keyed by (node, namespace, pod) whose values are all 1
  - "expr": |
      count by (node) (sum by (node, cpu) (
        node_cpu{job="node-exporter"}
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      ))
    "record": "node:node_num_cpu:sum"
    # node_cpu * on () group_left(node) node_namespace_pod:kube_pod_info: attaches a
    # "node" label to every node_cpu series
    # sum by (node, cpu): one series per (node, cpu) pair
    # count by (node): number of CPUs per node
    # a simpler expression could likely achieve the same result
  - "expr": |
      1 - avg(rate(node_cpu{job="node-exporter",mode="idle"}[1m]))
    "record": ":node_cpu_utilisation:avg1m"
  - "expr": |
      1 - avg by (node) (
        rate(node_cpu{job="node-exporter",mode="idle"}[1m])
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:)
    "record": "node:node_cpu_utilisation:avg1m"
    # same as above, but the join attaches a "node" label so the average is per node
  - "expr": |
      sum(node_load1{job="node-exporter"})
      /
      sum(node:node_num_cpu:sum)
    "record": ":node_cpu_saturation_load1:"
    # total node_load1 across all nodes divided by the total CPU count
  - "expr": |
      sum by (node) (
        node_load1{job="node-exporter"}
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
      /
      node:node_num_cpu:sum
    "record": "node:node_cpu_saturation_load1:"
  - "expr": |
      1 -
      sum(node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
      /
      sum(node_memory_MemTotal{job="node-exporter"})
    "record": ":node_memory_utilisation:"
  - "expr": |
      sum by (node) (
        (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_memory_bytes_available:sum"
  - "expr": |
      sum by (node) (
        node_memory_MemTotal{job="node-exporter"}
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_memory_bytes_total:sum"
  - "expr": |
      (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
      /
      scalar(sum(node:node_memory_bytes_total:sum))
    "record": "node:node_memory_utilisation:ratio"
  - "expr": |
      1e3 * sum(
        (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
      + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
      )
    "record": ":node_memory_swap_io_bytes:sum_rate"
  - "expr": |
      1 -
      sum by (node) (
        (node_memory_MemFree{job="node-exporter"} + node_memory_Cached{job="node-exporter"} + node_memory_Buffers{job="node-exporter"})
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
      /
      sum by (node) (
        node_memory_MemTotal{job="node-exporter"}
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_memory_utilisation:"
  - "expr": |
      1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
    "record": "node:node_memory_utilisation_2:"
  - "expr": |
      1e3 * sum by (node) (
        (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
      + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_memory_swap_io_bytes:sum_rate"
  - "expr": |
      avg(irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
    "record": ":node_disk_utilisation:avg_irate"
  - "expr": |
      avg by (node) (
        irate(node_disk_io_time_ms{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_disk_utilisation:avg_irate"
  - "expr": |
      avg(irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3)
    "record": ":node_disk_saturation:avg_irate"
  - "expr": |
      avg by (node) (
        irate(node_disk_io_time_weighted{job="node-exporter",device=~"(sd|xvd|nvme).+"}[1m]) / 1e3
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_disk_saturation:avg_irate"
  - "expr": |
      sum(irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m])) +
      sum(irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
    "record": ":node_net_utilisation:sum_irate"
  - "expr": |
      sum by (node) (
        (irate(node_network_receive_bytes{job="node-exporter",device="eth0"}[1m]) +
        irate(node_network_transmit_bytes{job="node-exporter",device="eth0"}[1m]))
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_net_utilisation:sum_irate"
  - "expr": |
      sum(irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m])) +
      sum(irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
    "record": ":node_net_saturation:sum_irate"
  - "expr": |
      sum by (node) (
        (irate(node_network_receive_drop{job="node-exporter",device="eth0"}[1m]) +
        irate(node_network_transmit_drop{job="node-exporter",device="eth0"}[1m]))
      * on (namespace, pod) group_left(node)
        node_namespace_pod:kube_pod_info:
      )
    "record": "node:node_net_saturation:sum_irate"
- "name": "kube-prometheus-node-recording.rules"
  "rules":
  - "expr": "sum(rate(node_cpu{mode!=\"idle\",mode!=\"iowait\"}[3m])) BY (instance)"
    "record": "instance:node_cpu:rate:sum"
  - "expr": "sum((node_filesystem_size{mountpoint=\"/\"} - node_filesystem_free{mountpoint=\"/\"})) BY (instance)"
    "record": "instance:node_filesystem_usage:sum"
  - "expr": "sum(rate(node_network_receive_bytes[3m])) BY (instance)"
    "record": "instance:node_network_receive_bytes:rate:sum"
  - "expr": "sum(rate(node_network_transmit_bytes[3m])) BY (instance)"
    "record": "instance:node_network_transmit_bytes:rate:sum"
  - "expr": "sum(rate(node_cpu{mode!=\"idle\",mode!=\"iowait\"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)"
    "record": "instance:node_cpu:ratio"
    # count(sum(node_cpu) BY (instance, cpu)) BY (instance): number of CPUs per instance
    # the left-hand sum keeps many extra labels that must survive into the result,
    # hence the many-to-one division with GROUP_LEFT() on instance
  - "expr": "sum(rate(node_cpu{mode!=\"idle\",mode!=\"iowait\"}[5m]))"
    "record": "cluster:node_cpu:sum_rate5m"
  - "expr": "cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))"
    # bug fix: this expression referenced cluster:node_cpu:rate5m, but the rule above
    # records cluster:node_cpu:sum_rate5m — the ratio would otherwise always be empty
    "record": "cluster:node_cpu:ratio"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment