Skip to content

Instantly share code, notes, and snippets.

@grafuls
Created August 22, 2018 13:30
Show Gist options
  • Save grafuls/ce447423846dae3793fe2f338ed55519 to your computer and use it in GitHub Desktop.
Configuration map for Prometheus OCP deployment
---
# ConfigMap holding the Prometheus configuration for an OpenShift (OCP)
# deployment: alerting rules, recording rules, and the scrape configuration.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prom-k8s
data:
  alerting.rules: |
    groups:
    - name: example-rules
      interval: 30s  # defaults to global interval
      rules:
      - alert: Node Down
        expr: up{job="kubernetes-nodes"} == 0
        annotations:
          miqTarget: "ContainerNode"
          severity: "HIGH"
  recording.rules: |
    groups:
    - name: aggregate_container_resources
      rules:
      - record: container_cpu_usage_rate
        expr: sum without (cpu) (rate(container_cpu_usage_seconds_total[5m]))
      - record: container_memory_rss_by_type
        expr: container_memory_rss{id=~"/|/system.slice|/kubepods.slice"} > 0
      - record: container_cpu_usage_percent_by_host
        expr: sum(rate(container_cpu_usage_seconds_total{id="/"}[5m])) BY(kubernetes_io_hostname) / ON(kubernetes_io_hostname) machine_cpu_cores
      - record: apiserver_request_count_rate_by_resources
        expr: sum without (client,instance,contentType) (rate(apiserver_request_count[5m]))
  prometheus.yml: |
    rule_files:
      - '*.rules'

    scrape_configs:
    # Scrape config for the Kubernetes API servers.
    - job_name: 'kubernetes-apiservers'
      kubernetes_sd_configs:
      - role: endpoints
        namespaces:
          names:
          - default
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      # Keep only the default/kubernetes service endpoints for the https port. This
      # will add targets for each API server which Kubernetes adds an endpoint to
      # the default/kubernetes service.
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: kubernetes;https

    # Scrape config for controllers.
    #
    # Each master node exposes a /metrics endpoint on :8444 that contains operational metrics for
    # the controllers.
    #
    # TODO: move this to a pure endpoints based metrics gatherer when controllers are exposed via
    # endpoints.
    - job_name: 'kubernetes-controllers'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: endpoints
        namespaces:
          names:
          - default
      # Keep only the default/kubernetes service endpoints for the https port, and then
      # set the port to 8444. This is the default configuration for the controllers on OpenShift
      # masters.
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: kubernetes;https
      - source_labels: [__address__]
        action: replace
        target_label: __address__
        regex: (.+)(?::\d+)
        replacement: $1:8444

    # Scrape config for nodes.
    #
    # Each node exposes a /metrics endpoint that contains operational metrics for
    # the Kubelet and other components.
    - job_name: 'kubernetes-nodes'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
      # Drop a very high cardinality metric that is incorrect in 3.7. It will be
      # fixed in 3.9.
      metric_relabel_configs:
      - source_labels: [__name__]
        action: drop
        regex: 'openshift_sdn_pod_(setup|teardown)_latency(.*)'
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

    # Scrape config for cAdvisor.
    #
    # Beginning in Kube 1.7, each node exposes a /metrics/cadvisor endpoint that
    # reports container metrics for each running pod. Scrape those by default.
    - job_name: 'kubernetes-cadvisor'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      metrics_path: /metrics/cadvisor
      kubernetes_sd_configs:
      - role: node
      # Exclude a set of high cardinality metrics that can contribute to significant
      # memory use in large clusters. These can be selectively enabled as necessary
      # for medium or small clusters.
      metric_relabel_configs:
      - source_labels: [__name__]
        action: drop
        regex: 'container_(cpu_user_seconds_total|cpu_cfs_periods_total|memory_usage_bytes|memory_swap|memory_cache|last_seen|fs_(read_seconds_total|write_seconds_total|sector_(.*)|io_(.*)|reads_merged_total|writes_merged_total)|tasks_state|memory_failcnt|memory_failures_total|spec_memory_swap_limit_bytes|fs_(.*)_bytes_total|spec_(.*))'
      relabel_configs:
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

    # Scrape config for service endpoints.
    #
    # The relabeling allows the actual service scrape endpoint to be configured
    # via the following annotations:
    #
    # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
    # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
    #   to set this to `https` & most likely set the `tls_config` of the scrape config.
    # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
    # * `prometheus.io/port`: If the metrics are exposed on a different port to the
    #   service then set this appropriately.
    - job_name: 'kubernetes-service-endpoints'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        # TODO: this should be per target
        insecure_skip_verify: true
      kubernetes_sd_configs:
      - role: endpoints
      relabel_configs:
      # only scrape infrastructure components
      - source_labels: [__meta_kubernetes_namespace]
        action: keep
        regex: 'default|logging|metrics|kube-.+|openshift|openshift-.+'
      # drop infrastructure components managed by other scrape targets
      - source_labels: [__meta_kubernetes_service_name]
        action: drop
        regex: 'prometheus-node-exporter'
      # only those that have requested scraping
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
        action: keep
        regex: 'true'  # quoted so YAML keeps the annotation value a string
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
        action: replace
        target_label: __scheme__
        regex: (https?)
      - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
        action: replace
        target_label: __address__
        regex: (.+)(?::\d+);(\d+)
        replacement: $1:$2
      - action: labelmap
        regex: __meta_kubernetes_service_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_service_name]
        action: replace
        target_label: kubernetes_name

    # Scrape config for node-exporter, which is expected to be running on port 9100.
    - job_name: 'kubernetes-nodes-exporter'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      kubernetes_sd_configs:
      - role: node
      metric_relabel_configs:
      - source_labels: [__name__]
        action: drop
        regex: 'node_cpu|node_(disk|scrape_collector)_.+'
      # preserve a subset of the network, netstat, vmstat, and filesystem series
      - source_labels: [__name__]
        action: replace
        regex: '(node_(netstat_Ip_.+|vmstat_(nr|thp)_.+|filesystem_(free|size|device_error)|network_(transmit|receive)_(drop|errs)))'
        target_label: __name__
        replacement: renamed_$1
      - source_labels: [__name__]
        action: drop
        regex: 'node_(netstat|vmstat|filesystem|network)_.+'
      - source_labels: [__name__]
        action: replace
        regex: 'renamed_(.+)'
        target_label: __name__
        replacement: $1
      # drop any partial expensive series
      - source_labels: [__name__, device]
        action: drop
        regex: 'node_network_.+;veth.+'
      - source_labels: [__name__, mountpoint]
        action: drop
        regex: 'node_filesystem_(free|size|device_error);([^/].*|/.+)'
      relabel_configs:
      # node-exporter listens on :9100 instead of the Kubelet's :10250
      - source_labels: [__address__]
        regex: '(.*):10250'
        replacement: '${1}:9100'
        target_label: __address__
      - source_labels: [__meta_kubernetes_node_label_kubernetes_io_hostname]
        target_label: __instance__
      - action: labelmap
        regex: __meta_kubernetes_node_label_(.+)

    # TODO: auto-generate these sections, or add a dynamic infrastructure scraper
    # Scrape config for the template service broker
    - job_name: 'openshift-template-service-broker'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
        server_name: apiserver.openshift-template-service-broker.svc
      bearer_token_file: /var/run/secrets/kubernetes.io/scraper/token
      kubernetes_sd_configs:
      - role: endpoints
        namespaces:
          names:
          - openshift-template-service-broker
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: api-server;https

    # Scrape config for the router
    - job_name: 'openshift-router'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
        server_name: router.default.svc
      bearer_token_file: /var/run/secrets/kubernetes.io/scraper/token
      kubernetes_sd_configs:
      - role: endpoints
        namespaces:
          names:
          - default
      relabel_configs:
      - source_labels: [__meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
        action: keep
        regex: router;1936-tcp

    # Send fired alerts to the local Alertmanager instance.
    alerting:
      alertmanagers:
      - scheme: http
        static_configs:
        - targets:
          - "localhost:9093"
---
# ConfigMap holding the Alertmanager configuration: a single default route
# that forwards every alert to the alert-buffer webhook receiver.
# NOTE(review): the original paste had the leading characters of the keys
# truncated (apersion/medata/me/da/ertmanager.yml); restored to the standard
# ConfigMap schema.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager
data:
  alertmanager.yml: |
    global:

    # The root route on which each incoming alert enters.
    route:
      # default route if none match
      receiver: alert-buffer-wh

      # The labels by which incoming alerts are grouped together. For example,
      # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
      # be batched into a single group.
      # TODO:
      group_by: []

      # All the above attributes are inherited by all child routes and can
      # overwritten on each.

    receivers:
    - name: alert-buffer-wh
      webhook_configs:
      - url: http://localhost:9099/topics/alerts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment