This demo has been tested on OCP 4.10.5.
TOC
- Install MonitoringStack Operator
- Create an MSO Instance
- Create a ServiceMonitor for Starburst
- Create PrometheusRules for Starburst
- Create an AlertmanagerConfig for Starburst
- Trigger Alerts in PagerDuty
- Cleanup
Install MonitoringStack Operator
Install the CatalogSource and the Subscription:
kubectl apply -f -<<EOF
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
name: monitoring-operators
namespace: openshift-marketplace
spec:
displayName: Monitoring Test Operator
icon:
base64data: ""
mediatype: ""
image: quay.io/tsisodia10/monitoring-stack-operator-catalog:latest
publisher: Twinkll Sisodia
sourceType: grpc
updateStrategy:
registryPoll:
interval: 1m0s
---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
labels:
operators.coreos.com/monitoring-stack-operator.openshift-operators: ""
name: monitoring-stack-operator
namespace: openshift-operators
spec:
channel: development
installPlanApproval: Automatic
name: monitoring-stack-operator
source: monitoring-operators
sourceNamespace: openshift-marketplace
startingCSV: monitoring-stack-operator.v0.0.1
EOF
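Before moving on, confirm the operator actually installed. A quick sanity check (the pod grep pattern is an assumption about how the operator deployment is named; the CSV name comes from startingCSV above):
# The CSV should reach the Succeeded phase
kubectl get csv -n openshift-operators | grep monitoring-stack-operator
# The operator pod should be Running
kubectl get pods -n openshift-operators | grep monitoring-stack-operator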
Create an MSO Instance
kubectl apply -f -<<EOF
apiVersion: monitoring.rhobs/v1alpha1
kind: MonitoringStack
metadata:
name: starburst
spec:
logLevel: debug
prometheusConfig:
remoteWrite:
- bearerToken: redacted
tlsConfig:
insecureSkipVerify: true
url: https://observatorium-observatorium.apps.mmikhail-obs1.kni.syseng.devcluster.openshift.com/api/metrics/v1/test/api/v1/receive
writeRelabelConfigs:
- action: keep
regex: (csv_succeeded$|csv_abnormal$|cluster_version$|ALERTS$|subscription_sync_total|trino_.*$|jvm_heap_memory_used$)
sourceLabels:
- __name__
resources:
limits:
cpu: 500m
memory: 512M
requests:
cpu: 100m
memory: 256M
retention: 120h
EOF
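To verify the stack reconciled, check the MonitoringStack resource and look for the Prometheus pods the operator creates for it (the pod name prefix is an assumption; the operator derives it from the stack name):
kubectl get monitoringstack starburst
kubectl get pods | grep prometheus-starburst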
Create a ServiceMonitor for Starburst
kubectl apply -f -<<EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app: starburst-enterprise
name: starburst-monitor
namespace: monitoring-stack-operator
spec:
endpoints:
- port: metrics
scrapeTimeout: 2s
interval: 2s
namespaceSelector:
matchNames:
- openshift-operators
selector:
matchLabels:
app: starburst-enterprise
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: starburst-monitor-2
namespace: monitoring-stack-operator
labels:
app: blue
spec:
selector:
matchLabels:
app: blue
endpoints:
- port: http
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
app: prometheus-node-exporter
jobLabel: node-exporter
name: metrics-prometheus-node-exporter
namespace: monitoring-stack-operator
spec:
endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 15s
port: https
relabelings:
- action: replace
regex: (.*)
sourceLabels:
- __meta_kubernetes_pod_node_name
targetLabel: instance
scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: app.kubernetes.io/name
namespaceSelector:
matchNames:
- openshift-monitoring
selector:
matchLabels:
app.kubernetes.io/component: exporter
app.kubernetes.io/name: node-exporter
app.kubernetes.io/part-of: openshift-monitoring
EOF
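To check that these ServiceMonitors are actually being scraped, port-forward the stack's Prometheus and hit its targets API. The service name starburst-prometheus is an assumption derived from the stack name:
kubectl port-forward -n monitoring-stack-operator svc/starburst-prometheus 9090 &
# Every scraped endpoint should report "health":"up"
curl -s http://localhost:9090/api/v1/targets | grep -o '"health":"[a-z]*"' | sort | uniq -c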
Create PrometheusRules for Starburst
kubectl apply -f -<<EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: starburst
name: starburst-rules
namespace: monitoring-stack-operator
spec:
groups:
- name: starburst_custom_rules
    # For the demo we want an alert as soon as an instance goes down; the default 15s evaluation interval is too slow
    interval: 1s # the evaluation interval is configurable, as the docs note
rules:
# The average memory used by all queries over a given time period
- record: starburst_query_mem
expr: avg_over_time(jvm_memory_bytes_used{endpoint="metrics"}[5m])
# The max memory available to queries cluster wide
- record: starburst_max_query_mem
expr: jvm_memory_bytes_max{endpoint="metrics", area="heap"}
# The amount of heap memory used by the JVMs across all cluster nodes
- record: starburst_heap_mem
expr: jvm_memory_bytes_used{endpoint="metrics",area="heap"}
    # The max amount of heap memory configured in the JVM aggregated across the entire cluster
- record: starburst_max_heap_mem
expr: jvm_memory_bytes_max{endpoint="metrics",area="heap"}
- name: starburst_alert_rules
rules:
# Query Memory Alert
- alert: high_starburst_query_mem
expr: starburst_query_mem >= 45158388108
labels:
severity: page # or critical
annotations:
summary: "High Query Memory"
description: "High average memory used by all queries over a given time period"
    # Max Memory Alert (hard to alert on this)
- alert: high_starburst_max_query_mem
expr: starburst_max_query_mem >= 94489280512
labels:
severity: warn
annotations:
summary: "High Heap Memory"
description: "High amount of heap memory used by the JVMs across all cluster nodes"
# Heap Memory Alert
- alert: high_starburst_heap_mem
expr: starburst_heap_mem >= 45631505600
labels:
severity: page # this is critical
annotations:
summary: "High Max Heap Memory"
description: "The max amount of heap memory configured in the JVM aggregated across the entire cluster"
# Max Heap Memory Alert
- alert: high_starburst_max_heap_mem
expr: starburst_max_heap_mem >= 94489280512
labels:
severity: acknowledged
annotations:
summary: "High Max Heap Memory Alert"
description: "The max amount of heap memory configured in the JVM aggregated across the entire cluster"
# Instance down
- alert: starburst_instance_down
expr: count(up{endpoint="metrics"}) != 3
labels:
severity: page
annotations:
summary: "Starburst instance down"
description: "The pods churned"
# High Thread Count
- alert: high_thread_count
expr: sum(thread_count) > 400
labels:
severity: page
annotations:
summary: "High Thread Count"
description: "High Thread Count"
# JVM Filling UP
- alert: JvmMemoryFillingUp
expr: (sum by (instance)(jvm_memory_bytes_used{area="heap"}) / sum by (instance)(jvm_memory_bytes_max{area="heap"})) * 100 > 80
for: 2m
labels:
severity: page
annotations:
summary: "JVM memory filling up (instance {{ \$labels.instance }})"
description: "JVM memory is filling up (> 80%)\n VALUE = {{ \$value }}\n LABELS = {{ \$labels }}"
# Failed Queries
- alert: starburst_failed_queries
expr: failed_queries >= 4
labels:
severity: page
annotations:
summary: "Queries are failing"
description: "In the last 5 mins the failed queries have risen"
# Trino Active Nodes
- alert: trino_node_failure
expr: trino_active_nodes <= 1
labels:
severity: page
annotations:
summary: "Trino node failure"
description: "An active trino node went down"
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: blue
role: alert-rules
name: starburst-rules-2
namespace: monitoring-stack-operator
spec:
groups:
- name: recording_rules
interval: 2s
rules:
    # Demo stand-in: despite the name, this records the blue service's HTTP request rate so load can be generated on demand
    - record: trino_pod_memory_limit
expr: increase(http_requests_total{container="blue"}[1m])
- name: LoadRules
rules:
- alert: HighTrinoPodMemory
expr: trino_pod_memory_limit >= 10
labels:
severity: page # or critical
annotations:
summary: "high memory usage"
description: "Trino pods are running out of memory"
- alert: MediumTrinoPodMemory
expr: trino_pod_memory_limit >= 5
labels:
severity: warn
annotations:
summary: "medium load average"
description: "medium load average"
- alert: LowTrinoPodMemory
expr: trino_pod_memory_limit >= 1
labels:
severity: acknowledged
annotations:
summary: "low load average"
description: "low load average"
EOF
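Once the PrometheusRules load, each record above becomes a queryable series. A quick check through the Prometheus HTTP API, assuming the port-forward from the ServiceMonitor step is still running:
curl -s 'http://localhost:9090/api/v1/query?query=starburst_query_mem'
curl -s 'http://localhost:9090/api/v1/query?query=trino_pod_memory_limit'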
Create an AlertmanagerConfig for Starburst
kubectl apply -f -<<EOF
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
name: starburst-pagerduty
namespace: monitoring-stack-operator
labels:
alertmanagerConfig: starburst
spec:
route:
groupBy: [alertname,cluster,service,job]
    groupWait: 2s # 15s is more realistic; shortened for the demo
    groupInterval: 2s # 15s
    repeatInterval: 2s # 15s
receiver: 'pagerduty-instance'
routes:
# Don't page on warnings
# - match:
# severity: 'warn'
# receiver: pagerduty-instance
# - match:
# severity: 'acknowledged'
# receiver: pagerduty-instance
    # Only page under severe conditions
- match:
severity: 'page'
receiver: pagerduty-instance
receivers:
- name: 'pagerduty-instance'
pagerdutyConfigs:
- serviceKey:
key: secretKey
name: pagerduty-key
url: https://events.pagerduty.com/generic/2010-04-15/create_event.json
EOF
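Note that serviceKey above points at a Secret this demo never creates. A minimal sketch, assuming your PagerDuty Events API integration key is exported as PD_KEY:
kubectl create secret generic pagerduty-key -n monitoring-stack-operator --from-literal=secretKey=$PD_KEY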
Trigger Alerts in PagerDuty
# Generate load against the blue service so the trino_pod_memory_limit recording rule climbs and the TrinoPodMemory alerts fire
# <pod> is a placeholder for any pod that can reach the blue service
for x in $(seq 20); do kubectl exec -it <pod> -- curl blue:9000/; done
# Force-delete the Starburst pods to trigger the starburst_instance_down alert
kubectl delete po -n openshift-operators -l app=starburst-enterprise --force --grace-period=0
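Before PagerDuty pages, you can watch the alerts fire directly against the Prometheus API (again assuming the earlier port-forward is still running):
curl -s http://localhost:9090/api/v1/alerts | grep -c '"state":"firing"'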
Cleanup
kubectl delete servicemonitor metrics-prometheus-node-exporter starburst-monitor starburst-monitor-2 -n monitoring-stack-operator
kubectl delete prometheusrules starburst-rules starburst-rules-2 -n monitoring-stack-operator
kubectl delete alertmanagerconfig starburst-pagerduty -n monitoring-stack-operator
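Finally, remove the stack itself and the operator plumbing created at the start:
kubectl delete monitoringstack starburst
kubectl delete subscription monitoring-stack-operator -n openshift-operators
kubectl delete catalogsource monitoring-operators -n openshift-marketplace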