Skip to content

Instantly share code, notes, and snippets.

@cmwylie19
Created April 14, 2022 22:04
Show Gist options
  • Select an option

  • Save cmwylie19/13daac0a90ba4958021dc6d545e91ec5 to your computer and use it in GitHub Desktop.

Select an option

Save cmwylie19/13daac0a90ba4958021dc6d545e91ec5 to your computer and use it in GitHub Desktop.
MonitoringStackOperator Progress Demo, markdown as code

Demo Overview

This demo has been tested on OCP 4.10.5

TOC

Install MSO

Install the CatalogSource and the Subscription

# Register the MSO catalog index image with OLM, then subscribe to the
# operator from that catalog so OLM installs it cluster-wide.
kubectl apply -f -<<EOF
apiVersion: operators.coreos.com/v1alpha1
kind: CatalogSource
metadata:
  name: monitoring-operators
  namespace: openshift-marketplace
spec:
  displayName: Monitoring Test Operator
  icon:
    base64data: ""
    mediatype: ""
  image: quay.io/tsisodia10/monitoring-stack-operator-catalog:latest
  publisher: Twinkll Sisodia
  sourceType: grpc
  updateStrategy:
    registryPoll:
      # Re-poll the index image for catalog updates every minute.
      interval: 1m0s
---
apiVersion: operators.coreos.com/v1alpha1
kind: Subscription
metadata:
  labels:
    operators.coreos.com/monitoring-stack-operator.openshift-operators: ""
  name: monitoring-stack-operator
  namespace: openshift-operators
spec:
  channel: development
  # Approve and apply new install plans without manual review.
  installPlanApproval: Automatic
  name: monitoring-stack-operator
  source: monitoring-operators
  sourceNamespace: openshift-marketplace
  startingCSV: monitoring-stack-operator.v0.0.1
EOF

Create MSO Instance

# Create a MonitoringStack instance that remote-writes a filtered subset of
# metrics to an external Observatorium endpoint.
kubectl apply -f -<<EOF
apiVersion: monitoring.rhobs/v1alpha1
kind: MonitoringStack
metadata:
  name: starburst
spec:
  logLevel: debug
  prometheusConfig:
    remoteWrite:
    # NOTE(review): "redacted" is a placeholder — substitute a real bearer
    # token before applying.
    - bearerToken: redacted
      tlsConfig:
        # Demo-only: skips TLS certificate verification of the remote endpoint.
        insecureSkipVerify: true
      url: https://observatorium-observatorium.apps.mmikhail-obs1.kni.syseng.devcluster.openshift.com/api/metrics/v1/test/api/v1/receive
      writeRelabelConfigs:
      # Forward only the metrics matching this regex; everything else is
      # dropped before remote write.
      - action: keep
        regex: (csv_succeeded$|csv_abnormal$|cluster_version$|ALERTS$|subscription_sync_total|trino_.*$|jvm_heap_memory_used$)
        sourceLabels:
        - __name__
  resources:
    # NOTE(review): "M"/"512M" are decimal megabytes; binary "Mi" may have
    # been intended — confirm.
    limits:
      cpu: 500m
      memory: 512M
    requests:
      cpu: 100m
      memory: 256M
  # Keep scraped samples for 5 days.
  retention: 120h
EOF

Create Starburst ServiceMonitor

# Create the ServiceMonitors that tell Prometheus what to scrape:
# the Starburst metrics endpoint, the demo "blue" app, and node-exporter.
kubectl apply -f -<<EOF
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app: starburst-enterprise
  name: starburst-monitor
  namespace: monitoring-stack-operator
spec:
  endpoints:
  - port: metrics
    # Demo-only: 2s scrape cadence so alerts fire quickly. scrapeTimeout
    # must not exceed interval.
    scrapeTimeout: 2s
    interval: 2s
  namespaceSelector:
    # FIX: the original also set 'any: true', which makes matchNames a no-op
    # (when any is true the name list is ignored). Keep only matchNames so
    # scraping is actually restricted to openshift-operators.
    matchNames:
    - openshift-operators
  selector:
    matchLabels:
      app: starburst-enterprise
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: starburst-monitor-2
  namespace: monitoring-stack-operator
  labels:
    app: blue
spec:
  selector:
    matchLabels:
      app: blue
  endpoints:
  - port: http
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app: prometheus-node-exporter
    jobLabel: node-exporter
  name: metrics-prometheus-node-exporter
  namespace: monitoring-stack-operator
spec:
  endpoints:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    bearerTokenSecret:
      key: ""
    interval: 15s
    port: https
    relabelings:
    # Expose the node name as the 'instance' label instead of pod IP:port.
    - action: replace
      regex: (.*)
      sourceLabels:
      - __meta_kubernetes_pod_node_name
      targetLabel: instance
    scheme: https
    tlsConfig:
      insecureSkipVerify: true
  jobLabel: app.kubernetes.io/name
  namespaceSelector:
    # FIX: dropped the conflicting 'any: true' (see note above) so only
    # openshift-monitoring is selected, as the matchNames entry intends.
    matchNames:
    - openshift-monitoring
  selector:
    matchLabels:
      app.kubernetes.io/component: exporter
      app.kubernetes.io/name: node-exporter
      app.kubernetes.io/part-of: openshift-monitoring
EOF

Create PrometheusRules for Starburst

# Create recording rules (pre-computed memory metrics) and alerting rules
# for the Starburst demo, plus a second rule set for the "blue" app.
kubectl apply -f -<<EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule 
metadata:
  creationTimestamp: null
  labels:
    prometheus: starburst
  name: starburst-rules
  namespace: monitoring-stack-operator
spec:
  groups:
  - name: starburst_custom_rules
    # In this case, we need to trigger an alert as soon as an instances goes down for demo, 15s too long
    interval: 1s # Configurable like doc says 
    rules: 

    # The average memory used by all queries over a given time period
    - record: starburst_query_mem
      expr: avg_over_time(jvm_memory_bytes_used{endpoint="metrics"}[5m])

    # The max memory available to queries cluster wide
    # NOTE(review): this expression is identical to starburst_max_heap_mem
    # below — confirm whether a different metric was intended here.
    - record: starburst_max_query_mem
      expr: jvm_memory_bytes_max{endpoint="metrics", area="heap"}

    # The amount of heap memory used by the JVMs across all cluster nodes
    - record: starburst_heap_mem
      expr: jvm_memory_bytes_used{endpoint="metrics",area="heap"}

    # The max amount of heap memory configured in the JVM aggregated across the entire 
    - record: starburst_max_heap_mem
      expr: jvm_memory_bytes_max{endpoint="metrics",area="heap"}

  - name: starburst_alert_rules
    rules: 

    # Query Memory Alert — fires when the recorded average exceeds ~42 GiB
    - alert: high_starburst_query_mem
      expr: starburst_query_mem >= 45158388108 
      labels:
        severity: page # or critical 
      annotations:
        summary: "High Query Memory"
        description: "High average memory used by all queries over a given time period"

    # Max Memory Alert ( hard to alert on this )
    - alert: high_starburst_max_query_mem
      expr: starburst_max_query_mem >= 94489280512 
      labels:
        severity: warn 
      annotations:
        summary: "High Heap Memory"
        description: "High amount of heap memory used by the JVMs across all cluster nodes" 

    # Heap Memory Alert
    - alert: high_starburst_heap_mem
      expr: starburst_heap_mem >= 45631505600
      labels:
        severity: page # this is critical
      annotations:
        summary: "High Max Heap Memory"
        description: "The max amount of heap memory configured in the JVM aggregated across the entire cluster" 

    # Max Heap Memory Alert
    - alert: high_starburst_max_heap_mem
      expr: starburst_max_heap_mem >= 94489280512
      labels:
        severity: acknowledged
      annotations:
        summary: "High Max Heap Memory Alert"
        description: "The max amount of heap memory configured in the JVM aggregated across the entire cluster" 
    
    # Instance down — assumes exactly 3 scrape targets on the metrics
    # endpoint; any deviation (pod churn) fires the alert.
    - alert: starburst_instance_down
      expr: count(up{endpoint="metrics"}) != 3
      labels:
        severity: page
      annotations:
        summary: "Starburst instance down"
        description: "The pods churned" 

    # High Thread Count
    - alert: high_thread_count
      expr: sum(thread_count) > 400
      labels:
        severity: page
      annotations:
        summary: "High Thread Count"
        description: "High Thread Count" 

    # JVM Filling UP — per-instance heap usage above 80% for 2 minutes.
    # The \$ escapes keep the heredoc from expanding Prometheus's
    # {{ \$labels }} / {{ \$value }} template variables as shell variables.
    - alert: JvmMemoryFillingUp
      expr: (sum by (instance)(jvm_memory_bytes_used{area="heap"}) / sum by (instance)(jvm_memory_bytes_max{area="heap"})) * 100 > 80
      for: 2m
      labels:
        severity: page
      annotations:
        summary: "JVM memory filling up (instance {{ \$labels.instance }})"
        description: "JVM memory is filling up (> 80%)\n  VALUE = {{ \$value }}\n  LABELS = {{ \$labels }}"

    # Failed Queries
    - alert: starburst_failed_queries
      expr: failed_queries >= 4
      labels:
        severity: page
      annotations:
        summary: "Queries are failing"
        description: "In the last 5 mins the failed queries have risen"     

    # Trino Active Nodes
    - alert: trino_node_failure
      expr: trino_active_nodes <= 1
      labels:
        severity: page
      annotations:
        summary: "Trino node failure"
        description: "An active trino node went down"
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule 
metadata:
  creationTimestamp: null
  labels:
    prometheus: blue
    role: alert-rules
  name: starburst-rules-2
  namespace: monitoring-stack-operator
spec:
  groups:
  - name: recording_rules
    interval: 2s
    rules: 
    # NOTE(review): the record name says "memory limit" but the expression
    # measures request rate on the blue container — confirm intent.
    - record: trino_pod_memory_limit
      expr: increase(http_requests_total{container="blue"}[1m])
  - name: LoadRules
    rules: 
    # Three tiers over the same recorded series: page / warn / acknowledged.
    - alert: HighTrinoPodMemory
      expr: trino_pod_memory_limit >= 10 
      labels:
        severity: page # or critical 
      annotations:
        summary: "high memory usage"
        description: "Trino pods are running out of memory"
    - alert: MediumTrinoPodMemory
      expr: trino_pod_memory_limit >= 5 
      labels:
        severity: warn 
      annotations:
        summary: "medium load average"
        description: "medium load average" 
    - alert: LowTrinoPodMemory
      expr: trino_pod_memory_limit >= 1 
      labels:
        severity: acknowledged
      annotations:
        summary: "low load average"
        description: "low load average"    
EOF

Create AlertmanagerConfig for Starburst

# Route firing alerts to PagerDuty. Only 'page'-severity alerts match the
# child route; the integration key is read from the pagerduty-key Secret.
kubectl apply -f -<<EOF
apiVersion: monitoring.coreos.com/v1alpha1
kind: AlertmanagerConfig
metadata:
  name: starburst-pagerduty
  namespace: monitoring-stack-operator
  labels:
    alertmanagerConfig: starburst
spec:
  route:
    groupBy: [alertname,cluster,service,job]
    # Demo-only timing: 2s everywhere so notifications arrive immediately;
    # production values (e.g. 15s+) noted inline.
    groupWait: 2s #15s
    groupInterval: 2s #15s
    repeatInterval: 2s #15s
    receiver: 'pagerduty-instance'
    routes: 

    # Don't page on warnings
    # - match: 
    #     severity: 'warn'
    #   receiver: pagerduty-instance
    # - match: 
    #     severity: 'acknowledged'
    #   receiver: pagerduty-instance

    # Only page under severe conditions
    - match: 
        severity: 'page'
      receiver: pagerduty-instance

  receivers:
  - name: 'pagerduty-instance'
    pagerdutyConfigs:
    # serviceKey references Secret 'pagerduty-key', field 'secretKey'.
    - serviceKey: 
        key: secretKey
        name: pagerduty-key
      url: https://events.pagerduty.com/generic/2010-04-15/create_event.json

EOF

Trigger Alerts and Send Them to PagerDuty

# Trigger alerts: generate load against the blue service, then churn the
# Starburst pods so the instance-down alert fires.
# NOTE(review): "n" looks like a placeholder pod name for the exec — confirm
# the pod to run curl from before using.
for x in $(seq 20); do kubectl exec -it n -- curl blue:9000/; done

# FIX: use 'kubectl' (the 'k' alias used originally is not defined anywhere
# in this demo and fails on a fresh shell).
kubectl delete po -n openshift-operators -l app=starburst-enterprise --force --grace-period=0

Cleanup

# FIX: all three resource sets were created in the monitoring-stack-operator
# namespace (see the manifests above), so the deletes must target it
# explicitly — without -n they run against the current namespace and miss.
kubectl delete servicemonitor -n monitoring-stack-operator metrics-prometheus-node-exporter starburst-monitor starburst-monitor-2

kubectl delete prometheusrules -n monitoring-stack-operator starburst-rules starburst-rules-2

kubectl delete alertmanagerconfig -n monitoring-stack-operator starburst-pagerduty
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment