Last active
November 4, 2025 02:52
-
-
Save davidlu1001/f4afc8e32dda13d86a235f977a77c151 to your computer and use it in GitHub Desktop.
dashboard.tf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # dashboard.tf | |
| # Azure Monitor Dashboard for AKS Platform & Application Overview | |
| resource "azurerm_portal_dashboard" "aks_monitoring" { | |
| count = var.enable_dashboard ? 1 : 0 | |
| name = "${local.base_suffix}-monitoring-dashboard" | |
| resource_group_name = var.resource_group_name | |
| location = var.location | |
| tags = merge(local.alert_tags, { dashboard_version = "1.2", managed_by = "terraform" }) | |
| dashboard_properties = jsonencode({ | |
| lenses = { | |
| "0" = { | |
| order = 0 | |
| parts = { | |
# ============================================================
# NODE HEALTH SECTION (Items 0-4)
# ============================================================
"0" = {
  # Full-width (16-column), single-row Markdown banner placed above the node tiles.
  position = { x = 0, y = 0, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 and rendered as
          # garbage characters; it is written here as a real UTF-8 glyph.
          content  = "## 🖥️ Node Health & Resource Utilization"
          subtitle = "Monitor cluster node performance and availability"
        }
      }
    }
  }
}
# Node CPU usage as a percentage of each node's CPU capacity.
#
# The usage counter (cpuUsageNanoCores) is divided by the node's
# cpuCapacityNanoCores so the result stays within 0-100 % regardless of core
# count. Dividing the raw counter by 10000000 would yield "percent of a single
# core", which exceeds 100 on multi-core nodes.
#
# The query emits avg_cpu and p95_cpu per 5-minute bin and node; the chart's
# yAxis plots avg_cpu only, one line per Computer.
"1" = {
  position = { x = 0, y = 1, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "cpu001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let cpuCapacity = Perf
          | where ObjectName == 'K8SNode' and CounterName == 'cpuCapacityNanoCores'
          | summarize CapacityNanoCores = max(CounterValue) by Computer;
          Perf
          | where ObjectName == 'K8SNode' and CounterName == 'cpuUsageNanoCores'
          | where isnotnull(CounterValue) and CounterValue > 0
          | join kind=inner (cpuCapacity) on Computer
          | where CapacityNanoCores > 0
          | extend CPUPercent = CounterValue * 100.0 / CapacityNanoCores
          | summarize
              avg_cpu = avg(CPUPercent),
              p95_cpu = percentile(CPUPercent, 95)
              by bin(TimeGenerated, 5m), Computer
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Node CPU Usage (%)", isOptional = true },
      { name = "PartSubTitle", value = "Avg & P95 by Node", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "avg_cpu", type = "real" }]
          splitBy     = [{ name = "Computer", type = "string" }]
          aggregation = "Average"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Node memory working set, converted from bytes to GB (bytes / 1073741824).
# The query yields avg_mem and max_mem per 5-minute bin and node; the chart
# plots avg_mem only, one line per Computer.
"2" = {
  position = { x = 8, y = 1, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "mem001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          Perf
          | where ObjectName == 'K8SNode' and CounterName == 'memoryWorkingSetBytes'
          | where isnotnull(CounterValue) and CounterValue > 0
          | extend MemoryGB = CounterValue / 1073741824
          | summarize
          avg_mem = avg(MemoryGB),
          max_mem = max(MemoryGB)
          by bin(TimeGenerated, 5m), Computer
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Node Memory Usage (GB)", isOptional = true },
      { name = "PartSubTitle", value = "Working Set - Avg & Max", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "avg_mem", type = "real" }]
          splitBy     = [{ name = "Computer", type = "string" }]
          aggregation = "Average"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Node disk usage: platform-metric chart of node_disk_usage_percentage for the
# AKS cluster. The same chart definition appears twice, once as the "options"
# input and once under settings.content.options; only the settings copy carries
# a timespan.
"3" = {
  position = { x = 0, y = 6, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      {
        name = "options"
        value = {
          chart = {
            metrics = [
              {
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                name                = "node_disk_usage_percentage"
                aggregationType     = 4
                namespace           = "Microsoft.ContainerService/managedClusters"
                metricVisualization = { displayName = "Disk Used Percentage" }
              }
            ]
            title         = "Node Disk Usage (%)"
            titleKind     = 2
            visualization = { chartType = 2 }
          }
          title = "Node Disk Usage (%)"
        }
        isOptional = true
      }
    ]
    type = "Extension/HubsExtension/PartType/MonitorChartPart"
    settings = {
      content = {
        options = {
          chart = {
            metrics = [
              {
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                name                = "node_disk_usage_percentage"
                aggregationType     = 4
                namespace           = "Microsoft.ContainerService/managedClusters"
                metricVisualization = { displayName = "Disk Used Percentage" }
              }
            ]
            title         = "Node Disk Usage (%)"
            titleKind     = 2
            visualization = { chartType = 2 }
            # NOTE(review): 86400000 is presumably milliseconds (24 h) — confirm.
            timespan = { relative = { duration = 86400000 } }
          }
        }
      }
    }
  }
}
# Node health issues: distinct count of nodes whose Status is anything other
# than 'Ready', per 5-minute bin, stacked by Status value.
"4" = {
  position = { x = 8, y = 6, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "health001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubeNodeInventory
          | where Status != 'Ready'
          | summarize NodeCount = dcount(Computer) by bin(TimeGenerated, 5m), Status
          | where NodeCount > 0
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Node Health Issues", isOptional = true },
      { name = "PartSubTitle", value = "Non-Ready Nodes", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "NodeCount", type = "long" }]
          splitBy     = [{ name = "Status", type = "string" }]
          aggregation = "Sum"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# ============================================================
# POD HEALTH SECTION (Items 5-9)
# ============================================================
"5" = {
  # Full-width (16-column), single-row Markdown banner placed above the pod tiles.
  position = { x = 0, y = 11, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 and rendered as
          # garbage characters; it is written here as a real UTF-8 glyph.
          content  = "## 🐳 Pod Health & Container Performance"
          subtitle = "Track pod lifecycle, restarts, and resource consumption"
        }
      }
    }
  }
}
# Pod restart count (not a rate): highest ContainerRestartCount seen per pod in
# each 5-minute bin, charted as one line per pod.
#
# The series is limited to the topPodCount pods with the highest peak restart
# count over the selected time range. Truncating the binned result with
# `order by ... | take N` would instead keep N arbitrary (bin, pod) rows and
# leave each pod with a fragmentary line.
"6" = {
  position = { x = 0, y = 12, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "restart001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let topPodCount = 10;
          let restarts = KubePodInventory
          | where isnotempty(ContainerRestartCount)
          | extend RestartCount = toint(ContainerRestartCount)
          | where RestartCount > 0;
          let topPods = restarts
          | summarize PeakRestarts = max(RestartCount) by Name
          | top topPodCount by PeakRestarts desc
          | project Name;
          restarts
          | where Name in (topPods)
          | summarize MaxRestarts = max(RestartCount) by bin(TimeGenerated, 5m), Name
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Pod Restart Count", isOptional = true },
      { name = "PartSubTitle", value = "Top Pods with Restarts", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "MaxRestarts", type = "long" }]
          splitBy     = [{ name = "Name", type = "string" }]
          aggregation = "Max"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Pod count by namespace: distinct pod names per 5-minute bin, stacked by
# Namespace, with kube-system, kube-public and kube-node-lease excluded.
"7" = {
  position = { x = 8, y = 12, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "podns001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace !in ('kube-system', 'kube-public', 'kube-node-lease')
          | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), Namespace
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Pod Count by Namespace", isOptional = true },
      { name = "PartSubTitle", value = "Application namespaces", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "Namespace", type = "string" }]
          aggregation = "Sum"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Pod memory by namespace: average container working-set memory (GB) per
# 5-minute bin, stacked by Namespace, system namespaces excluded.
#
# Perf has no Namespace column, so container samples are joined to
# KubePodInventory on the pod UID:
#   * The pod UID is taken as the second-to-last '/'-separated segment of
#     InstanceName (expected shape: <cluster resource id>/<pod uid>/<container>).
#     A fixed index such as [3] lands inside the cluster resource id instead.
#   * Both sides are aligned on the same 5-minute bin. Joining on the raw
#     TimeGenerated would require identical timestamps in two independently
#     written tables and therefore match almost nothing.
#   * The inventory side is reduced to distinct (PodUid, Namespace, TimeBin)
#     rows so each Perf sample is counted once.
"8" = {
  position = { x = 0, y = 17, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "podmem001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let memData = Perf
          | where ObjectName == 'K8SContainer' and CounterName == 'memoryWorkingSetBytes'
          | where isnotnull(CounterValue) and CounterValue > 0
          | extend InstanceParts = split(InstanceName, '/')
          | extend PodUid = tostring(InstanceParts[array_length(InstanceParts) - 2])
          | project TimeBin = bin(TimeGenerated, 5m), PodUid, MemoryBytes = CounterValue;
          KubePodInventory
          | where isnotempty(Namespace) and Namespace !in ('kube-system', 'kube-public', 'kube-node-lease')
          | summarize by PodUid, Namespace, TimeBin = bin(TimeGenerated, 5m)
          | join kind=inner (memData) on PodUid, TimeBin
          | extend MemoryGB = MemoryBytes / 1073741824
          | summarize avg_mem = avg(MemoryGB) by TimeGenerated = TimeBin, Namespace
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Pod Memory by Namespace (GB)", isOptional = true },
      { name = "PartSubTitle", value = "Average memory usage", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "avg_mem", type = "real" }]
          splitBy     = [{ name = "Namespace", type = "string" }]
          aggregation = "Average"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Unhealthy pods: distinct pod names whose PodStatus is neither 'Running' nor
# 'Succeeded', per 5-minute bin, stacked by PodStatus.
"9" = {
  position = { x = 8, y = 17, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "unhealthy001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where PodStatus !in ('Running', 'Succeeded')
          | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), PodStatus
          | where PodCount > 0
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Unhealthy Pods", isOptional = true },
      { name = "PartSubTitle", value = "Non-Running/Succeeded Status", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "PodStatus", type = "string" }]
          aggregation = "Sum"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# ============================================================
# CONTROL PLANE HEALTH (Items 10-13)
# ============================================================
"10" = {
  # Full-width (16-column), single-row Markdown banner placed above the
  # kube-system tiles.
  position = { x = 0, y = 22, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 and rendered as
          # garbage characters; it is written here as a real UTF-8 glyph.
          content  = "## ⚙️ Control Plane & Platform Health"
          subtitle = "Monitor control plane pods and system components"
        }
      }
    }
  }
}
# kube-system pod status: distinct pod names in the kube-system namespace per
# 5-minute bin, stacked by PodStatus.
"11" = {
  position = { x = 0, y = 23, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "cp001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace == 'kube-system'
          | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), PodStatus
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Control Plane Pod Status", isOptional = true },
      { name = "PartSubTitle", value = "kube-system namespace", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "PodStatus", type = "string" }]
          aggregation = "Sum"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# kube-system pod restarts: highest non-zero ContainerRestartCount per pod and
# 5-minute bin within the kube-system namespace, one line per pod Name.
"12" = {
  position = { x = 8, y = 23, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "cprestart001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace == 'kube-system'
          | where isnotempty(ContainerRestartCount)
          | extend RestartCount = toint(ContainerRestartCount)
          | where RestartCount > 0
          | summarize MaxRestarts = max(RestartCount) by bin(TimeGenerated, 5m), Name
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Control Plane Pod Restarts", isOptional = true },
      { name = "PartSubTitle", value = "System pod restart count", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "MaxRestarts", type = "long" }]
          splitBy     = [{ name = "Name", type = "string" }]
          aggregation = "Max"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# kube-system / node warning and error events, counted per 5-minute bin and
# stacked by a derived Severity.
#
# Notes on the query:
#   * The event type is read from KubeEventType. The generic `Type` column
#     carries the table name, so comparing it to 'Warning'/'Error' never matches.
#   * Reason is matched with `contains` (case-insensitive substring). `has`
#     matches whole terms only and therefore misses reasons such as
#     'FailedScheduling' or 'FailedMount'.
#   * Severity: 'Critical' for an 'Error' event type or a reason containing
#     'fatal'; 'Warning' for a 'Warning' event type or a reason containing
#     'fail'/'error'; everything else is 'Info' and is dropped.
"13" = {
  position = { x = 0, y = 28, rowSpan = 5, colSpan = 16 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "cperror001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubeEvents
          | where Namespace == 'kube-system' or ObjectKind in ('Node', 'ComponentStatus')
          | extend Severity = case(
              KubeEventType == 'Error' or Reason contains 'fatal', 'Critical',
              KubeEventType == 'Warning' or Reason contains 'fail' or Reason contains 'error', 'Warning',
              'Info'
            )
          | where Severity != 'Info'
          | summarize ErrorCount = count() by bin(TimeGenerated, 5m), Severity
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Control Plane Error Events", isOptional = true },
      { name = "PartSubTitle", value = "Warning and Error events", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "ErrorCount", type = "long" }]
          splitBy     = [{ name = "Severity", type = "string" }]
          aggregation = "Sum"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# ============================================================
# NETWORK & INGRESS (Items 14-16)
# ============================================================
"14" = {
  # Full-width (16-column), single-row Markdown banner placed above the
  # network tiles.
  position = { x = 0, y = 33, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 and rendered as
          # garbage characters.
          # NOTE(review): the original glyph could not be recovered from the
          # damaged bytes; a globe was chosen to fit the section — confirm.
          content  = "## 🌐 Network & Ingress Metrics"
          subtitle = "Monitor network throughput and connectivity"
        }
      }
    }
  }
}
# Network throughput: platform-metric chart of node_network_in_bytes and
# node_network_out_bytes for the AKS cluster. The chart definition appears
# twice, once as the "options" input and once under settings.content.options;
# the two copies use different display names, and only the settings copy
# carries a timespan.
"15" = {
  position = { x = 0, y = 34, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      {
        name = "options"
        value = {
          chart = {
            metrics = [
              {
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                name                = "node_network_in_bytes"
                aggregationType     = 1
                namespace           = "Microsoft.ContainerService/managedClusters"
                metricVisualization = { displayName = "Network In" }
              },
              {
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                name                = "node_network_out_bytes"
                aggregationType     = 1
                namespace           = "Microsoft.ContainerService/managedClusters"
                metricVisualization = { displayName = "Network Out" }
              }
            ]
            title         = "Network Throughput (Bytes)"
            titleKind     = 2
            visualization = { chartType = 2 }
          }
          title = "Network Throughput"
        }
        isOptional = true
      }
    ]
    type = "Extension/HubsExtension/PartType/MonitorChartPart"
    settings = {
      content = {
        options = {
          chart = {
            metrics = [
              {
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                name                = "node_network_in_bytes"
                aggregationType     = 1
                namespace           = "Microsoft.ContainerService/managedClusters"
                metricVisualization = { displayName = "Network In (Bytes)" }
              },
              {
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                name                = "node_network_out_bytes"
                aggregationType     = 1
                namespace           = "Microsoft.ContainerService/managedClusters"
                metricVisualization = { displayName = "Network Out (Bytes)" }
              }
            ]
            title         = "Network Throughput (Bytes)"
            titleKind     = 2
            visualization = { chartType = 2 }
            # NOTE(review): 86400000 is presumably milliseconds (24 h) — confirm.
            timespan = { relative = { duration = 86400000 } }
          }
        }
      }
    }
  }
}
# Ingress controller pods: distinct Running pod names per 5-minute bin whose
# ControllerName contains the term 'ingress', 'nginx', 'traefik' or 'istio',
# one line per ControllerName.
"16" = {
  position = { x = 8, y = 34, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "ingress001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where ControllerName has 'ingress' or ControllerName has 'nginx' or ControllerName has 'traefik' or ControllerName has 'istio'
          | where PodStatus == 'Running'
          | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), ControllerName
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Ingress Controller Pods", isOptional = true },
      { name = "PartSubTitle", value = "Running ingress pods", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "ControllerName", type = "string" }]
          aggregation = "Average"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# ============================================================
# CAPACITY PLANNING (Items 17-20)
# ============================================================
"17" = {
  # Full-width (16-column), single-row Markdown banner placed above the
  # capacity tiles.
  position = { x = 0, y = 39, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 and rendered as
          # garbage characters.
          # NOTE(review): the original glyph could not be recovered from the
          # damaged bytes; a bar chart was chosen to fit the section — confirm.
          content  = "## 📊 Capacity Planning & Resource Trends"
          subtitle = "Resource utilization trends for capacity planning"
        }
      }
    }
  }
}
# Cluster-wide utilisation trend: hourly average node CPU (% of node capacity)
# and hourly average node memory working set (GB), emitted as two series
# distinguished by MetricType ('CPU_Percent', 'Memory_GB').
#
# CPU usage is divided by each node's cpuCapacityNanoCores so the
# 'CPU_Percent' series is a true 0-100 % figure. Dividing the raw nanocore
# counter by 10000000 would report percent of a single core, which exceeds 100
# on multi-core nodes.
"18" = {
  position = { x = 0, y = 40, rowSpan = 5, colSpan = 16 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "capacity001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let cpu_capacity = Perf
          | where ObjectName == 'K8SNode' and CounterName == 'cpuCapacityNanoCores'
          | summarize CapacityNanoCores = max(CounterValue) by Computer;
          let cpu_data = Perf
          | where ObjectName == 'K8SNode' and CounterName == 'cpuUsageNanoCores'
          | where isnotnull(CounterValue) and CounterValue > 0
          | join kind=inner (cpu_capacity) on Computer
          | where CapacityNanoCores > 0
          | summarize avg_cpu = avg(CounterValue * 100.0 / CapacityNanoCores) by bin(TimeGenerated, 1h)
          | extend MetricType = 'CPU_Percent', Value = avg_cpu;
          let mem_data = Perf
          | where ObjectName == 'K8SNode' and CounterName == 'memoryWorkingSetBytes'
          | where isnotnull(CounterValue) and CounterValue > 0
          | summarize avg_mem = avg(CounterValue / 1073741824) by bin(TimeGenerated, 1h)
          | extend MetricType = 'Memory_GB', Value = avg_mem;
          cpu_data
          | union mem_data
          | project TimeGenerated, MetricType, Value
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "24h Resource Utilization Trend", isOptional = true },
      { name = "PartSubTitle", value = "CPU % & Memory GB", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "Value", type = "real" }]
          splitBy     = [{ name = "MetricType", type = "string" }]
          aggregation = "Average"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Running pods by namespace: distinct pod names with PodStatus 'Running' per
# 1-hour bin, stacked by Namespace (all namespaces included).
"19" = {
  position = { x = 0, y = 45, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "runpods001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where PodStatus == 'Running'
          | summarize PodCount = dcount(Name) by bin(TimeGenerated, 1h), Namespace
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Running Pods by Namespace", isOptional = true },
      { name = "PartSubTitle", value = "Pod density over time", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "Namespace", type = "string" }]
          aggregation = "Sum"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Total container count: distinct non-empty ContainerID values per 1-hour bin,
# drawn as a single line (no split-by dimension).
"20" = {
  position = { x = 8, y = 45, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "container001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where isnotempty(ContainerID)
          | summarize ContainerCount = dcount(ContainerID) by bin(TimeGenerated, 1h)
          | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Total Container Count", isOptional = true },
      { name = "PartSubTitle", value = "Container density trend", isOptional = true },
      {
        name = "Dimensions"
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "ContainerCount", type = "long" }]
          aggregation = "Average"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
| # ============================================================ | |
| # APPLICATION HEALTH & PERFORMANCE (Items 21-35) | |
| # ============================================================ | |
| # Full-width (colSpan = 16) Markdown banner that opens the application | |
| # section. It takes no inputs and only renders the heading and subtitle. | |
| "21" = { | |
| position = { x = 0, y = 50, rowSpan = 1, colSpan = 16 } | |
| metadata = { | |
| inputs = [] | |
| type = "Extension/HubsExtension/PartType/MarkdownPart" | |
| settings = { | |
| content = { | |
| settings = { | |
| # The heading starts with an emoji; keep this file saved as UTF-8. | |
| content = "## 📱 Application Health & Performance (immuta namespace)" | |
| subtitle = "Monitor application pods, resources, and troubleshooting" | |
| } | |
| } | |
| } | |
| } | |
| } | |
| # Running Pods Tile | |
| # Single-value grid tile: number of pods in the 'immuta' namespace whose | |
| # most recent inventory record reports PodStatus 'Running'. | |
| # The query first reduces the last 10 minutes of inventory to one row per | |
| # pod (latest record wins) so pods that were replaced or deleted earlier in | |
| # the dashboard time range are not counted. The 10-minute window matches | |
| # the snapshot window used by the "Image Versions & Deployment Status" tile. | |
| "22" = { | |
| position = { x = 0, y = 51, rowSpan = 4, colSpan = 4 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_running", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace == 'immuta' | |
| | where TimeGenerated > ago(10m) | |
| | summarize arg_max(TimeGenerated, PodStatus) by Name | |
| | summarize RunningPods = countif(PodStatus == 'Running') | |
| | project RunningPods | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| { name = "PartTitle", value = "Running Pods", isOptional = true }, | |
| { name = "PartSubTitle", value = "immuta namespace", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Unhealthy Pods Tile | |
| # Single-value grid tile: number of pods in the 'immuta' namespace whose | |
| # most recent inventory record is neither 'Running' nor 'Succeeded'. | |
| # The status is evaluated on the latest record per pod within the last | |
| # 10 minutes, so a pod that was only briefly Pending while starting (or has | |
| # since been deleted) does not stay flagged for the whole dashboard range. | |
| "23" = { | |
| position = { x = 4, y = 51, rowSpan = 4, colSpan = 4 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_unhealthy", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace == 'immuta' | |
| | where TimeGenerated > ago(10m) | |
| | summarize arg_max(TimeGenerated, PodStatus) by Name | |
| | summarize UnhealthyPods = countif(PodStatus !in ('Running', 'Succeeded')) | |
| | project UnhealthyPods | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| { name = "PartTitle", value = "Unhealthy Pods", isOptional = true }, | |
| { name = "PartSubTitle", value = "Non-Running/Failed", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Pod Restarts (1h) Tile | |
| # Single-value grid tile: container restarts in the 'immuta' namespace | |
| # during the last hour. | |
| # ContainerRestartCount is a running total that is repeated on every | |
| # inventory record, so the records themselves must not be summed. The query | |
| # takes, per container, the increase of the counter inside the window | |
| # (max - min) and then adds those increases together. | |
| # ContainerName is used as the per-container key. | |
| # NOTE(review): assumes ContainerName is unique per pod instance (pod UID prefix) - confirm in the workspace. | |
| "24" = { | |
| position = { x = 8, y = 51, rowSpan = 4, colSpan = 4 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_restarts", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace == 'immuta' | |
| | where TimeGenerated > ago(1h) | |
| | where isnotempty(ContainerRestartCount) | |
| | extend RestartCount = toint(ContainerRestartCount) | |
| | summarize RestartsInWindow = max(RestartCount) - min(RestartCount) by ContainerName | |
| | summarize TotalRestarts = sum(RestartsInWindow) | |
| | project TotalRestarts | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| { name = "PartTitle", value = "Pod Restarts (1h)", isOptional = true }, | |
| { name = "PartSubTitle", value = "Recent restart count", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Average Pod Age Tile | |
| # Single-value grid tile: mean age, in days (one decimal), of the pods in | |
| # the 'immuta' namespace that are currently Running. | |
| # The inventory is reduced to one row per pod (latest record in the last | |
| # 10 minutes) before averaging, so every pod carries the same weight no | |
| # matter how many inventory records it produced, and pods that no longer | |
| # exist are left out. Age is converted to days before avg() so the | |
| # aggregate runs on a plain number. | |
| "25" = { | |
| position = { x = 12, y = 51, rowSpan = 4, colSpan = 4 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_age", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace == 'immuta' | |
| | where TimeGenerated > ago(10m) | |
| | summarize arg_max(TimeGenerated, PodStatus, PodCreationTimeStamp) by Name | |
| | where PodStatus == 'Running' | |
| | where isnotempty(PodCreationTimeStamp) | |
| | extend PodAgeDays = (now() - PodCreationTimeStamp) / 1d | |
| | summarize AvgAgeDays = avg(PodAgeDays) | |
| | project AvgAgeDays = round(AvgAgeDays, 1) | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| { name = "PartTitle", value = "Avg Pod Age (days)", isOptional = true }, | |
| { name = "PartSubTitle", value = "Running pods", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Pods in CrashLoopBackOff - CRITICAL | |
| # Full-width grid listing up to 20 containers in the 'immuta' and | |
| # 'nginx-ingress' namespaces that are crash-looping or have restarted more | |
| # than 5 times. | |
| # - The inventory is first collapsed to the latest record per container so | |
| #   one failing pod cannot fill all 20 rows with per-minute duplicates. | |
| # - CrashLoop is matched on ContainerStatusReason as well as PodStatus. | |
| # NOTE(review): ContainerStatusReason is assumed to carry the kubelet waiting reason (e.g. CrashLoopBackOff) - confirm in the workspace. | |
| "26" = { | |
| position = { x = 0, y = 55, rowSpan = 5, colSpan = 16 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_crashloop", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace in ('immuta', 'nginx-ingress') | |
| | summarize arg_max(TimeGenerated, PodStatus, ContainerStatusReason, ContainerRestartCount, Computer) by Name, Namespace, ContainerName | |
| | extend RestartCount = toint(ContainerRestartCount) | |
| | where PodStatus contains 'CrashLoop' or ContainerStatusReason contains 'CrashLoop' or RestartCount > 5 | |
| | project | |
| Time = format_datetime(TimeGenerated, 'yyyy-MM-dd HH:mm'), | |
| PodName = Name, | |
| Namespace, | |
| Status = PodStatus, | |
| Reason = ContainerStatusReason, | |
| Restarts = RestartCount, | |
| Node = Computer | |
| | order by Restarts desc, Time desc | |
| | take 20 | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| # The title starts with an emoji; keep this file saved as UTF-8. | |
| { name = "PartTitle", value = "🚨 CRITICAL: Pods in CrashLoopBackOff or High Restarts", isOptional = true }, | |
| { name = "PartSubTitle", value = "immuta & nginx-ingress - Immediate action required", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Top Restarting Pods | |
| # Line chart of the restart counter over time for the 10 pods in the | |
| # 'immuta' namespace with the highest restart count in the selected range. | |
| # The ranking is done per pod first (topPods); only then is the time series | |
| # built for those pods. Ranking the already-binned rows would keep just 10 | |
| # data points instead of 10 series. | |
| "27" = { | |
| position = { x = 0, y = 60, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_top_restarts", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| let podRestarts = KubePodInventory | |
| | where Namespace == 'immuta' | |
| | where isnotempty(ContainerRestartCount) | |
| | extend RestartCount = toint(ContainerRestartCount) | |
| | where RestartCount > 0; | |
| let topPods = podRestarts | |
| | summarize PeakRestarts = max(RestartCount) by Name | |
| | top 10 by PeakRestarts desc | |
| | project Name; | |
| podRestarts | |
| | where Name in (topPods) | |
| | summarize MaxRestarts = max(RestartCount) by bin(TimeGenerated, 15m), Name | |
| | order by TimeGenerated asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "FrameControlChart", isOptional = true }, | |
| { name = "SpecificChart", value = "Line", isOptional = true }, | |
| { name = "PartTitle", value = "Top Restarting Pods", isOptional = true }, | |
| { name = "PartSubTitle", value = "immuta - Top 10", isOptional = true }, | |
| # Axis mapping must use the column names produced by the query above (TimeGenerated, MaxRestarts, Name). | |
| { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "MaxRestarts", type = "long" }], splitBy = [{ name = "Name", type = "string" }], aggregation = "Max" }, isOptional = true }, | |
| { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true }, | |
| { name = "IsQueryContainTimeRange", value = false, isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Application Events | |
| # Stacked-column chart of Kubernetes events in the 'immuta' namespace, | |
| # counted per 30-minute bin and split by event severity. | |
| # Severity is read from KubeEventType. The generic `Type` column of a Log | |
| # Analytics table holds the table name, so filtering on it never matches | |
| # 'Warning' or 'Error'. | |
| "28" = { | |
| position = { x = 8, y = 60, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_events", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubeEvents | |
| | where Namespace == 'immuta' | |
| | where KubeEventType in ('Warning', 'Error') | |
| | summarize EventCount = count() by bin(TimeGenerated, 30m), KubeEventType | |
| | order by TimeGenerated asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "FrameControlChart", isOptional = true }, | |
| { name = "SpecificChart", value = "StackedColumn", isOptional = true }, | |
| { name = "PartTitle", value = "Application Events", isOptional = true }, | |
| { name = "PartSubTitle", value = "Warnings & Errors", isOptional = true }, | |
| # Axis mapping must use the column names produced by the query above (TimeGenerated, EventCount, KubeEventType). | |
| { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "EventCount", type = "long" }], splitBy = [{ name = "KubeEventType", type = "string" }], aggregation = "Sum" }, isOptional = true }, | |
| { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true }, | |
| { name = "IsQueryContainTimeRange", value = false, isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Top CPU Consuming Pods | |
| # Line chart of average container CPU (millicores, 15-minute bins) for the | |
| # 10 pods in the 'immuta' namespace with the highest average CPU over the | |
| # selected range. | |
| # - podNames maps PodUid -> pod Name from the inventory. | |
| # - podCpu joins the K8SContainer cpuUsageNanoCores samples to those pods. | |
| #   The pod UID is taken as the second-to-last '/'-separated segment of | |
| #   InstanceName, independent of how many segments precede it. | |
| # - topPods ranks pods by their average CPU; only those are charted. | |
| # NOTE(review): assumes Perf.InstanceName ends with "<PodUid>/<ContainerName>" - confirm against a sample row. | |
| "29" = { | |
| position = { x = 0, y = 65, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_cpu", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| let podNames = KubePodInventory | |
| | where Namespace == 'immuta' | |
| | distinct PodUid, Name; | |
| let podCpu = Perf | |
| | where ObjectName == 'K8SContainer' and CounterName == 'cpuUsageNanoCores' | |
| | where isnotnull(CounterValue) and CounterValue > 0 | |
| | extend InstanceParts = split(InstanceName, '/') | |
| | extend PodUid = tostring(InstanceParts[array_length(InstanceParts) - 2]) | |
| | where isnotempty(PodUid) | |
| | join kind=inner (podNames) on PodUid | |
| | extend CPUMillicores = CounterValue / 1000000; | |
| let topPods = podCpu | |
| | summarize PodAvg = avg(CPUMillicores) by Name | |
| | top 10 by PodAvg desc | |
| | project Name; | |
| podCpu | |
| | where Name in (topPods) | |
| | summarize avg_cpu = avg(CPUMillicores) by bin(TimeGenerated, 15m), Name | |
| | order by TimeGenerated asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "FrameControlChart", isOptional = true }, | |
| { name = "SpecificChart", value = "Line", isOptional = true }, | |
| { name = "PartTitle", value = "Top CPU Consuming Pods", isOptional = true }, | |
| { name = "PartSubTitle", value = "immuta - millicores", isOptional = true }, | |
| # Axis mapping must use the column names produced by the query above (TimeGenerated, avg_cpu, Name). | |
| { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "avg_cpu", type = "real" }], splitBy = [{ name = "Name", type = "string" }], aggregation = "Average" }, isOptional = true }, | |
| { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true }, | |
| { name = "IsQueryContainTimeRange", value = false, isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Top Memory Consuming Pods | |
| # Line chart of average container working-set memory (MB, 15-minute bins) | |
| # for the 10 pods in the 'immuta' namespace with the highest average memory | |
| # over the selected range. | |
| # - podNames maps PodUid -> pod Name from the inventory. | |
| # - podMem joins the K8SContainer memoryWorkingSetBytes samples to those | |
| #   pods. The pod UID is taken as the second-to-last '/'-separated segment | |
| #   of InstanceName, independent of how many segments precede it. | |
| # - topPods ranks pods by their average memory; only those are charted. | |
| # NOTE(review): assumes Perf.InstanceName ends with "<PodUid>/<ContainerName>" - confirm against a sample row. | |
| "30" = { | |
| position = { x = 8, y = 65, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_memory", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| let podNames = KubePodInventory | |
| | where Namespace == 'immuta' | |
| | distinct PodUid, Name; | |
| let podMem = Perf | |
| | where ObjectName == 'K8SContainer' and CounterName == 'memoryWorkingSetBytes' | |
| | where isnotnull(CounterValue) and CounterValue > 0 | |
| | extend InstanceParts = split(InstanceName, '/') | |
| | extend PodUid = tostring(InstanceParts[array_length(InstanceParts) - 2]) | |
| | where isnotempty(PodUid) | |
| | join kind=inner (podNames) on PodUid | |
| | extend MemoryMB = CounterValue / 1048576; | |
| let topPods = podMem | |
| | summarize PodAvg = avg(MemoryMB) by Name | |
| | top 10 by PodAvg desc | |
| | project Name; | |
| podMem | |
| | where Name in (topPods) | |
| | summarize avg_mem = avg(MemoryMB) by bin(TimeGenerated, 15m), Name | |
| | order by TimeGenerated asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "FrameControlChart", isOptional = true }, | |
| { name = "SpecificChart", value = "Line", isOptional = true }, | |
| { name = "PartTitle", value = "Top Memory Consuming Pods", isOptional = true }, | |
| { name = "PartSubTitle", value = "immuta - MB", isOptional = true }, | |
| # Axis mapping must use the column names produced by the query above (TimeGenerated, avg_mem, Name). | |
| { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "avg_mem", type = "real" }], splitBy = [{ name = "Name", type = "string" }], aggregation = "Average" }, isOptional = true }, | |
| { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true }, | |
| { name = "IsQueryContainTimeRange", value = false, isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Failed/Pending Pods | |
| # Grid of up to 20 pods in the 'immuta' and 'nginx-ingress' namespaces | |
| # whose latest inventory record in the selected range is Failed, Pending or | |
| # Unknown, with the pod age in hours. | |
| # The inventory is collapsed to the latest record per pod before the status | |
| # filter, so each pod appears once and pods that have since reached Running | |
| # drop out of the list. | |
| "31" = { | |
| position = { x = 0, y = 70, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_failed", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace in ('immuta', 'nginx-ingress') | |
| | summarize arg_max(TimeGenerated, PodStatus, PodCreationTimeStamp, Computer) by Name, Namespace | |
| | where PodStatus in ('Failed', 'Pending', 'Unknown') | |
| | where isnotempty(PodCreationTimeStamp) | |
| | extend PodAge = now() - PodCreationTimeStamp | |
| | project | |
| Time = format_datetime(TimeGenerated, 'yyyy-MM-dd HH:mm'), | |
| Pod = Name, | |
| Namespace, | |
| Status = PodStatus, | |
| AgeHours = round(PodAge / 1h, 1), | |
| Node = Computer | |
| | order by Time desc | |
| | take 20 | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| { name = "PartTitle", value = "Failed/Pending Pods", isOptional = true }, | |
| { name = "PartSubTitle", value = "Scheduling issues", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Nginx Ingress Health | |
| # Line chart for the 'nginx-ingress' namespace in 15-minute bins. | |
| # The query produces three columns per bin: distinct pods reporting | |
| # 'Running' (RunningPods), distinct pods overall (TotalPods) and the highest | |
| # container restart counter seen (MaxRestarts). Only RunningPods and | |
| # MaxRestarts are mapped to the y-axis; TotalPods is computed but not charted. | |
| "32" = { | |
| position = { x = 8, y = 70, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_ingress", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace == 'nginx-ingress' | |
| | extend RestartCount = toint(ContainerRestartCount) | |
| | summarize | |
| RunningPods = dcountif(Name, PodStatus == 'Running'), | |
| TotalPods = dcount(Name), | |
| MaxRestarts = max(RestartCount) | |
| by bin(TimeGenerated, 15m) | |
| | order by TimeGenerated asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "FrameControlChart", isOptional = true }, | |
| { name = "SpecificChart", value = "Line", isOptional = true }, | |
| { name = "PartTitle", value = "Nginx Ingress Health", isOptional = true }, | |
| { name = "PartSubTitle", value = "Pod count & restarts", isOptional = true }, | |
| # Two y-axis series share the chart: RunningPods and MaxRestarts. | |
| { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "RunningPods", type = "long" }, { name = "MaxRestarts", type = "long" }], aggregation = "Average" }, isOptional = true }, | |
| { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true }, | |
| { name = "IsQueryContainTimeRange", value = false, isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Image Versions & Deployment Status | |
| # Full-width grid: per controller and image version in the 'immuta' | |
| # namespace, the number of pods that are Running / Pending / Failed in the | |
| # last 10 minutes and a derived health label. | |
| # - The tag is whatever follows the last ':' as long as no '/' comes after | |
| #   it, so a registry port (registry:5000/app) is not mistaken for a tag. | |
| #   Images without a tag are shown as 'latest'. | |
| # - Status is Healthy when every pod is Running, Degraded when at least one | |
| #   is, Failed otherwise. The labels start with emoji; keep the file UTF-8. | |
| # NOTE(review): confirm KubePodInventory exposes a ContainerImage column in this workspace; if it does not, the image fields have to be joined in from ContainerInventory. | |
| "33" = { | |
| position = { x = 0, y = 75, rowSpan = 6, colSpan = 16 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_versions", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace == 'immuta' | |
| | where TimeGenerated > ago(10m) | |
| | where isnotempty(ContainerImage) | |
| | extend ImageTag = extract(@':([^/:]+)$', 1, ContainerImage) | |
| | extend ImageName = iff(isempty(ImageTag), ContainerImage, substring(ContainerImage, 0, strlen(ContainerImage) - strlen(ImageTag) - 1)) | |
| | extend ImageTag = iff(isempty(ImageTag), 'latest', ImageTag) | |
| | summarize | |
| TotalPods = dcount(Name), | |
| RunningPods = dcountif(Name, PodStatus == 'Running'), | |
| PendingPods = dcountif(Name, PodStatus == 'Pending'), | |
| FailedPods = dcountif(Name, PodStatus == 'Failed') | |
| by ControllerName, ImageName, ImageTag | |
| | extend Status = case( | |
| RunningPods == TotalPods, '✅ Healthy', | |
| RunningPods > 0, '⚠️ Degraded', | |
| '❌ Failed' | |
| ) | |
| | project | |
| Controller = ControllerName, | |
| Image = ImageName, | |
| Version = ImageTag, | |
| Total = TotalPods, | |
| Running = RunningPods, | |
| Pending = PendingPods, | |
| Failed = FailedPods, | |
| Status | |
| | order by Controller asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| { name = "PartTitle", value = "Image Versions & Deployment Status", isOptional = true }, | |
| { name = "PartSubTitle", value = "immuta - Version tracking", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Container Readiness | |
| # Stacked-area chart of containers in the 'immuta' namespace per 15-minute | |
| # bin, split into 'Ready' (pod reports Running) and 'NotReady' (anything | |
| # else). | |
| # Containers are counted with dcount(ContainerName) rather than count(): | |
| # the inventory emits a record per container on every collection cycle, so | |
| # a plain row count would scale with the number of cycles in the bin. | |
| # NOTE(review): readiness is approximated from PodStatus, not from a readiness probe result. | |
| "34" = { | |
| position = { x = 0, y = 81, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_readiness", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace == 'immuta' | |
| | extend ReadyStatus = iff(PodStatus == 'Running', 'Ready', 'NotReady') | |
| | summarize ContainerCount = dcount(ContainerName) by bin(TimeGenerated, 15m), ReadyStatus | |
| | order by TimeGenerated asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "FrameControlChart", isOptional = true }, | |
| { name = "SpecificChart", value = "StackedArea", isOptional = true }, | |
| { name = "PartTitle", value = "Container Readiness", isOptional = true }, | |
| { name = "PartSubTitle", value = "Ready to serve traffic", isOptional = true }, | |
| # Axis mapping must use the column names produced by the query above (TimeGenerated, ContainerCount, ReadyStatus). | |
| { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "ContainerCount", type = "long" }], splitBy = [{ name = "ReadyStatus", type = "string" }], aggregation = "Sum" }, isOptional = true }, | |
| { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true }, | |
| { name = "IsQueryContainTimeRange", value = false, isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # Pod Distribution | |
| # Stacked-column chart for the 'immuta' and 'nginx-ingress' namespaces: | |
| # distinct Running pod names per node (Computer column) in 15-minute bins. | |
| "35" = { | |
| position = { x = 8, y = 81, rowSpan = 5, colSpan = 8 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "app_distribution", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| KubePodInventory | |
| | where Namespace in ('immuta', 'nginx-ingress') | |
| | where PodStatus == 'Running' | |
| | summarize PodCount = dcount(Name) by bin(TimeGenerated, 15m), Computer | |
| | order by TimeGenerated asc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "FrameControlChart", isOptional = true }, | |
| { name = "SpecificChart", value = "StackedColumn", isOptional = true }, | |
| { name = "PartTitle", value = "Pod Distribution by Node", isOptional = true }, | |
| { name = "PartSubTitle", value = "Running pods across nodes", isOptional = true }, | |
| # Axis mapping must use the column names produced by the query above (TimeGenerated, PodCount, Computer). | |
| { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "PodCount", type = "long" }], splitBy = [{ name = "Computer", type = "string" }], aggregation = "Average" }, isOptional = true }, | |
| { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true }, | |
| { name = "IsQueryContainTimeRange", value = false, isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # ============================================================ | |
| # POD VULNERABILITY ASSESSMENT (Items 36-37) | |
| # ============================================================ | |
| # Full-width (colSpan = 16) Markdown banner that opens the vulnerability | |
| # section. It takes no inputs and only renders the heading and subtitle. | |
| "36" = { | |
| position = { x = 0, y = 86, rowSpan = 1, colSpan = 16 } | |
| metadata = { | |
| inputs = [] | |
| type = "Extension/HubsExtension/PartType/MarkdownPart" | |
| settings = { | |
| content = { | |
| settings = { | |
| # The heading starts with an emoji; keep this file saved as UTF-8. | |
| content = "## 🔒 Pod Vulnerability Assessment" | |
| subtitle = "Security vulnerabilities and CVE tracking" | |
| } | |
| } | |
| } | |
| } | |
| } | |
| # Pod Vulnerability Summary | |
| # Full-width grid summarising unhealthy vulnerability sub-assessments whose | |
| # resource id contains var.aks_cluster_name: totals per severity, distinct | |
| # CVEs and distinct affected namespaces, grouped by environment and cluster. | |
| # - The Terraform variable is interpolated into the query text at plan time. | |
| # - The let-bound filter value is named targetCluster so it cannot be | |
| #   confused with the clusterName column that the query extends later. | |
| # - environment is derived from the substrings "q01" / "p01" in the id and | |
| #   the displayed cluster name from the pattern asb-aks-*; both are naming | |
| #   conventions hard-coded in the query. | |
| # NOTE(review): securityresources is an Azure Resource Graph table; confirm this Logs part can resolve it in the cluster scope (it may need the arg("") cross-service form or a Resource Graph tile). | |
| "37" = { | |
| position = { x = 0, y = 87, rowSpan = 6, colSpan = 16 } | |
| metadata = { | |
| inputs = [ | |
| { name = "resourceTypeMode", isOptional = true }, | |
| { name = "ComponentId", isOptional = true }, | |
| # Query scope: the AKS cluster resolved by the data source declared elsewhere in this module. | |
| { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true }, | |
| { name = "PartId", value = "vulm001", isOptional = true }, | |
| { name = "Version", value = "2.0", isOptional = true }, | |
| { name = "TimeRange", isOptional = true }, | |
| { name = "DashboardId", isOptional = true }, | |
| { | |
| name = "Query" | |
| value = <<-QUERY | |
| // AKS cluster vulnerability assessment | |
| let targetCluster = "${var.aks_cluster_name}"; | |
| securityresources | |
| | where type == "microsoft.security/assessments/subassessments" | |
| | where id contains targetCluster | |
| | extend assessmentKey = extract(@"(?i)providers/Microsoft.Security/assessments/([^/]+)", 1, id) | |
| | where assessmentKey == "c609cf0f-71ab-41e9-a3c6-9a1f7fe1b8d5" | |
| | where properties.status.code == "Unhealthy" | |
| | extend | |
| clusterName = extract(@"asb-aks-[^/]+", 0, id), | |
| environment = case( | |
| id contains "q01", "QA", | |
| id contains "p01", "PROD", | |
| "unknown" | |
| ), | |
| cveId = tostring(properties.additionalData.vulnerabilityDetails.cveId), | |
| severity = tostring(properties.additionalData.vulnerabilityDetails.severity), | |
| namespace = tostring(properties.additionalData.kubernetesContext.workloads[0].namespace) | |
| | summarize | |
| TotalVulnerabilities = count(), | |
| CriticalCount = countif(severity == "Critical"), | |
| HighCount = countif(severity == "High"), | |
| MediumCount = countif(severity == "Medium"), | |
| LowCount = countif(severity == "Low"), | |
| UniqueCVEs = dcount(cveId), | |
| AffectedNamespaces = dcount(namespace) | |
| by environment, clusterName | |
| | project | |
| Environment = environment, | |
| Cluster = clusterName, | |
| TotalVulnerabilities, | |
| CriticalCount, | |
| HighCount, | |
| MediumCount, | |
| LowCount, | |
| UniqueCVEs, | |
| AffectedNamespaces | |
| | order by TotalVulnerabilities desc | |
| QUERY | |
| isOptional = true | |
| }, | |
| { name = "ControlType", value = "AnalyticsGrid", isOptional = true }, | |
| { name = "PartTitle", value = "Pod Vulnerability Summary", isOptional = true }, | |
| { name = "PartSubTitle", value = "CVE tracking by severity", isOptional = true } | |
| ] | |
| type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart" | |
| } | |
| } | |
| # ============================================================ | |
| # SRE QUICK REFERENCE (Items 38-40) | |
| # ============================================================ | |
| # Full-width (colSpan = 16) Markdown banner that opens the reference | |
| # section. It takes no inputs and only renders the heading and subtitle. | |
| "38" = { | |
| position = { x = 0, y = 93, rowSpan = 1, colSpan = 16 } | |
| metadata = { | |
| inputs = [] | |
| type = "Extension/HubsExtension/PartType/MarkdownPart" | |
| settings = { | |
| content = { | |
| settings = { | |
| # The heading starts with an emoji; keep this file saved as UTF-8. | |
| content = "## 📚 SRE Quick Reference & Runbooks" | |
| subtitle = "Essential links and troubleshooting guides" | |
| } | |
| } | |
| } | |
| } | |
| } | |
| # Runbook Links | |
| # Static Markdown tile (left half of the row at y = 94) with links to the | |
| # team runbooks and to portal tools. | |
| # The heredoc body is rendered as Markdown; the headings start with emoji, | |
| # so keep this file saved as UTF-8. | |
| # NOTE(review): the https://your-wiki.com/... targets look like placeholders and both "Quick Tools" links point at the portal root - replace with the real URLs. | |
| "39" = { | |
| position = { x = 0, y = 94, rowSpan = 4, colSpan = 8 } | |
| metadata = { | |
| inputs = [] | |
| type = "Extension/HubsExtension/PartType/MarkdownPart" | |
| settings = { | |
| content = { | |
| settings = { | |
| content = <<-MARKDOWN | |
| ### 📖 Runbook Links | |
| - **[Node NotReady Troubleshooting](https://your-wiki.com/runbooks/node-notready)** | |
| - **[Pod CrashLoopBackOff Guide](https://your-wiki.com/runbooks/pod-crashloop)** | |
| - **[High CPU Investigation](https://your-wiki.com/runbooks/high-cpu)** | |
| - **[Memory Pressure Response](https://your-wiki.com/runbooks/memory-pressure)** | |
| - **[Control Plane Issues](https://your-wiki.com/runbooks/control-plane)** | |
| ### 🔍 Quick Tools | |
| - **[Log Analytics](https://portal.azure.com)** - Advanced KQL queries | |
| - **[AKS Diagnostics](https://portal.azure.com)** - Built-in troubleshooting | |
| MARKDOWN | |
| } | |
| } | |
| } | |
| } | |
| } | |
| # Dashboard Best Practices | |
| # Static Markdown tile (right half of the row at y = 94) describing how to | |
| # read the dashboard: time display, suggested time ranges and the alert | |
| # thresholds quoted to operators. | |
| # Each bold section label is separated from the preceding list by a blank | |
| # line; without it Markdown treats the label as a continuation of the last | |
| # bullet. The text contains emoji, so keep this file saved as UTF-8. | |
| "40" = { | |
| position = { x = 8, y = 94, rowSpan = 4, colSpan = 8 } | |
| metadata = { | |
| inputs = [] | |
| type = "Extension/HubsExtension/PartType/MarkdownPart" | |
| settings = { | |
| content = { | |
| settings = { | |
| content = <<-MARKDOWN | |
| ### 📊 Dashboard Best Practices | |
| | |
| **Time Display:** | |
| - ⏰ All times shown in **Local timezone** | |
| - Auto-adjusts for daylight savings | |
| | |
| **Time Range Selection:** | |
| - 🔴 **Incident Response:** 1h - 6h | |
| - 🟡 **Trend Analysis:** 24h (default) | |
| - 🟢 **Capacity Planning:** 7d - 30d | |
| | |
| **Alert Thresholds:** | |
| - Node CPU: >80% = Warning, >90% = Critical | |
| - Node Memory: >85% = Warning, >95% = Critical | |
| - Pod Restarts: >5 in 1h = Investigate | |
| MARKDOWN | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| } | |
| # Dashboard-level metadata: the default time range and the time filter | |
| # state the dashboard opens with. | |
| metadata = { | |
| model = { | |
| # Default relative time range. | |
| # NOTE(review): duration = 24 with timeUnit = 1 presumably means 24 hours, matching relative = "24h" below - confirm against the portal dashboard schema. | |
| timeRange = { | |
| value = { | |
| relative = { | |
| duration = 24 | |
| timeUnit = 1 | |
| } | |
| } | |
| type = "MsPortalFx.Composition.Configuration.ValueTypes.TimeRange" | |
| } | |
| filterLocale = { | |
| value = "en-us" | |
| } | |
| # Time-range filter: local-time format, automatic granularity, past 24 hours. | |
| # displayCache holds the labels shown for that selection; filteredPartIds is left empty. | |
| filters = { | |
| value = { | |
| MsPortalFx_TimeRange = { | |
| model = { | |
| format = "local" | |
| granularity = "auto" | |
| relative = "24h" | |
| } | |
| displayCache = { | |
| name = "Local Time" | |
| value = "Past 24 hours" | |
| } | |
| filteredPartIds = [] | |
| } | |
| } | |
| } | |
| } | |
| } | |
| }) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment