Skip to content

Instantly share code, notes, and snippets.

@davidlu1001
Last active November 4, 2025 02:52
Show Gist options
  • Select an option

  • Save davidlu1001/f4afc8e32dda13d86a235f977a77c151 to your computer and use it in GitHub Desktop.

Select an option

Save davidlu1001/f4afc8e32dda13d86a235f977a77c151 to your computer and use it in GitHub Desktop.
dashboard.tf
# dashboard.tf
# Azure Monitor Dashboard for AKS Platform & Application Overview
resource "azurerm_portal_dashboard" "aks_monitoring" {
count = var.enable_dashboard ? 1 : 0
name = "${local.base_suffix}-monitoring-dashboard"
resource_group_name = var.resource_group_name
location = var.location
tags = merge(local.alert_tags, { dashboard_version = "1.2", managed_by = "terraform" })
dashboard_properties = jsonencode({
lenses = {
"0" = {
order = 0
parts = {
# ============================================================
# NODE HEALTH SECTION (Items 0-4)
# ============================================================
# Full-width (16-column) Markdown banner that titles the node health section.
"0" = {
  position = { x = 0, y = 0, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 bytes and rendered as
          # garbage characters; it is restored to the intended glyph here.
          content  = "## 🖥️ Node Health & Resource Utilization"
          subtitle = "Monitor cluster node performance and availability"
        }
      }
    }
  }
}
# Node CPU Usage
# Line chart of per-node CPU utilisation as a percentage of that node's CPU capacity.
"1" = {
  position = { x = 0, y = 1, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "cpu001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name = "Query"
        # cpuUsageNanoCores divided by a fixed 10^7 is "percent of ONE core", so any
        # multi-core node could plot well above 100 on a chart titled "%". The usage is
        # instead divided by the node's own cpuCapacityNanoCores so the value is a true
        # 0-100 percentage regardless of node size.
        value = <<-QUERY
        let nodeCapacity = Perf
        | where ObjectName == 'K8SNode' and CounterName == 'cpuCapacityNanoCores'
        | summarize CapacityNanoCores = max(CounterValue) by Computer;
        Perf
        | where ObjectName == 'K8SNode' and CounterName == 'cpuUsageNanoCores'
        | where isnotnull(CounterValue) and CounterValue > 0
        | join kind=inner (nodeCapacity) on Computer
        | where CapacityNanoCores > 0
        | extend CPUPercent = CounterValue * 100.0 / CapacityNanoCores
        | summarize
        avg_cpu = avg(CPUPercent),
        p95_cpu = percentile(CPUPercent, 95)
        by bin(TimeGenerated, 5m), Computer
        | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Node CPU Usage (%)", isOptional = true },
      { name = "PartSubTitle", value = "Avg & P95 by Node", isOptional = true },
      {
        name = "Dimensions"
        # Only avg_cpu is plotted; p95_cpu stays available in the query result.
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "avg_cpu", type = "real" }]
          splitBy     = [{ name = "Computer", type = "string" }]
          aggregation = "Average"
        }
        isOptional = true
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Node Memory Usage
# Line chart of node working-set memory (GB), one series per node.
"2" = {
  position = {
    x       = 8
    y       = 1
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope (the AKS cluster looked up by the data source).
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "mem001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: bytes are divided by 1073741824 (2^30) to get GB, then averaged and
      # maxed per node in 5-minute bins.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        Perf
        | where ObjectName == 'K8SNode' and CounterName == 'memoryWorkingSetBytes'
        | where isnotnull(CounterValue) and CounterValue > 0
        | extend MemoryGB = CounterValue / 1073741824
        | summarize
        avg_mem = avg(MemoryGB),
        max_mem = max(MemoryGB)
        by bin(TimeGenerated, 5m), Computer
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: avg_mem over time, split by Computer.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Node Memory Usage (GB)", isOptional = true },
      { name = "PartSubTitle", value = "Working Set - Avg & Max", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "avg_mem", type = "real" }]
          splitBy     = [{ name = "Computer", type = "string" }]
          aggregation = "Average"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# Node Disk Usage
# Platform-metric chart of node_disk_usage_percentage for the AKS cluster.
"3" = {
  position = {
    x       = 0
    y       = 6
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/HubsExtension/PartType/MonitorChartPart"
    inputs = [
      # Chart definition passed as the part's "options" input.
      {
        name       = "options"
        isOptional = true
        value = {
          title = "Node Disk Usage (%)"
          chart = {
            title         = "Node Disk Usage (%)"
            titleKind     = 2
            visualization = { chartType = 2 }
            metrics = [
              {
                name                = "node_disk_usage_percentage"
                namespace           = "Microsoft.ContainerService/managedClusters"
                aggregationType     = 4
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                metricVisualization = { displayName = "Disk Used Percentage" }
              }
            ]
          }
        }
      }
    ]
    # The same chart repeated under settings, with a relative timespan of
    # 86400000 ms (24 hours) added.
    settings = {
      content = {
        options = {
          chart = {
            title         = "Node Disk Usage (%)"
            titleKind     = 2
            visualization = { chartType = 2 }
            timespan      = { relative = { duration = 86400000 } }
            metrics = [
              {
                name                = "node_disk_usage_percentage"
                namespace           = "Microsoft.ContainerService/managedClusters"
                aggregationType     = 4
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                metricVisualization = { displayName = "Disk Used Percentage" }
              }
            ]
          }
        }
      }
    }
  }
}
# Node Health Issues
# Stacked columns counting distinct nodes whose Status is anything other than 'Ready'.
"4" = {
  position = {
    x       = 8
    y       = 6
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "health001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: distinct non-Ready nodes per 5-minute bin, grouped by Status.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubeNodeInventory
        | where Status != 'Ready'
        | summarize NodeCount = dcount(Computer) by bin(TimeGenerated, 5m), Status
        | where NodeCount > 0
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: NodeCount over time, stacked by Status.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Node Health Issues", isOptional = true },
      { name = "PartSubTitle", value = "Non-Ready Nodes", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "NodeCount", type = "long" }]
          splitBy     = [{ name = "Status", type = "string" }]
          aggregation = "Sum"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# ============================================================
# POD HEALTH SECTION (Items 5-9)
# ============================================================
# Full-width Markdown banner introducing the pod health charts.
"5" = {
  position = {
    x       = 0
    y       = 11
    colSpan = 16
    rowSpan = 1
  }
  metadata = {
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    inputs = []
    settings = {
      content = {
        settings = {
          subtitle = "Track pod lifecycle, restarts, and resource consumption"
          content  = "## 🐳 Pod Health & Container Performance"
        }
      }
    }
  }
}
# Pod Restart Rate
# Line chart of the restart counter over time for the pods with the most restarts.
"6" = {
  position = { x = 0, y = 12, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "restart001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name = "Query"
        # The previous query sorted the binned rows by MaxRestarts and applied
        # "take 50", which keeps 50 arbitrary data points rather than the top pods
        # and leaves the line chart with fragmentary series. The top 10 pods by peak
        # restart count are selected first, and their complete time series is charted.
        value = <<-QUERY
        let restarts = KubePodInventory
        | where isnotempty(ContainerRestartCount)
        | extend RestartCount = toint(ContainerRestartCount)
        | where RestartCount > 0;
        let topPods = restarts
        | summarize PeakRestarts = max(RestartCount) by Name
        | top 10 by PeakRestarts desc
        | project Name;
        restarts
        | where Name in (topPods)
        | summarize MaxRestarts = max(RestartCount) by bin(TimeGenerated, 5m), Name
        | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Pod Restart Count", isOptional = true },
      { name = "PartSubTitle", value = "Top Pods with Restarts", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "MaxRestarts", type = "long" }], splitBy = [{ name = "Name", type = "string" }], aggregation = "Max" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Pod Count by Namespace
# Stacked area of distinct pod names per namespace, excluding the kube-* namespaces.
"7" = {
  position = {
    x       = 8
    y       = 12
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "podns001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: distinct pods per namespace in 5-minute bins.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where Namespace !in ('kube-system', 'kube-public', 'kube-node-lease')
        | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), Namespace
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: PodCount over time, stacked by Namespace.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Pod Count by Namespace", isOptional = true },
      { name = "PartSubTitle", value = "Application namespaces", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "Namespace", type = "string" }]
          aggregation = "Sum"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# Pod Memory by Namespace
# Stacked area of average container working-set memory (GB) per application namespace.
"8" = {
  position = { x = 0, y = 17, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "podmem001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name = "Query"
        # Two fixes relative to the previous query:
        #  1. For K8SContainer counters, InstanceName is
        #     "<cluster resource id>/<pod uid>/<container name>". Index [3] of the
        #     '/'-split is the literal "resourceGroups" segment of the cluster id, so
        #     it never matched a pod. The pod UID is the second-to-last segment.
        #  2. The join also required identical TimeGenerated values in Perf and
        #     KubePodInventory, which are written independently and rarely coincide.
        #     A distinct PodUid -> Namespace lookup is joined on PodUid only.
        value = <<-QUERY
        let podNamespaces = KubePodInventory
        | where isnotempty(Namespace) and Namespace !in ('kube-system', 'kube-public', 'kube-node-lease')
        | distinct PodUid, Namespace;
        Perf
        | where ObjectName == 'K8SContainer' and CounterName == 'memoryWorkingSetBytes'
        | where isnotnull(CounterValue) and CounterValue > 0
        | extend InstanceParts = split(InstanceName, '/')
        | extend PodUidIndex = array_length(InstanceParts) - 2
        | extend PodUid = tostring(InstanceParts[PodUidIndex])
        | join kind=inner (podNamespaces) on PodUid
        | extend MemoryGB = CounterValue / 1073741824
        | summarize avg_mem = avg(MemoryGB) by bin(TimeGenerated, 5m), Namespace
        | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Pod Memory by Namespace (GB)", isOptional = true },
      { name = "PartSubTitle", value = "Average memory usage", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "avg_mem", type = "real" }], splitBy = [{ name = "Namespace", type = "string" }], aggregation = "Average" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Unhealthy Pods
# Stacked columns of distinct pods whose PodStatus is neither 'Running' nor 'Succeeded'.
"9" = {
  position = {
    x       = 8
    y       = 17
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "unhealthy001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: distinct unhealthy pods per 5-minute bin, grouped by PodStatus.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where PodStatus !in ('Running', 'Succeeded')
        | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), PodStatus
        | where PodCount > 0
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: PodCount over time, stacked by PodStatus.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Unhealthy Pods", isOptional = true },
      { name = "PartSubTitle", value = "Non-Running/Succeeded Status", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "PodStatus", type = "string" }]
          aggregation = "Sum"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# ============================================================
# CONTROL PLANE HEALTH (Items 10-13)
# ============================================================
# Full-width (16-column) Markdown banner that titles the control plane section.
"10" = {
  position = { x = 0, y = 22, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 bytes and rendered as
          # garbage characters; it is restored to the intended gear glyph here.
          content  = "## ⚙️ Control Plane & Platform Health"
          subtitle = "Monitor control plane pods and system components"
        }
      }
    }
  }
}
# Control Plane Pod Status
# Stacked area of distinct kube-system pods, grouped by PodStatus.
"11" = {
  position = {
    x       = 0
    y       = 23
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "cp001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: distinct kube-system pods per status in 5-minute bins.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where Namespace == 'kube-system'
        | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), PodStatus
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: PodCount over time, stacked by PodStatus.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Control Plane Pod Status", isOptional = true },
      { name = "PartSubTitle", value = "kube-system namespace", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "PodStatus", type = "string" }]
          aggregation = "Sum"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# Control Plane Restarts
# Line chart of the highest restart count seen per kube-system pod in each 5-minute bin.
"12" = {
  position = {
    x       = 8
    y       = 23
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "cprestart001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: only rows with a non-empty, positive restart count are kept.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where Namespace == 'kube-system'
        | where isnotempty(ContainerRestartCount)
        | extend RestartCount = toint(ContainerRestartCount)
        | where RestartCount > 0
        | summarize MaxRestarts = max(RestartCount) by bin(TimeGenerated, 5m), Name
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: MaxRestarts over time, one series per pod Name.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Control Plane Pod Restarts", isOptional = true },
      { name = "PartSubTitle", value = "System pod restart count", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "MaxRestarts", type = "long" }]
          splitBy     = [{ name = "Name", type = "string" }]
          aggregation = "Max"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# Control Plane Errors
# Full-width stacked columns of warning/error Kubernetes events for kube-system,
# Node and ComponentStatus objects, grouped by a derived Severity.
"13" = {
  position = { x = 0, y = 28, rowSpan = 5, colSpan = 16 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "cperror001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name = "Query"
        # In Log Analytics, "Type" is the standard column holding the table name
        # ("KubeEvents"), so comparing it to 'Warning'/'Error' never matched and only
        # events whose Reason contained "error" were charted. The Kubernetes event
        # type lives in KubeEventType, which is used here instead.
        # The previous second summarize only re-summed counts that had been split by
        # Reason; grouping by Severity directly yields the same totals.
        value = <<-QUERY
        KubeEvents
        | where Namespace == 'kube-system' or ObjectKind in ('Node', 'ComponentStatus')
        | where KubeEventType in ('Warning', 'Error') or (Reason has 'fail' or Reason has 'error')
        | extend Severity = case(
        KubeEventType == 'Error' or Reason has 'fatal', 'Critical',
        KubeEventType == 'Warning' or Reason has 'error', 'Warning',
        'Info'
        )
        | where Severity != 'Info'
        | summarize ErrorCount = count() by bin(TimeGenerated, 5m), Severity
        | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Control Plane Error Events", isOptional = true },
      { name = "PartSubTitle", value = "Warning and Error events", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "ErrorCount", type = "long" }], splitBy = [{ name = "Severity", type = "string" }], aggregation = "Sum" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# ============================================================
# NETWORK & INGRESS (Items 14-16)
# ============================================================
# Full-width Markdown banner introducing the network and ingress charts.
"14" = {
  position = {
    x       = 0
    y       = 33
    colSpan = 16
    rowSpan = 1
  }
  metadata = {
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    inputs = []
    settings = {
      content = {
        settings = {
          subtitle = "Monitor network throughput and connectivity"
          content  = "## 🌐 Network & Ingress Metrics"
        }
      }
    }
  }
}
# Network Throughput
# Platform-metric chart plotting node_network_in_bytes and node_network_out_bytes
# for the AKS cluster.
"15" = {
  position = {
    x       = 0
    y       = 34
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/HubsExtension/PartType/MonitorChartPart"
    inputs = [
      # Chart definition passed as the part's "options" input.
      {
        name       = "options"
        isOptional = true
        value = {
          title = "Network Throughput"
          chart = {
            title         = "Network Throughput (Bytes)"
            titleKind     = 2
            visualization = { chartType = 2 }
            metrics = [
              {
                name                = "node_network_in_bytes"
                namespace           = "Microsoft.ContainerService/managedClusters"
                aggregationType     = 1
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                metricVisualization = { displayName = "Network In" }
              },
              {
                name                = "node_network_out_bytes"
                namespace           = "Microsoft.ContainerService/managedClusters"
                aggregationType     = 1
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                metricVisualization = { displayName = "Network Out" }
              }
            ]
          }
        }
      }
    ]
    # The chart repeated under settings, with "(Bytes)" display names and a
    # relative timespan of 86400000 ms (24 hours).
    settings = {
      content = {
        options = {
          chart = {
            title         = "Network Throughput (Bytes)"
            titleKind     = 2
            visualization = { chartType = 2 }
            timespan      = { relative = { duration = 86400000 } }
            metrics = [
              {
                name                = "node_network_in_bytes"
                namespace           = "Microsoft.ContainerService/managedClusters"
                aggregationType     = 1
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                metricVisualization = { displayName = "Network In (Bytes)" }
              },
              {
                name                = "node_network_out_bytes"
                namespace           = "Microsoft.ContainerService/managedClusters"
                aggregationType     = 1
                resourceMetadata    = { id = data.azurerm_kubernetes_cluster.aks.id }
                metricVisualization = { displayName = "Network Out (Bytes)" }
              }
            ]
          }
        }
      }
    }
  }
}
# Ingress Activity
# Line chart of running pods whose ControllerName mentions ingress, nginx, traefik or istio.
"16" = {
  position = {
    x       = 8
    y       = 34
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "ingress001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: distinct Running pods per matching controller in 5-minute bins.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where ControllerName has 'ingress' or ControllerName has 'nginx' or ControllerName has 'traefik' or ControllerName has 'istio'
        | where PodStatus == 'Running'
        | summarize PodCount = dcount(Name) by bin(TimeGenerated, 5m), ControllerName
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: PodCount over time, one series per ControllerName.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Ingress Controller Pods", isOptional = true },
      { name = "PartSubTitle", value = "Running ingress pods", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "ControllerName", type = "string" }]
          aggregation = "Average"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# ============================================================
# CAPACITY PLANNING (Items 17-20)
# ============================================================
# Full-width (16-column) Markdown banner that titles the capacity planning section.
"17" = {
  position = { x = 0, y = 39, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 bytes and rendered as
          # garbage characters; it is restored to the intended bar-chart glyph here.
          content  = "## 📊 Capacity Planning & Resource Trends"
          subtitle = "Resource utilization trends for capacity planning"
        }
      }
    }
  }
}
# Resource Utilization Trend
# Full-width line chart with two hourly series: average node CPU percent and
# average node working-set memory in GB.
"18" = {
  position = { x = 0, y = 40, rowSpan = 5, colSpan = 16 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "capacity001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name = "Query"
        # The 'CPU_Percent' series previously divided cpuUsageNanoCores by a fixed
        # 10^7, i.e. percent of a single core, which overstates utilisation on any
        # multi-core node. Each sample is now divided by its node's
        # cpuCapacityNanoCores before averaging, giving a true 0-100 percentage.
        # The memory series is unchanged.
        value = <<-QUERY
        let node_capacity = Perf
        | where ObjectName == 'K8SNode' and CounterName == 'cpuCapacityNanoCores'
        | summarize CapacityNanoCores = max(CounterValue) by Computer;
        let cpu_data = Perf
        | where ObjectName == 'K8SNode' and CounterName == 'cpuUsageNanoCores'
        | where isnotnull(CounterValue) and CounterValue > 0
        | join kind=inner (node_capacity) on Computer
        | where CapacityNanoCores > 0
        | summarize avg_cpu = avg(CounterValue * 100.0 / CapacityNanoCores) by bin(TimeGenerated, 1h)
        | extend MetricType = 'CPU_Percent', Value = avg_cpu;
        let mem_data = Perf
        | where ObjectName == 'K8SNode' and CounterName == 'memoryWorkingSetBytes'
        | where isnotnull(CounterValue) and CounterValue > 0
        | summarize avg_mem = avg(CounterValue / 1073741824) by bin(TimeGenerated, 1h)
        | extend MetricType = 'Memory_GB', Value = avg_mem;
        cpu_data
        | union mem_data
        | project TimeGenerated, MetricType, Value
        | order by TimeGenerated asc
        QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "24h Resource Utilization Trend", isOptional = true },
      { name = "PartSubTitle", value = "CPU % & Memory GB", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "Value", type = "real" }], splitBy = [{ name = "MetricType", type = "string" }], aggregation = "Average" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Running Pods by Namespace
# Stacked area of distinct Running pods per namespace in hourly bins.
"19" = {
  position = {
    x       = 0
    y       = 45
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "runpods001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: distinct Running pods per namespace, 1-hour bins.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where PodStatus == 'Running'
        | summarize PodCount = dcount(Name) by bin(TimeGenerated, 1h), Namespace
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: PodCount over time, stacked by Namespace.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Running Pods by Namespace", isOptional = true },
      { name = "PartSubTitle", value = "Pod density over time", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "Namespace", type = "string" }]
          aggregation = "Sum"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# Total Container Count
# Single-series line chart of distinct ContainerID values per hour.
"20" = {
  position = {
    x       = 8
    y       = 45
    colSpan = 8
    rowSpan = 5
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "container001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: rows without a ContainerID are dropped before counting.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where isnotempty(ContainerID)
        | summarize ContainerCount = dcount(ContainerID) by bin(TimeGenerated, 1h)
        | order by TimeGenerated asc
        QUERY
      },
      # Rendering: ContainerCount over time; no splitBy, so a single series.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Total Container Count", isOptional = true },
      { name = "PartSubTitle", value = "Container density trend", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "ContainerCount", type = "long" }]
          aggregation = "Average"
        }
      },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
  }
}
# ============================================================
# APPLICATION HEALTH & PERFORMANCE (Items 21-35)
# ============================================================
# Full-width (16-column) Markdown banner that titles the application section.
"21" = {
  position = { x = 0, y = 50, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          # The heading emoji was stored as mis-decoded UTF-8 bytes and rendered as
          # garbage characters; it is restored to the intended phone glyph here.
          content  = "## 📱 Application Health & Performance (immuta namespace)"
          subtitle = "Monitor application pods, resources, and troubleshooting"
        }
      }
    }
  }
}
# Running Pods Tile
# Grid tile showing the number of distinct Running pods in the 'immuta' namespace.
"22" = {
  position = {
    x       = 0
    y       = 51
    colSpan = 4
    rowSpan = 4
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "app_running", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: a single-row, single-column result (RunningPods).
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where Namespace == 'immuta'
        | where PodStatus == 'Running'
        | summarize RunningPods = dcount(Name)
        | project RunningPods
        QUERY
      },
      # Rendering: shown as a grid rather than a chart.
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "Running Pods", isOptional = true },
      { name = "PartSubTitle", value = "immuta namespace", isOptional = true }
    ]
  }
}
# Unhealthy Pods Tile
# Grid tile showing the number of distinct 'immuta' pods that are neither Running nor Succeeded.
"23" = {
  position = {
    x       = 4
    y       = 51
    colSpan = 4
    rowSpan = 4
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Part identity and query scope.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "app_unhealthy", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL: a single-row, single-column result (UnhealthyPods).
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
        KubePodInventory
        | where Namespace == 'immuta'
        | where PodStatus !in ('Running', 'Succeeded')
        | summarize UnhealthyPods = dcount(Name)
        | project UnhealthyPods
        QUERY
      },
      # Rendering: shown as a grid rather than a chart.
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "Unhealthy Pods", isOptional = true },
      { name = "PartSubTitle", value = "Non-Running/Failed", isOptional = true }
    ]
  }
}
# Pod Restarts (1h) Tile
# Single-value grid: restarts that happened in the 'immuta' namespace during
# the last hour.
# ContainerRestartCount is a running total that is re-reported with every
# inventory sample, so summing the raw rows multiplies the value by the number
# of samples. The query instead takes (max - min) per container over the hour
# and sums those deltas.
"24" = {
  position = { x = 8, y = 51, rowSpan = 4, colSpan = 4 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_restarts", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace == 'immuta'
          | where TimeGenerated > ago(1h)
          | where isnotempty(ContainerRestartCount)
          | extend RestartCount = toint(ContainerRestartCount)
          | summarize Restarts = max(RestartCount) - min(RestartCount) by Name, ContainerName
          | summarize TotalRestarts = sum(Restarts)
          | project TotalRestarts
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "Pod Restarts (1h)", isOptional = true },
      { name = "PartSubTitle", value = "Recent restart count", isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Average Pod Age Tile
# Single-value grid: mean age, in days rounded to one decimal, of the pods in
# the 'immuta' namespace that are currently Running.
# One row per pod (latest record in the last 10 minutes) is used so that every
# pod contributes equally; averaging over all inventory samples would weight
# long-lived pods more heavily and include pods that no longer exist.
"25" = {
  position = { x = 12, y = 51, rowSpan = 4, colSpan = 4 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_age", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace == 'immuta'
          | where TimeGenerated > ago(10m)
          | where isnotempty(PodCreationTimeStamp)
          | summarize arg_max(TimeGenerated, PodStatus, PodCreationTimeStamp) by Name
          | where PodStatus == 'Running'
          | extend PodAge = now() - PodCreationTimeStamp
          | summarize AvgAgeDays = avg(PodAge) / 1d
          | project AvgAgeDays = round(AvgAgeDays, 1)
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "Avg Pod Age (days)", isOptional = true },
      { name = "PartSubTitle", value = "Running pods", isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Pods in CrashLoopBackOff - CRITICAL
# Grid of containers in the 'immuta' and 'nginx-ingress' namespaces that are
# crash-looping or have restarted more than 5 times, worst first, max 20 rows.
# - CrashLoopBackOff is a container waiting reason, so ContainerStatusReason is
#   checked in addition to PodStatus.
# - Only the latest matching record per container is kept; otherwise one pod's
#   repeated inventory samples would fill all 20 rows.
# - ContainerName is shown so multi-container pods can be told apart.
"26" = {
  position = { x = 0, y = 55, rowSpan = 5, colSpan = 16 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_crashloop", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace in ('immuta', 'nginx-ingress')
          | extend RestartCount = toint(ContainerRestartCount)
          | where PodStatus contains 'CrashLoop' or ContainerStatusReason contains 'CrashLoop' or RestartCount > 5
          | summarize arg_max(TimeGenerated, PodStatus, ContainerStatusReason, RestartCount, Computer) by Name, Namespace, ContainerName
          | project
          Time = format_datetime(TimeGenerated, 'yyyy-MM-dd HH:mm'),
          PodName = Name,
          Namespace,
          Container = ContainerName,
          Status = PodStatus,
          Reason = ContainerStatusReason,
          Restarts = RestartCount,
          Node = Computer
          | order by Restarts desc, Time desc
          | take 20
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "🚨 CRITICAL: Pods in CrashLoopBackOff or High Restarts", isOptional = true },
      { name = "PartSubTitle", value = "immuta & nginx-ingress - Immediate action required", isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Top Restarting Pods
# Line chart of the restart counter (max per 15-minute bin) for the 10 pods in
# the 'immuta' namespace with the highest restart count in the time range.
# The top-10 selection is done per pod first and the time series is then
# restricted to those pods; applying `top 10` directly to the binned series
# would keep only ten (bin, pod) data points instead of ten pods.
"27" = {
  position = { x = 0, y = 60, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_top_restarts", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let restarts = KubePodInventory
          | where Namespace == 'immuta'
          | where isnotempty(ContainerRestartCount)
          | extend RestartCount = toint(ContainerRestartCount)
          | where RestartCount > 0;
          let topPods = restarts
          | summarize PodMaxRestarts = max(RestartCount) by Name
          | top 10 by PodMaxRestarts desc
          | project Name;
          restarts
          | where Name in (topPods)
          | summarize MaxRestarts = max(RestartCount) by bin(TimeGenerated, 15m), Name
          | order by TimeGenerated asc
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Top Restarting Pods", isOptional = true },
      { name = "PartSubTitle", value = "immuta - Top 10", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "MaxRestarts", type = "long" }], splitBy = [{ name = "Name", type = "string" }], aggregation = "Max" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Application Events
# Stacked column chart: number of Kubernetes events per 30 minutes in the
# 'immuta' namespace, split by event type.
# In the KubeEvents table the event severity lives in KubeEventType; the
# generic `Type` column holds the table name, so filtering on it never matches
# 'Warning'/'Error'. The series column is exposed as EventType and the chart
# Dimensions split on that name.
"28" = {
  position = { x = 8, y = 60, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_events", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubeEvents
          | where Namespace == 'immuta'
          | where KubeEventType in ('Warning', 'Error')
          | summarize EventCount = count() by bin(TimeGenerated, 30m), EventType = KubeEventType
          | order by TimeGenerated asc
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Application Events", isOptional = true },
      { name = "PartSubTitle", value = "Warnings & Errors", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "EventCount", type = "long" }], splitBy = [{ name = "EventType", type = "string" }], aggregation = "Sum" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Top CPU Consuming Pods
# Line chart: average container CPU (millicores, 15-minute bins) for the 10
# pods in the 'immuta' namespace with the highest average CPU in the range.
# - The pod UID is taken as the second-to-last '/'-separated segment of
#   Perf.InstanceName. InstanceName is prefixed with the cluster resource ID,
#   so a fixed index from the start does not land on the pod UID.
# - The ten pods are chosen by average usage; `take 10` on the inventory would
#   pick ten arbitrary pods.
"29" = {
  position = { x = 0, y = 65, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_cpu", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let appPods = KubePodInventory
          | where Namespace == 'immuta'
          | distinct PodUid, Name;
          let usage = Perf
          | where ObjectName == 'K8SContainer' and CounterName == 'cpuUsageNanoCores'
          | where isnotnull(CounterValue) and CounterValue > 0
          | extend InstanceParts = split(InstanceName, '/')
          | extend PodUid = tostring(InstanceParts[array_length(InstanceParts) - 2])
          | where isnotempty(PodUid)
          | join kind=inner (appPods) on PodUid
          | extend CPUMillicores = CounterValue / 1000000;
          let topPods = usage
          | summarize PodAvgCpu = avg(CPUMillicores) by Name
          | top 10 by PodAvgCpu desc
          | project Name;
          usage
          | where Name in (topPods)
          | summarize avg_cpu = avg(CPUMillicores) by bin(TimeGenerated, 15m), Name
          | order by TimeGenerated asc
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Top CPU Consuming Pods", isOptional = true },
      { name = "PartSubTitle", value = "immuta - millicores", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "avg_cpu", type = "real" }], splitBy = [{ name = "Name", type = "string" }], aggregation = "Average" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Top Memory Consuming Pods
# Line chart: average container working-set memory (MB, 15-minute bins) for
# the 10 pods in the 'immuta' namespace with the highest average in the range.
# - The pod UID is taken as the second-to-last '/'-separated segment of
#   Perf.InstanceName. InstanceName is prefixed with the cluster resource ID,
#   so a fixed index from the start does not land on the pod UID.
# - The ten pods are chosen by average usage; `take 10` on the inventory would
#   pick ten arbitrary pods.
"30" = {
  position = { x = 8, y = 65, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_memory", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let appPods = KubePodInventory
          | where Namespace == 'immuta'
          | distinct PodUid, Name;
          let usage = Perf
          | where ObjectName == 'K8SContainer' and CounterName == 'memoryWorkingSetBytes'
          | where isnotnull(CounterValue) and CounterValue > 0
          | extend InstanceParts = split(InstanceName, '/')
          | extend PodUid = tostring(InstanceParts[array_length(InstanceParts) - 2])
          | where isnotempty(PodUid)
          | join kind=inner (appPods) on PodUid
          | extend MemoryMB = CounterValue / 1048576;
          let topPods = usage
          | summarize PodAvgMem = avg(MemoryMB) by Name
          | top 10 by PodAvgMem desc
          | project Name;
          usage
          | where Name in (topPods)
          | summarize avg_mem = avg(MemoryMB) by bin(TimeGenerated, 15m), Name
          | order by TimeGenerated asc
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Top Memory Consuming Pods", isOptional = true },
      { name = "PartSubTitle", value = "immuta - MB", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "avg_mem", type = "real" }], splitBy = [{ name = "Name", type = "string" }], aggregation = "Average" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Failed/Pending Pods
# Grid of pods in the 'immuta' and 'nginx-ingress' namespaces that reported
# PodStatus Failed, Pending or Unknown in the time range, newest first, max 20.
# Each pod appears once, using its latest matching record; without the
# arg_max step a single stuck pod's repeated inventory samples would occupy
# every row of the grid.
"31" = {
  position = { x = 0, y = 70, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_failed", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace in ('immuta', 'nginx-ingress')
          | where PodStatus in ('Failed', 'Pending', 'Unknown')
          | where isnotempty(PodCreationTimeStamp)
          | summarize arg_max(TimeGenerated, PodStatus, PodCreationTimeStamp, Computer) by Name, Namespace
          | extend PodAge = now() - PodCreationTimeStamp
          | project
          Time = format_datetime(TimeGenerated, 'yyyy-MM-dd HH:mm'),
          Pod = Name,
          Namespace,
          Status = PodStatus,
          AgeHours = round(PodAge / 1h, 1),
          Node = Computer
          | order by Time desc
          | take 20
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "Failed/Pending Pods", isOptional = true },
      { name = "PartSubTitle", value = "Scheduling issues", isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Nginx Ingress Health
# Line chart for the 'nginx-ingress' namespace, one point per 15-minute bin.
# The query produces RunningPods, TotalPods and MaxRestarts; the chart's
# Dimensions plot RunningPods and MaxRestarts only.
"32" = {
  position = {
    x       = 8
    y       = 70
    rowSpan = 5
    colSpan = 8
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Standard LogsDashboardPart inputs; the part is scoped to the AKS cluster.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "app_ingress", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL executed by the tile.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
          KubePodInventory
          | where Namespace == 'nginx-ingress'
          | extend RestartCount = toint(ContainerRestartCount)
          | summarize
          RunningPods = dcountif(Name, PodStatus == 'Running'),
          TotalPods = dcount(Name),
          MaxRestarts = max(RestartCount)
          by bin(TimeGenerated, 15m)
          | order by TimeGenerated asc
          QUERY
      },
      # Presentation settings.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "Line", isOptional = true },
      { name = "PartTitle", value = "Nginx Ingress Health", isOptional = true },
      { name = "PartSubTitle", value = "Pod count & restarts", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis = { name = "TimeGenerated", type = "datetime" }
          yAxis = [
            { name = "RunningPods", type = "long" },
            { name = "MaxRestarts", type = "long" },
          ]
          aggregation = "Average"
        }
      },
      {
        name       = "LegendOptions"
        isOptional = true
        value      = { isEnabled = true, position = "Bottom" }
      },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true },
    ]
  }
}
# Image Versions & Deployment Status
# Grid: per controller and image version in the 'immuta' namespace, the number
# of pods in total / Running / Pending / Failed over the last 10 minutes, plus
# a derived health status.
# - KubePodInventory does not carry the container image, so the image name and
#   tag are looked up in ContainerInventory by ContainerID. A left outer join
#   keeps pods whose container has no inventory row (shown as 'unknown').
#   NOTE(review): requires the ContainerInventory stream to be collected by
#   the cluster's Container insights data collection settings - confirm.
# - Using ContainerInventory's separate Image/ImageTag columns also avoids
#   splitting on ':' which mis-parses registries that include a port.
"33" = {
  position = { x = 0, y = 75, rowSpan = 6, colSpan = 16 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_versions", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          let images = ContainerInventory
          | where TimeGenerated > ago(10m)
          | summarize arg_max(TimeGenerated, Repository, Image, ImageTag) by ContainerID;
          KubePodInventory
          | where Namespace == 'immuta'
          | where TimeGenerated > ago(10m)
          | join kind=leftouter (images) on ContainerID
          | extend ImageName = case(
          isempty(Image), 'unknown',
          isnotempty(Repository), strcat(Repository, '/', Image),
          Image
          )
          | extend ImageVersion = iff(isnotempty(ImageTag), ImageTag, 'latest')
          | summarize
          TotalPods = dcount(Name),
          RunningPods = dcountif(Name, PodStatus == 'Running'),
          PendingPods = dcountif(Name, PodStatus == 'Pending'),
          FailedPods = dcountif(Name, PodStatus == 'Failed')
          by ControllerName, ImageName, ImageVersion
          | extend Status = case(
          RunningPods == TotalPods, '✅ Healthy',
          RunningPods > 0, '⚠️ Degraded',
          '❌ Failed'
          )
          | project
          Controller = ControllerName,
          Image = ImageName,
          Version = ImageVersion,
          Total = TotalPods,
          Running = RunningPods,
          Pending = PendingPods,
          Failed = FailedPods,
          Status
          | order by Controller asc
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "Image Versions & Deployment Status", isOptional = true },
      { name = "PartSubTitle", value = "immuta - Version tracking", isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Container Readiness
# Stacked area chart: number of distinct containers in the 'immuta' namespace
# per 15-minute bin, split into Ready / NotReady.
# - Containers are counted with dcount over (PodUid, ContainerName); a plain
#   count() counts inventory samples, which inflates the value by the number
#   of samples that fall into each bin.
# - Readiness is approximated from the owning pod's PodStatus ('Running' =>
#   'Ready'); it is not the result of a readiness probe.
"34" = {
  position = { x = 0, y = 81, rowSpan = 5, colSpan = 8 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "app_readiness", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          KubePodInventory
          | where Namespace == 'immuta'
          | extend ReadyStatus = iff(PodStatus == 'Running', 'Ready', 'NotReady')
          | extend ContainerKey = strcat(PodUid, '|', ContainerName)
          | summarize ContainerCount = dcount(ContainerKey) by bin(TimeGenerated, 15m), ReadyStatus
          | order by TimeGenerated asc
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedArea", isOptional = true },
      { name = "PartTitle", value = "Container Readiness", isOptional = true },
      { name = "PartSubTitle", value = "Ready to serve traffic", isOptional = true },
      { name = "Dimensions", value = { xAxis = { name = "TimeGenerated", type = "datetime" }, yAxis = [{ name = "ContainerCount", type = "long" }], splitBy = [{ name = "ReadyStatus", type = "string" }], aggregation = "Sum" }, isOptional = true },
      { name = "LegendOptions", value = { isEnabled = true, position = "Bottom" }, isOptional = true },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# Pod Distribution
# Stacked column chart: distinct Running pods from the 'immuta' and
# 'nginx-ingress' namespaces per 15-minute bin, one series per node (Computer).
"35" = {
  position = {
    x       = 8
    y       = 81
    rowSpan = 5
    colSpan = 8
  }
  metadata = {
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
    inputs = [
      # Standard LogsDashboardPart inputs; the part is scoped to the AKS cluster.
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      {
        name       = "Scope"
        value      = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }
        isOptional = true
      },
      { name = "PartId", value = "app_distribution", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      # KQL executed by the tile.
      {
        name       = "Query"
        isOptional = true
        value      = <<-QUERY
          KubePodInventory
          | where Namespace in ('immuta', 'nginx-ingress')
          | where PodStatus == 'Running'
          | summarize PodCount = dcount(Name) by bin(TimeGenerated, 15m), Computer
          | order by TimeGenerated asc
          QUERY
      },
      # Presentation settings.
      { name = "ControlType", value = "FrameControlChart", isOptional = true },
      { name = "SpecificChart", value = "StackedColumn", isOptional = true },
      { name = "PartTitle", value = "Pod Distribution by Node", isOptional = true },
      { name = "PartSubTitle", value = "Running pods across nodes", isOptional = true },
      {
        name       = "Dimensions"
        isOptional = true
        value = {
          xAxis       = { name = "TimeGenerated", type = "datetime" }
          yAxis       = [{ name = "PodCount", type = "long" }]
          splitBy     = [{ name = "Computer", type = "string" }]
          aggregation = "Average"
        }
      },
      {
        name       = "LegendOptions"
        isOptional = true
        value      = { isEnabled = true, position = "Bottom" }
      },
      { name = "IsQueryContainTimeRange", value = false, isOptional = true },
    ]
  }
}
# ============================================================
# POD VULNERABILITY ASSESSMENT (Items 36-37)
# ============================================================
# Section header (Markdown).
"36" = {
  position = { x = 0, y = 86, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          content  = "## 🔒 Pod Vulnerability Assessment"
          subtitle = "Security vulnerabilities and CVE tracking"
        }
      }
    }
  }
}
# Pod Vulnerability Summary
# Grid: unhealthy Defender for Cloud sub-assessments for the cluster named by
# var.aks_cluster_name, counted by severity and grouped by environment/cluster.
# - `securityresources` is an Azure Resource Graph table; from a Logs query it
#   has to be addressed through arg("").
#   NOTE(review): confirm that cross-service arg("") queries are permitted for
#   this dashboard part in your tenant; Resource Graph results fetched this
#   way may be truncated - verify against the documented limits.
# - The `let` holding the target cluster name is called targetCluster so it
#   does not collide with the clusterName column created by `extend`.
# - The 'asb-aks-' prefix and the q01/p01 environment markers are naming
#   conventions hard-coded in the query.
"37" = {
  position = { x = 0, y = 87, rowSpan = 6, colSpan = 16 }
  metadata = {
    inputs = [
      { name = "resourceTypeMode", isOptional = true },
      { name = "ComponentId", isOptional = true },
      { name = "Scope", value = { resourceIds = [data.azurerm_kubernetes_cluster.aks.id] }, isOptional = true },
      { name = "PartId", value = "vulm001", isOptional = true },
      { name = "Version", value = "2.0", isOptional = true },
      { name = "TimeRange", isOptional = true },
      { name = "DashboardId", isOptional = true },
      {
        name       = "Query"
        value      = <<-QUERY
          // AKS cluster vulnerability assessment
          let targetCluster = "${var.aks_cluster_name}";
          arg("").securityresources
          | where type == "microsoft.security/assessments/subassessments"
          | where id contains targetCluster
          | extend assessmentKey = extract(@"(?i)providers/Microsoft.Security/assessments/([^/]+)", 1, id)
          | where assessmentKey == "c609cf0f-71ab-41e9-a3c6-9a1f7fe1b8d5"
          | where properties.status.code == "Unhealthy"
          | extend
          clusterName = extract(@"asb-aks-[^/]+", 0, id),
          environment = case(
          id contains "q01", "QA",
          id contains "p01", "PROD",
          "unknown"
          ),
          cveId = tostring(properties.additionalData.vulnerabilityDetails.cveId),
          severity = tostring(properties.additionalData.vulnerabilityDetails.severity),
          namespace = tostring(properties.additionalData.kubernetesContext.workloads[0].namespace)
          | summarize
          TotalVulnerabilities = count(),
          CriticalCount = countif(severity == "Critical"),
          HighCount = countif(severity == "High"),
          MediumCount = countif(severity == "Medium"),
          LowCount = countif(severity == "Low"),
          UniqueCVEs = dcount(cveId),
          AffectedNamespaces = dcount(namespace)
          by environment, clusterName
          | project
          Environment = environment,
          Cluster = clusterName,
          TotalVulnerabilities,
          CriticalCount,
          HighCount,
          MediumCount,
          LowCount,
          UniqueCVEs,
          AffectedNamespaces
          | order by TotalVulnerabilities desc
          QUERY
        isOptional = true
      },
      { name = "ControlType", value = "AnalyticsGrid", isOptional = true },
      { name = "PartTitle", value = "Pod Vulnerability Summary", isOptional = true },
      { name = "PartSubTitle", value = "CVE tracking by severity", isOptional = true }
    ]
    type = "Extension/Microsoft_OperationsManagementSuite_Workspace/PartType/LogsDashboardPart"
  }
}
# ============================================================
# SRE QUICK REFERENCE (Items 38-40)
# ============================================================
# Section header (Markdown). The heading emoji is stored as real UTF-8 so the
# portal renders it instead of mis-decoded byte sequences.
"38" = {
  position = { x = 0, y = 93, rowSpan = 1, colSpan = 16 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          content  = "## 📚 SRE Quick Reference & Runbooks"
          subtitle = "Essential links and troubleshooting guides"
        }
      }
    }
  }
}
# Runbook Links
# Markdown tile listing runbook and tool links.
# The heading emoji are stored as real UTF-8 (they were mis-decoded byte
# sequences). The your-wiki.com URLs are placeholders and both "Quick Tools"
# links point at the portal home page - replace them with real targets.
"39" = {
  position = { x = 0, y = 94, rowSpan = 4, colSpan = 8 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          content = <<-MARKDOWN
            ### 🔗 Runbook Links
            - **[Node NotReady Troubleshooting](https://your-wiki.com/runbooks/node-notready)**
            - **[Pod CrashLoopBackOff Guide](https://your-wiki.com/runbooks/pod-crashloop)**
            - **[High CPU Investigation](https://your-wiki.com/runbooks/high-cpu)**
            - **[Memory Pressure Response](https://your-wiki.com/runbooks/memory-pressure)**
            - **[Control Plane Issues](https://your-wiki.com/runbooks/control-plane)**
            ### 🔍 Quick Tools
            - **[Log Analytics](https://portal.azure.com)** - Advanced KQL queries
            - **[AKS Diagnostics](https://portal.azure.com)** - Built-in troubleshooting
            MARKDOWN
        }
      }
    }
  }
}
# Dashboard Best Practices
# Markdown tile with usage guidance (time display, suggested time ranges,
# alert thresholds).
# - Emoji are stored as real UTF-8 (they were mis-decoded byte sequences).
# - A blank line precedes each bold section label; without it Markdown treats
#   the label as a continuation of the previous list item.
"40" = {
  position = { x = 8, y = 94, rowSpan = 4, colSpan = 8 }
  metadata = {
    inputs = []
    type   = "Extension/HubsExtension/PartType/MarkdownPart"
    settings = {
      content = {
        settings = {
          content = <<-MARKDOWN
            ### 📊 Dashboard Best Practices

            **Time Display:**
            - ⏰ All times shown in **Local timezone**
            - Auto-adjusts for daylight savings

            **Time Range Selection:**
            - 🔴 **Incident Response:** 1h - 6h
            - 🟡 **Trend Analysis:** 24h (default)
            - 🟢 **Capacity Planning:** 7d - 30d

            **Alert Thresholds:**
            - Node CPU: >80% = Warning, >90% = Critical
            - Node Memory: >85% = Warning, >95% = Critical
            - Pod Restarts: >5 in 1h = Investigate
            MARKDOWN
        }
      }
    }
  }
}
}
}
}
# Dashboard-level metadata: default time range, locale and the shared
# time-range filter applied to the tiles.
metadata = {
  model = {
    # Default relative time range.
    # NOTE(review): timeUnit = 1 presumably means hours, matching the "24h"
    # filter below - confirm against the portal dashboard schema.
    timeRange = {
      type = "MsPortalFx.Composition.Configuration.ValueTypes.TimeRange"
      value = {
        relative = { duration = 24, timeUnit = 1 }
      }
    }

    filterLocale = { value = "en-us" }

    # Dashboard-wide time filter. An empty filteredPartIds list is kept exactly
    # as in the exported definition.
    filters = {
      value = {
        MsPortalFx_TimeRange = {
          model = {
            format      = "local"
            granularity = "auto"
            relative    = "24h"
          }
          displayCache = {
            name  = "Local Time"
            value = "Past 24 hours"
          }
          filteredPartIds = []
        }
      }
    }
  }
}
})
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment