@davidlu1001
Created September 21, 2025 19:35
terraform-aks-monitoring.txt
# Enhanced Combined Files Archive
# Generated by file-combiner v2.0.1
# Date: 2025-09-21 19:30:35 UTC
# Source: /tmp/file_combiner_github_00gln5c_
# Total files: 14
# Total size: 109.7KB
#
# Format:
# === FILE_SEPARATOR ===
# FILE_METADATA: <json_metadata>
# ENCODING: <encoding_type>
# <file_content>
#
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": ".gitignore", "size": 845, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": false}
ENCODING: utf-8
# .gitignore for Terraform Projects
# Local Terraform directories and files
# These files are created locally when you run terraform init, plan, and apply.
# They should not be committed to version control.
.terraform/
*.tfstate
*.tfstate.backup
*.tfplan
crash.log
crash.*.log
# Variable definition files. Although this module relies on the .tfvars
# pattern, ignoring these files is a best practice that prevents accidental
# commits of secrets. Inject environment-specific values via a secrets
# management system or a secure pipeline instead of committing them.
*.tfvars
*.tfvars.json
# Terraform override files
# These are meant for local-only overrides and should not be shared.
override.tf
override.tf.json
*_override.tf
*_override.tf.json
# Terraform CLI configuration files
.terraformrc
terraform.rc
# Platform-specific files for macOS, Windows, and Linux
.DS_Store
Thumbs.db
*~
*.swp
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "LICENSE", "size": 1065, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
MIT License
Copyright (c) 2025
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "Observability_Design_Doc.md", "size": 14991, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": "text/markdown", "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# AKS Observability Design for Services
## Document Version History
| Version | Date | Author | Key Changes |
| :------ | :------------- | :----- | :------------------------------------------------------------------------------------------------------------------------------------- |
| 1.0 | August 2025 | Admin | Initial comprehensive design with 55 alerts. |
| 1.1 | September 2025 | Admin | Reviewed implementation feasibility, identified application monitoring gap, and refined alert definitions. |
| 2.0 | September 2025 | Admin | Aligned document with the completed, fully parameterized Terraform module, updated implementation status, and standardized formatting. |
## Summary
This document outlines a comprehensive observability strategy for services running on Azure Kubernetes Service (AKS). The strategy is realized through a **fully generic, reusable Terraform module** that leverages Azure Monitor and Container Insights. The design incorporates Azure's recommended metric alerts and SRE best practices. It is optimized for multi-environment deployments (e.g., QA, Production), supports timezone-aware suppression rules, and provides a clear, actionable alert taxonomy.
## Design Objectives
- **Proactive Detection**: Identify issues before they impact users through comprehensive monitoring.
- **Cost Optimization**: Enable efficient use of Azure Monitor through intelligent data collection.
- **Scalability**: Support growing application workloads with minimal operational overhead.
- **Operational Clarity**: Provide clear alert classification and actionable metadata for efficient incident response.
- **Regional Optimization**: Allow for fully configurable, timezone-aware maintenance and alerting schedules.
- **Modular Implementation**: Ensure the design is fully implemented as a standardized, reusable Infrastructure-as-Code (IaC) module.
## Architecture Overview
### Approved Monitoring Stack
- **Azure Monitor**: Core telemetry platform and alerting engine.
- **Container Insights**: AKS-specific monitoring with enhanced container visibility.
- **Log Analytics Workspace**: Centralized log storage and analysis.
- **Action Groups**: Role-based notification system (Email, Teams).
- **Alert Processing Rules**: Intelligent suppression and grouping.
### Observability Data Flow
```
┌─────────────────────────────────────────────────────────────┐
│ AKS Cluster Sources │
├─────────────────┬───────────────────┬─────────────────────┤
│ • App Pods │ • Node Metrics │ • Control Plane │
│ • App Logs │ • System Events │ • API Server │
│ • Health Checks │ • Resource Usage │ • Etcd/Scheduler │
└─────────────────┴───────────────────┴─────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ Container Insights Collection │
├─────────────────┬───────────────────┬─────────────────────┤
│ • Performance │ • Health Data │ • Log Aggregation │
│ • Metrics Store │ • Event Capture │ • Alert Evaluation │
└─────────────────┴───────────────────┴─────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ Log Analytics Workspace │
├─────────────────┬───────────────────┬─────────────────────┤
│ • ContainerLogv2│ • Perf Tables │ • KubeEvents │
│ • InsightsMetrics│ • Alert Queries │ • Custom Analytics │
└─────────────────┴───────────────────┴─────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ Alert Classification │
├─────────────────┬───────────────────┬─────────────────────┤
│ • Symptom Alerts│ • Cause Alerts │ • Severity Levels │
│ • Customer Impact│ • Root Cause │ • Escalation Paths │
└─────────────────┴───────────────────┴─────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ Notification & Response │
├─────────────────┬───────────────────┬─────────────────────┤
│ • Email Groups │ • Time-aware │ • Runbook Links │
│ • Role-based │ • Suppression │ • Escalation Matrix │
└─────────────────┴───────────────────┴─────────────────────┘
```
## 📊 Alert Taxonomy & Classification
### Alert Structure Overview
Based on SRE best practices, the alerts provided by the base Terraform module are organized into **4 primary categories**.
```
Infrastructure & Platform Alerts
├── Azure Resource Level (4 alerts)
├── Kubernetes Infrastructure (30 alerts)
├── Control Plane Health (7 alerts)
└── Platform Workload Health (15 alerts)
```
*Note: Application-specific alerts are to be built on top of this foundational module.*
### Alert Type Classification
- **Symptom Alerts**: Alerts that indicate direct customer or service impact (e.g., `PodUnavailableCritical`).
- **Cause Alerts**: Alerts that help identify the root cause of an issue (e.g., `NodeCPUWarning`).
### Severity Distribution
- **Critical (Sev0 / P0)**: Service-down or imminent failure scenarios. Requires immediate action.
- **High (Sev1 / P1)**: Significant service degradation or potential for critical failure. Requires urgent attention.
- **Warning (Sev2 / P2)**: Service degradation, early warning signs, or non-critical failures. Requires attention within business hours.
## 🔍 Detailed Alert Categories
*(Note: All thresholds in this section are examples. The actual values are controlled by the user via `.tfvars` files.)*
### 1. Azure Resource Level Alerts
**Purpose**: Monitor Azure subscription and regional service health impacting the AKS cluster.
| Alert Name | Description | Default Threshold | Severity |
| :--------------------------------- | :--------------------------------- | :----------------- | :------- |
| **Azure Subscription Quota Usage** | Resource quota approaching limits | >85% quota usage | Warning |
| **Azure Service Health Incidents** | Regional Azure service incidents | Any incident | High |
| **Cost Anomaly Detection** | Unusual spending patterns | >50% cost increase | Warning |
| **Network Security Group Blocks** | High volume of blocked connections | >100 blocks/30min | Warning |
### 2. Kubernetes Infrastructure Alerts
**Purpose**: Monitor the underlying AKS cluster infrastructure, including nodes and pods.
#### 2.1 Node Infrastructure Health
| Alert Name | Description | Default Threshold | Severity |
| :---------------------------- | :---------------------------------- | :------------------- | :------- |
| **Node CPU Critical**         | Critical node CPU usage             | >90%                 | High     |
| **Node Memory Critical**      | Critical node memory usage          | >95%                 | High     |
| **KubeNodeUnreachable** | Node marked unreachable | Any unreachable node | High |
| **KubeNodeReadinessFlapping** | Node status changing frequently | >3 changes/15min | Warning |
| **Node Pressure Events** | Resource pressure (Memory/Disk/PID) | Any pressure event | High |
| **Kubelet Health Issues** | Kubelet service problems | Any Kubelet error | High |
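As an illustrative sketch (not the module's verbatim source), the **Node CPU Critical** alert above maps to a parameterized `azurerm_monitor_metric_alert` resource driven by `var.node_cpu_critical_threshold`; the resource and variable names match the module reference, but the naming pattern, timings, and metric details here are assumptions — the authoritative definition lives in `alerts_infra.tf`.

```terraform
# Sketch: node CPU critical alert using Container Insights node metrics.
# Exact names, frequency, and window are illustrative.
resource "azurerm_monitor_metric_alert" "node_cpu_critical" {
  name                = "${var.environment_short_prefix}-aks-node-cpu-critical"
  resource_group_name = var.resource_group_name
  scopes              = [data.azurerm_kubernetes_cluster.aks.id]
  description         = "Critical node CPU usage"
  severity            = 1
  frequency           = "PT5M"
  window_size         = "PT15M"

  criteria {
    metric_namespace = "Insights.Container/nodes"
    metric_name      = "cpuUsagePercentage"
    aggregation      = "Average"
    operator         = "GreaterThan"
    threshold        = var.node_cpu_critical_threshold
  }

  action {
    action_group_id = azurerm_monitor_action_group.critical.id
  }
}
```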
#### 2.2 Pod & Container Health
| Alert Name | Description | Default Threshold | Severity |
| :--------------------------- | :------------------------------------ | :-------------------- | :------- |
| **Pod Restart Rate High** | High pod restart frequency | >5 restarts/30min | Warning |
| **KubeContainerWaiting** | Container waiting for extended period | >60 minutes | Warning |
| **Pod Unavailable Critical** | Critical pods are unavailable | Any critical pod down | Critical |
*(... and many other alerts as defined in the Terraform files.)*
### 3. Control Plane Health Alerts
**Purpose**: Monitor the Kubernetes master components that manage the cluster.
| Alert Name | Description | Default Threshold | Severity |
| :------------------------------ | :-------------------------------- | :---------------- | :------- |
| **API Server Latency Critical** | Critical API response latency | >500ms P95 | High |
| **Etcd Latency Critical** | Critical etcd transaction latency | >500ms P95 | High |
| **Etcd Health Issues** | Etcd cluster reports errors | Any etcd error | Critical |
### 4. Platform Workload Health Alerts
**Purpose**: Monitor Kubernetes workload patterns and application lifecycle events.
| Alert Name | Description | Default Threshold | Severity |
| :--------------------------------- | :----------------------------------- | :-------------------- | :------- |
| **KubePodCrashLooping** | Pod is in a `CrashLoopBackOff` state | Any pod crash-looping | High |
| **KubeContainerOOMKilled** | Container killed due to OOM | Any OOMKilled event | High |
| **KubeDeploymentReplicasMismatch** | Deployment replicas mismatch | Spec ≠ Available | Warning |
| **Failed Pod Scheduling** | Pods are unable to be scheduled | Any unschedulable pod | High |
### 5. Application-Specific Monitoring
**Purpose**: To be built by application teams using the foundational module. This involves creating separate Terraform configurations that define KQL queries for application-specific metrics (e.g., error rates, business transaction latency).
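A hedged sketch of what such an application-team alert might look like, layered on top of this module's conventions. The container name, log filter, and `var.app_error_threshold` are illustrative assumptions, not part of this module:

```terraform
# Hypothetical application-level alert built by an app team.
# Queries Container Insights logs for error-level entries.
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "app_error_rate" {
  name                 = "app-error-rate-high"
  resource_group_name  = var.resource_group_name
  location             = var.location
  severity             = 2
  evaluation_frequency = "PT5M"
  window_duration      = "PT15M"
  scopes               = [data.azurerm_log_analytics_workspace.main.id]

  criteria {
    query                   = <<-KQL
      ContainerLogV2
      | where ContainerName == "my-app"        // illustrative container name
      | where LogLevel == "error"
      | summarize ErrorCount = count() by bin(TimeGenerated, 5m)
    KQL
    time_aggregation_method = "Total"
    metric_measure_column   = "ErrorCount"
    operator                = "GreaterThan"
    threshold               = var.app_error_threshold # illustrative variable
  }

  action {
    action_groups = [azurerm_monitor_action_group.standard.id]
  }
}
```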
## 🚨 Alert Configuration Strategy
### Alert Metadata and Actionability
To enhance operational clarity, all Terraform alert resources include a `custom_properties` block. This metadata directly links an alert to this design document and the team's runbooks.
**Example Terraform Implementation**:
```terraform
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "pod_storage_latency" {
  # ... other properties
  action {
    action_groups = [azurerm_monitor_action_group.standard.id]
    custom_properties = {
      alert_name     = "Pod Storage Latency High"
      alert_type     = "Cause"
      severity_level = "P2"
      runbook_url    = "${var.runbook_base_url}/aks-pod-storage-latency"
      design_doc     = "AKS Observability Design v2.0"
    }
  }
}
```
### Environment-Specific Thresholds
The Terraform module is designed to accept all thresholds as input variables. This allows each team to maintain separate `.tfvars` files for their environments, such as `qa.tfvars` and `prod.tfvars`, with different sensitivity levels.
**Example `prod.tfvars` snippet:**
```hcl
# Stricter thresholds for Production
node_cpu_critical_threshold = 85 # vs 90 in QA
pod_restart_warning_threshold = 3 # vs 5 in QA
```
## 🛡️ Alert Processing & Suppression Rules
The module provides two fully configurable suppression rules. The schedule for these rules is controlled entirely by variables in the user's `.tfvars` file.
#### Maintenance Window Suppression
- **Purpose**: Suppress non-critical alerts during planned maintenance activities.
- **Configuration**: `maintenance_window_*` variables in `variables.tf`.
- **Default**: Suppresses Sev1-Sev4 alerts on Sundays from 2:00 AM to 4:00 AM (Pacific/Auckland).
#### After-Hours Suppression (for non-production)
- **Purpose**: Reduce alert fatigue by suppressing low-severity alerts outside of business hours in non-production environments.
- **Configuration**: `after_hours_*` variables in `variables.tf`.
- **Default**: Suppresses Sev2-Sev4 alerts from 6:00 PM to 8:00 AM (Pacific/Auckland) in environments specified in `after_hours_suppression_environments` (defaults to `["qa"]`).
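The defaults above correspond to `.tfvars` settings along these lines (the variable names below appear in the module's inputs; values shown are the documented defaults):

```hcl
# Maintenance window: suppress non-critical alerts during planned maintenance
enable_maintenance_window_suppression = true
maintenance_window_timezone           = "Pacific/Auckland"

# After-hours: suppress Sev2-Sev4 overnight in non-production environments
enable_after_hours_suppression       = true
after_hours_start_time               = "18:00:00"
after_hours_end_time                 = "08:00:00"
after_hours_suppression_timezone     = "Pacific/Auckland"
after_hours_suppression_environments = ["qa"]
```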
## 🔧 Implementation Roadmap & Status
### Module Status: ✅ **Complete & Ready for Use**
The foundational Terraform module (`terraform-aks-monitoring`) is complete. It has been fully parameterized and aligns with the best practices outlined in this document.
## 🎯 Success Metrics & KPIs
### Monitoring Effectiveness
- **Mean Time to Detection (MTTD)**: Target < 5 minutes for critical infrastructure issues.
- **Alert Accuracy**: Target > 95% true positive rate (low noise).
- **Module Adoption**: Track the number of teams/projects successfully using this module.
### Operational Excellence
- **Service Availability**: Maintain 99.9% uptime for critical services.
- **Incident Response**: Reduce MTTR by providing actionable alerts with clear runbook links.
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "README.md", "size": 34162, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": "text/markdown", "is_binary": false, "error": null, "ends_with_newline": false}
ENCODING: utf-8
# AKS Monitoring & Alerts Terraform Module
This Terraform module deploys a comprehensive and production-ready monitoring and alerting solution for Azure Kubernetes Service (AKS). It is based on the SRE best practices outlined in the **"AKS Observability Design"** and is designed to be fully modular and reusable across multiple teams and environments.
## 🏛️ Design Philosophy
This module is designed to be **generic and reusable**. It contains no hardcoded environment-specific logic. All configurations, such as alert thresholds and notification emails, are passed in as variables from the calling Terraform project. This approach allows for maximum flexibility and a clear separation of concerns between the alert resource definitions (the module's responsibility) and the environment-specific configurations (the user's responsibility).
## ✨ Features
- **Comprehensive Alert Suite**: Deploys a suite of **40+ detailed alerts** covering the most critical aspects of your AKS environment.
- **Layered Coverage**: Alerts are logically grouped into four key areas:
1. **Azure Resources**: Subscription Quotas, Service Health, Cost Anomalies.
2. **AKS Infrastructure**: Node Health (CPU, Memory, Disk), Pods (Restarts, Status), and Storage.
3. **Kubernetes Control Plane**: API Server Latency, etcd Health.
4. **Platform Workloads**: Deployments, Jobs, StatefulSets, and other workload health signals.
- **Flexible Notifications**: Creates fully configurable Action Groups for routing notifications to different teams based on severity (e.g., Standard vs. Critical).
- **Intelligent Suppression**: Includes fully customizable Alert Suppression Rules for planned maintenance windows and after-hours periods to reduce alert fatigue.
- **Fully Parameterized**: Every tunable value, from alert thresholds to suppression schedules, is exposed as an input variable for easy customization via `.tfvars` files.
- **Actionable Alerts**: Enriches alerts with rich metadata (`custom_properties`), including links to runbooks, to accelerate incident response.
***
## 🚀 Getting Started
### Prerequisites
Before using this module, ensure you have the following:
* Terraform `~> 1.3`
* AzureRM Provider `~> 4.0`
* An existing Azure Resource Group.
* An existing AKS Cluster.
* An existing Log Analytics Workspace connected to the AKS cluster.
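The Terraform and provider version constraints above correspond to a `versions.tf` along these lines (a sketch matching the module's stated requirements):

```terraform
# versions.tf -- version constraints declared by the module
terraform {
  required_version = "~> 1.3"

  required_providers {
    azurerm = {
      source  = "hashicorp/azurerm"
      version = "~> 4.0"
    }
  }
}
```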
### Usage
1. **Create a Configuration File**: Copy the `terraform.tfvars.example` file to a new file named `prod.tfvars` or `qa.tfvars`. Update the values in this file to match your target environment.
**Example `prod.tfvars`:**
```hcl
# General Settings
environment_name = "prod"
environment_short_prefix = "p"
location = "australiaeast"
resource_group_name = "prod-app-aks-rg"
log_analytics_workspace_name = "prod-app-law"
aks_cluster_name = "prod-app-aks"
# Notification Settings
alert_email_sre = ["[email protected]"]
alert_email_oncall_primary = ["[email protected]"]
# Alert Thresholds
node_cpu_critical_threshold = 85
pod_restart_warning_threshold = 3
api_server_latency_critical_ms = 300
```
2. **Call the Module**: In your project's `main.tf`, call the module and pass in the variables that will be populated from your `.tfvars` file.
```terraform
# main.tf in your root project
# Define variables that will be populated from your .tfvars file
variable "environment_name" {}
variable "environment_short_prefix" {}
variable "location" {}
variable "resource_group_name" {}
variable "log_analytics_workspace_name" {}
variable "aks_cluster_name" {}
# ... define all other variables from the .tfvars file
module "aks_monitoring" {
  source = "./aks_monitoring_module" # Or use a Git source: "git::https://github.com/your-org/terraform-aks-monitoring.git?ref=v1.0.0"

  # Pass all variables from the .tfvars file
  environment_name             = var.environment_name
  environment_short_prefix     = var.environment_short_prefix
  location                     = var.location
  resource_group_name          = var.resource_group_name
  log_analytics_workspace_name = var.log_analytics_workspace_name
  aks_cluster_name             = var.aks_cluster_name
  runbook_base_url             = var.runbook_base_url
  tags                         = var.tags

  # --- Notifications ---
  alert_email_sre              = var.alert_email_sre
  alert_email_oncall_primary   = var.alert_email_oncall_primary
  alert_email_oncall_secondary = var.alert_email_oncall_secondary
  alert_email_manager          = var.alert_email_manager
  teams_webhook_standard       = var.teams_webhook_standard
  teams_webhook_critical       = var.teams_webhook_critical

  # --- Suppression Rules ---
  enable_maintenance_window_suppression = var.enable_maintenance_window_suppression
  maintenance_window_timezone           = var.maintenance_window_timezone
  # ... etc for all suppression variables

  # --- Alert Thresholds ---
  node_cpu_critical_threshold   = var.node_cpu_critical_threshold
  pod_restart_warning_threshold = var.pod_restart_warning_threshold
  # ... etc for all threshold variables
}
```
3. **Run Terraform**: Execute the standard Terraform workflow, specifying your environment's variable file.
```bash
# Initialize Terraform
terraform init
# Preview the changes
terraform plan -var-file="prod.tfvars"
# Apply the configuration
terraform apply -var-file="prod.tfvars"
```
***
## 📝 Module Reference
### Inputs
For a complete list of all input variables, their descriptions, and default values, please see the **`variables.tf`** file.
### Outputs
The module exports the IDs of all created resources. For a complete list of outputs, please see the **`outputs.tf`** file.
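For example, a calling project can re-export one of the module's resource IDs for use elsewhere (the output name below is hypothetical; check `outputs.tf` for the actual export names):

```hcl
# Hypothetical output name -- see outputs.tf for the real exports
output "standard_action_group_id" {
  description = "ID of the standard-severity Action Group created by the module"
  value       = module.aks_monitoring.standard_action_group_id
}
```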
***
## 📁 Module File Structure
- `main.tf`: Core logic, defining data sources, Action Groups, and Suppression Rules.
- `variables.tf`: All input variables for the module.
- `terraform.tfvars.example`: A template file for user configuration.
- `outputs.tf`: All module outputs.
- `versions.tf`: Terraform and provider version constraints.
- `locals.tf`: Internal local variables, primarily for constructing tags.
- `alerts_azure.tf`: Alerts for Azure subscription and resource level health.
- `alerts_infra.tf`: Alerts for AKS node, pod, and storage infrastructure health.
- `alerts_platform.tf`: Alerts for Kubernetes control plane and platform workload health.
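As a sketch of the tag-construction pattern mentioned for `locals.tf` (illustrative only; the tag keys shown are assumptions, and the authoritative logic is in the file itself):

```terraform
# locals.tf sketch: merge caller-supplied tags with module-standard ones
locals {
  common_tags = merge(
    var.tags,
    {
      environment = var.environment_name
      managed_by  = "terraform"
      module      = "terraform-aks-monitoring"
    }
  )
}
```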
<!-- BEGIN_TF_DOCS -->
## Requirements
| Name | Version |
| ------------------------------------------------------------------------- | ------- |
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | ~> 1.3 |
| <a name="requirement_azurerm"></a> [azurerm](#requirement\_azurerm) | ~> 4.0 |
## Providers
| Name | Version |
| ------------------------------------------------------------- | ------- |
| <a name="provider_azurerm"></a> [azurerm](#provider\_azurerm) | ~> 4.0 |
## Modules
No modules.
## Resources
| Name | Type |
| ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------- |
| [azurerm_monitor_action_group.critical](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_action_group) | resource |
| [azurerm_monitor_action_group.standard](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_action_group) | resource |
| [azurerm_monitor_activity_log_alert.service_health_incidents](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_activity_log_alert) | resource |
| [azurerm_monitor_alert_processing_rule_suppression.after_hours_low_severity](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_alert_processing_rule_suppression) | resource |
| [azurerm_monitor_alert_processing_rule_suppression.maintenance_window](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_alert_processing_rule_suppression) | resource |
| [azurerm_monitor_metric_alert.node_cpu_critical](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_metric_alert) | resource |
| [azurerm_monitor_metric_alert.node_cpu_warning](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_metric_alert) | resource |
| [azurerm_monitor_metric_alert.node_memory_critical](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_metric_alert) | resource |
| [azurerm_monitor_metric_alert.node_memory_warning](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_metric_alert) | resource |
| [azurerm_monitor_metric_alert.pod_restart_alert_warning](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_metric_alert) | resource |
| [azurerm_monitor_metric_alert.subscription_quota_usage](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_metric_alert) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.api_server_latency_critical](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.api_server_latency_warning](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.cost_anomalies](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.crashloop_backoff](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.deployment_replica_mismatch](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.etcd_health](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.etcd_latency_critical](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.etcd_latency_warning](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.failed_pod_scheduling](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.kube_container_waiting](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.kube_cpu_quota_overcommit](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.kube_job_failed](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.kube_memory_quota_overcommit](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.kube_statefulset_replica_mismatch](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.node_pressure_events](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.node_readiness_flapping](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.node_status_issues](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.nsg_blocks](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.oom_killed](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_monitor_scheduled_query_rules_alert_v2.pod_unavailable_critical](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/resources/monitor_scheduled_query_rules_alert_v2) | resource |
| [azurerm_client_config.current](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/client_config) | data source |
| [azurerm_kubernetes_cluster.aks](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/kubernetes_cluster) | data source |
| [azurerm_log_analytics_workspace.main](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/log_analytics_workspace) | data source |
| [azurerm_resource_group.main](https://registry.terraform.io/providers/hashicorp/azurerm/latest/docs/data-sources/resource_group) | data source |
## Inputs
| Name | Description | Type | Default | Required |
| --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- | -------------- | --------------------------------- | :------: |
| <a name="input_after_hours_end_time"></a> [after\_hours\_end\_time](#input\_after\_hours\_end\_time) | The end time for after-hours suppression (e.g., start of the morning). | `string` | `"08:00:00"` | no |
| <a name="input_after_hours_start_time"></a> [after\_hours\_start\_time](#input\_after\_hours\_start\_time) | The start time for after-hours suppression (e.g., start of the evening). | `string` | `"18:00:00"` | no |
| <a name="input_after_hours_suppression_environments"></a> [after\_hours\_suppression\_environments](#input\_after\_hours\_suppression\_environments) | A list of environment names (e.g., ['qa', 'dev']) where the after-hours suppression rule should be active. | `list(string)` | <pre>[<br/> "qa"<br/>]</pre> | no |
| <a name="input_after_hours_suppression_timezone"></a> [after\_hours\_suppression\_timezone](#input\_after\_hours\_suppression\_timezone) | The IANA timezone name for the after-hours suppression schedule. | `string` | `"Pacific/Auckland"` | no |
| <a name="input_aks_cluster_name"></a> [aks\_cluster\_name](#input\_aks\_cluster\_name) | The name of the AKS cluster to scope alerts to. | `string` | n/a | yes |
| <a name="input_alert_email_manager"></a> [alert\_email\_manager](#input\_alert\_email\_manager) | A list of email addresses for manager escalation. | `list(string)` | `[]` | no |
| <a name="input_alert_email_oncall_primary"></a> [alert\_email\_oncall\_primary](#input\_alert\_email\_oncall\_primary) | A list of email addresses for the primary on-call. | `list(string)` | `[]` | no |
| <a name="input_alert_email_oncall_secondary"></a> [alert\_email\_oncall\_secondary](#input\_alert\_email\_oncall\_secondary) | A list of email addresses for the secondary on-call. | `list(string)` | `[]` | no |
| <a name="input_alert_email_sre"></a> [alert\_email\_sre](#input\_alert\_email\_sre) | A list of email addresses for the SRE team. | `list(string)` | `[]` | no |
| <a name="input_api_server_latency_critical_ms"></a> [api\_server\_latency\_critical\_ms](#input\_api\_server\_latency\_critical\_ms) | P95 API server latency in milliseconds to trigger a critical alert. | `number` | `500` | no |
| <a name="input_api_server_latency_warning_ms"></a> [api\_server\_latency\_warning\_ms](#input\_api\_server\_latency\_warning\_ms) | P95 API server latency in milliseconds to trigger a warning. | `number` | `200` | no |
| <a name="input_azure_cost_anomaly_percentage_threshold"></a> [azure\_cost\_anomaly\_percentage\_threshold](#input\_azure\_cost\_anomaly\_percentage\_threshold) | The percentage increase in cost over the baseline to trigger an anomaly alert. | `number` | `50` | no |
| <a name="input_azure_subscription_quota_threshold"></a> [azure\_subscription\_quota\_threshold](#input\_azure\_subscription\_quota\_threshold) | The subscription resource quota usage percentage to trigger an alert. | `number` | `85` | no |
| <a name="input_cluster_cpu_overcommit_ratio_threshold"></a> [cluster\_cpu\_overcommit\_ratio\_threshold](#input\_cluster\_cpu\_overcommit\_ratio\_threshold) | The ratio of total CPU requests to allocatable capacity to trigger an overcommit alert (e.g., 1.5 for 150%). | `number` | `1.5` | no |
| <a name="input_cluster_memory_overcommit_ratio_threshold"></a> [cluster\_memory\_overcommit\_ratio\_threshold](#input\_cluster\_memory\_overcommit\_ratio\_threshold) | The ratio of total memory requests to allocatable capacity to trigger an overcommit alert (e.g., 1.5 for 150%). | `number` | `1.5` | no |
| <a name="input_enable_after_hours_suppression"></a> [enable\_after\_hours\_suppression](#input\_enable\_after\_hours\_suppression) | If true, the after-hours alert suppression rule will be created for specified environments. | `bool` | `true` | no |
| <a name="input_enable_maintenance_window_suppression"></a> [enable\_maintenance\_window\_suppression](#input\_enable\_maintenance\_window\_suppression) | If true, the maintenance window alert suppression rule will be created. | `bool` | `true` | no |
| <a name="input_environment_name"></a> [environment\_name](#input\_environment\_name) | The name of the environment (e.g., 'qa', 'prod'). Used for naming and tagging. | `string` | n/a | yes |
| <a name="input_environment_short_prefix"></a> [environment\_short\_prefix](#input\_environment\_short\_prefix) | A short prefix for the environment (e.g., 'q', 'p'). Used for short names in resources. | `string` | n/a | yes |
| <a name="input_etcd_latency_critical_ms"></a> [etcd\_latency\_critical\_ms](#input\_etcd\_latency\_critical\_ms) | P95 etcd latency in milliseconds to trigger a critical alert. | `number` | `500` | no |
| <a name="input_etcd_latency_warning_ms"></a> [etcd\_latency\_warning\_ms](#input\_etcd\_latency\_warning\_ms) | P95 etcd latency in milliseconds to trigger a warning. | `number` | `100` | no |
| <a name="input_location"></a> [location](#input\_location) | The Azure region where resources are deployed. | `string` | n/a | yes |
| <a name="input_log_analytics_workspace_name"></a> [log\_analytics\_workspace\_name](#input\_log\_analytics\_workspace\_name) | The name of the Log Analytics Workspace to scope alerts to. | `string` | n/a | yes |
| <a name="input_maintenance_window_days_of_week"></a> [maintenance\_window\_days\_of\_week](#input\_maintenance\_window\_days\_of\_week) | A list of days of the week for the maintenance window. Allowed values are 'Sunday', 'Monday', etc. | `list(string)` | <pre>[<br/> "Sunday"<br/>]</pre> | no |
| <a name="input_maintenance_window_end_time"></a> [maintenance\_window\_end\_time](#input\_maintenance\_window\_end\_time) | The end time for the maintenance window in HH:MM:SS format. | `string` | `"04:00:00"` | no |
| <a name="input_maintenance_window_start_time"></a> [maintenance\_window\_start\_time](#input\_maintenance\_window\_start\_time) | The start time for the maintenance window in HH:MM:SS format. | `string` | `"02:00:00"` | no |
| <a name="input_maintenance_window_timezone"></a> [maintenance\_window\_timezone](#input\_maintenance\_window\_timezone) | The IANA timezone name for the maintenance window schedule (e.g., 'Pacific/Auckland', 'UTC'). | `string` | `"Pacific/Auckland"` | no |
| <a name="input_node_cpu_critical_threshold"></a> [node\_cpu\_critical\_threshold](#input\_node\_cpu\_critical\_threshold) | CPU percentage threshold for a node critical alert. | `number` | `90` | no |
| <a name="input_node_cpu_warning_threshold"></a> [node\_cpu\_warning\_threshold](#input\_node\_cpu\_warning\_threshold) | CPU percentage threshold for a node warning alert. | `number` | `80` | no |
| <a name="input_node_disk_critical_threshold"></a> [node\_disk\_critical\_threshold](#input\_node\_disk\_critical\_threshold) | Disk percentage threshold for a node critical alert. | `number` | `95` | no |
| <a name="input_node_disk_warning_threshold"></a> [node\_disk\_warning\_threshold](#input\_node\_disk\_warning\_threshold) | Disk percentage threshold for a node warning alert. | `number` | `80` | no |
| <a name="input_node_memory_critical_threshold"></a> [node\_memory\_critical\_threshold](#input\_node\_memory\_critical\_threshold) | Memory percentage threshold for a node critical alert. | `number` | `95` | no |
| <a name="input_node_memory_warning_threshold"></a> [node\_memory\_warning\_threshold](#input\_node\_memory\_warning\_threshold) | Memory percentage threshold for a node warning alert. | `number` | `85` | no |
| <a name="input_node_readiness_flapping_count"></a> [node\_readiness\_flapping\_count](#input\_node\_readiness\_flapping\_count) | The number of readiness status changes in 15 minutes to be considered 'flapping'. | `number` | `3` | no |
| <a name="input_nsg_blocked_connections_threshold"></a> [nsg\_blocked\_connections\_threshold](#input\_nsg\_blocked\_connections\_threshold) | The number of blocked connections by an NSG in 30 minutes to trigger an alert. | `number` | `100` | no |
| <a name="input_pod_container_waiting_minutes_threshold"></a> [pod\_container\_waiting\_minutes\_threshold](#input\_pod\_container\_waiting\_minutes\_threshold) | The number of minutes a container can be in a 'Waiting' state before an alert is fired. | `number` | `60` | no |
| <a name="input_pod_cpu_critical_threshold"></a> [pod\_cpu\_critical\_threshold](#input\_pod\_cpu\_critical\_threshold) | Pod CPU usage percentage vs request for a critical alert. | `number` | `90` | no |
| <a name="input_pod_cpu_warning_threshold"></a> [pod\_cpu\_warning\_threshold](#input\_pod\_cpu\_warning\_threshold) | Pod CPU usage percentage vs request for a warning alert. | `number` | `70` | no |
| <a name="input_pod_memory_critical_threshold"></a> [pod\_memory\_critical\_threshold](#input\_pod\_memory\_critical\_threshold) | Pod memory usage percentage vs request for a critical alert. | `number` | `90` | no |
| <a name="input_pod_memory_warning_threshold"></a> [pod\_memory\_warning\_threshold](#input\_pod\_memory\_warning\_threshold) | Pod memory usage percentage vs request for a warning alert. | `number` | `75` | no |
| <a name="input_pod_restart_critical_threshold"></a> [pod\_restart\_critical\_threshold](#input\_pod\_restart\_critical\_threshold) | Number of pod restarts in a 30-minute window to trigger a critical alert. | `number` | `10` | no |
| <a name="input_pod_restart_warning_threshold"></a> [pod\_restart\_warning\_threshold](#input\_pod\_restart\_warning\_threshold) | Number of pod restarts in a 30-minute window to trigger a warning. | `number` | `5` | no |
| <a name="input_resource_group_name"></a> [resource\_group\_name](#input\_resource\_group\_name) | The name of the resource group where the AKS cluster resides. | `string` | n/a | yes |
| <a name="input_runbook_base_url"></a> [runbook\_base\_url](#input\_runbook\_base\_url) | The base URL for runbooks to be included in alert notifications. | `string` | `""` | no |
| <a name="input_tags"></a> [tags](#input\_tags) | A map of tags to apply to all resources. | `map(string)` | `{}` | no |
| <a name="input_teams_webhook_critical"></a> [teams\_webhook\_critical](#input\_teams\_webhook\_critical) | The webhook URL for critical Teams notifications. | `string` | `""` | no |
| <a name="input_teams_webhook_standard"></a> [teams\_webhook\_standard](#input\_teams\_webhook\_standard) | The webhook URL for standard Teams notifications. | `string` | `""` | no |
## Outputs
| Name | Description |
| --------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------- |
| <a name="output_action_group_ids"></a> [action\_group\_ids](#output\_action\_group\_ids) | The IDs of the created action groups for standard and critical alerts. |
| <a name="output_alert_processing_rule_ids"></a> [alert\_processing\_rule\_ids](#output\_alert\_processing\_rule\_ids) | The IDs of the created alert processing (suppression) rules. |
| <a name="output_alert_summary"></a> [alert\_summary](#output\_alert\_summary) | A summary of the monitoring configuration deployed by this module. |
| <a name="output_log_alert_ids"></a> [log\_alert\_ids](#output\_log\_alert\_ids) | A map of all log-based (KQL) alert rule IDs created by this module. |
| <a name="output_metric_alert_ids"></a> [metric\_alert\_ids](#output\_metric\_alert\_ids) | A map of all metric-based alert rule IDs created by this module. |
<!-- END_TF_DOCS -->
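
## Usage example

A minimal example of consuming this module. The `source` path and all values below are illustrative; adjust them to your environment and inject secrets (webhook URLs) via your pipeline rather than committing them.

```hcl
module "aks_monitoring" {
  source = "./terraform-aks-monitoring" # hypothetical path; point at your copy of this module

  # Required inputs
  environment_name             = "qa"
  environment_short_prefix     = "q"
  location                     = "australiaeast"
  resource_group_name          = "rg-aks-qa"
  aks_cluster_name             = "aks-qa"
  log_analytics_workspace_name = "law-aks-qa"

  # Notification targets (all optional; empty defaults disable the channel)
  alert_email_sre        = ["sre@example.com"]
  teams_webhook_critical = var.teams_webhook_critical # injected, never committed

  tags = {
    owner = "platform"
  }
}
```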
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "alerts_app.tf", "size": 0, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": false}
ENCODING: utf-8
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "alerts_azure.tf", "size": 4630, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# alerts_azure.tf
#
# Contains alerts related to the Azure Resource Level.
# These monitor subscription quotas, service health, cost, and networking.
# =========================================
# 1. Azure Resource Level Alerts (4 Alerts)
# =========================================
# Azure Subscription Quota Usage
resource "azurerm_monitor_metric_alert" "subscription_quota_usage" {
name = "${var.environment_short_prefix}-alert-subscription-quota"
resource_group_name = var.resource_group_name
scopes = ["/subscriptions/${data.azurerm_client_config.current.subscription_id}"]
description = "Azure subscription resource quota usage is approaching limits."
severity = 2
auto_mitigate = true
window_size = "PT1H"
frequency = "PT15M"
criteria {
metric_namespace = "Microsoft.Capacity/resourceProviders"
metric_name = "UsagePercent"
aggregation = "Maximum"
operator = "GreaterThan"
threshold = var.azure_subscription_quota_threshold
dimension {
name = "resourceType"
operator = "Include"
values = ["virtualMachines", "cores"]
}
}
action {
action_group_id = azurerm_monitor_action_group.standard.id
}
tags = local.alert_tags
}
# Azure Service Health Incidents
resource "azurerm_monitor_activity_log_alert" "service_health_incidents" {
name = "${var.environment_short_prefix}-alert-service-health"
resource_group_name = var.resource_group_name
scopes = ["/subscriptions/${data.azurerm_client_config.current.subscription_id}"]
description = "An Azure service health incident has been reported that may affect our services."
enabled = true
location = "global"
criteria {
category = "ServiceHealth"
}
action {
action_group_id = azurerm_monitor_action_group.critical.id
}
tags = local.alert_tags
}
# Cost Anomaly Detection
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "cost_anomalies" {
name = "${var.environment_short_prefix}-alert-cost-anomalies"
resource_group_name = var.resource_group_name
location = var.location
description = "Unusual spending patterns detected."
severity = 2
evaluation_frequency = "PT6H"
window_duration = "P1D"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
Usage
| where TimeGenerated > ago(1d)
| summarize TotalCost = sum(PretaxCost) by bin(TimeGenerated, 6h)
| extend BaselineCost = 100.0 // NOTE: This baseline is an example. For a real-world scenario, a more dynamic baseline query would be needed.
| extend CostIncrease = ((TotalCost - BaselineCost) / BaselineCost) * 100.0
| project CostIncrease
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.azure_cost_anomaly_percentage_threshold
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "Cost Anomaly Detection"
alert_type = "Cause"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/cost-management"
}
}
tags = local.alert_tags
}
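# NOTE: The hard-coded BaselineCost above can be replaced with a dynamic
# baseline computed from the trailing week. A hypothetical KQL sketch
# (same Usage/PretaxCost columns as the query above) is kept commented
# here so it does not alter the deployed rule:
#
#   let Baseline = toscalar(
#       Usage
#       | where TimeGenerated between (ago(8d) .. ago(1d))
#       | summarize sum(PretaxCost) / 7.0); // average daily cost, prior 7 days
#   Usage
#   | where TimeGenerated > ago(1d)
#   | summarize TotalCost = sum(PretaxCost)
#   | extend CostIncrease = ((TotalCost - Baseline) / Baseline) * 100.0
#   | project CostIncrease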
# Network Security Group Blocks
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "nsg_blocks" {
name = "${var.environment_short_prefix}-alert-nsg-blocks"
resource_group_name = var.resource_group_name
location = var.location
description = "High number of blocked connections by NSGs."
severity = 2
evaluation_frequency = "PT15M"
window_duration = "PT30M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
AzureNetworkAnalytics_CL
| where TimeGenerated > ago(30m)
| where FlowStatus_s == "D" // Denied flows
| summarize BlockedConnections = count()
| project BlockedConnections
QUERY
time_aggregation_method = "Total"
operator = "GreaterThan"
threshold = var.nsg_blocked_connections_threshold
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "Network Security Group Blocks"
alert_type = "Cause"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/network-troubleshooting"
}
}
tags = local.alert_tags
}
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "alerts_infra.tf", "size": 14670, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# alerts_infra.tf
#
# Contains all alerts related to the core Kubernetes Infrastructure.
# This includes Nodes, Pods, Storage, and general cluster resource management.
# =========================================
# 2. Kubernetes Infrastructure Alerts
# =========================================
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 2.1 Node Infrastructure Health
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Node CPU Usage - Warning
resource "azurerm_monitor_metric_alert" "node_cpu_warning" {
name = "${var.environment_short_prefix}-alert-node-cpu-warning"
resource_group_name = var.resource_group_name
scopes = [data.azurerm_kubernetes_cluster.aks.id]
description = "Node CPU usage is consistently high (Warning level)."
severity = 2
auto_mitigate = true
window_size = "PT15M"
frequency = "PT5M"
criteria {
metric_namespace = "Microsoft.ContainerService/managedClusters"
metric_name = "node_cpu_usage_percentage"
aggregation = "Average"
operator = "GreaterThan"
threshold = var.node_cpu_warning_threshold
}
action {
action_group_id = azurerm_monitor_action_group.standard.id
}
tags = local.alert_tags
}
# Node CPU Usage - Critical
resource "azurerm_monitor_metric_alert" "node_cpu_critical" {
name = "${var.environment_short_prefix}-alert-node-cpu-critical"
resource_group_name = var.resource_group_name
scopes = [data.azurerm_kubernetes_cluster.aks.id]
description = "Node CPU usage is critically high."
severity = 1
auto_mitigate = true
window_size = "PT10M"
frequency = "PT1M"
criteria {
metric_namespace = "Microsoft.ContainerService/managedClusters"
metric_name = "node_cpu_usage_percentage"
aggregation = "Average"
operator = "GreaterThan"
threshold = var.node_cpu_critical_threshold
}
action {
action_group_id = azurerm_monitor_action_group.critical.id
}
tags = local.alert_tags
}
# Node Memory Usage - Warning
resource "azurerm_monitor_metric_alert" "node_memory_warning" {
name = "${var.environment_short_prefix}-alert-node-memory-warning"
resource_group_name = var.resource_group_name
scopes = [data.azurerm_kubernetes_cluster.aks.id]
description = "Node memory usage is consistently high (Warning level)."
severity = 2
auto_mitigate = true
window_size = "PT15M"
frequency = "PT5M"
criteria {
metric_namespace = "Microsoft.ContainerService/managedClusters"
metric_name = "node_memory_working_set_percentage"
aggregation = "Average"
operator = "GreaterThan"
threshold = var.node_memory_warning_threshold
}
action {
action_group_id = azurerm_monitor_action_group.standard.id
}
tags = local.alert_tags
}
# Node Memory Usage - Critical
resource "azurerm_monitor_metric_alert" "node_memory_critical" {
name = "${var.environment_short_prefix}-alert-node-memory-critical"
resource_group_name = var.resource_group_name
scopes = [data.azurerm_kubernetes_cluster.aks.id]
description = "Node memory usage is critically high."
severity = 1
auto_mitigate = true
window_size = "PT15M"
frequency = "PT2M"
criteria {
metric_namespace = "Microsoft.ContainerService/managedClusters"
metric_name = "node_memory_working_set_percentage"
aggregation = "Average"
operator = "GreaterThan"
threshold = var.node_memory_critical_threshold
}
action {
action_group_id = azurerm_monitor_action_group.critical.id
}
tags = local.alert_tags
}
# KubeNodeUnreachable (Node Status Issues)
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "node_status_issues" {
name = "${var.environment_short_prefix}-alert-node-unreachable"
resource_group_name = var.resource_group_name
location = var.location
description = "One or more nodes are in a 'NotReady' or 'Unknown' state."
severity = 1
evaluation_frequency = "PT5M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubeNodeInventory
| where TimeGenerated > ago(15m)
| where Status != "Ready"
| summarize by Computer, Status
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "KubeNodeUnreachable"
alert_type = "Symptom"
severity_level = "P1"
runbook_url = "${var.runbook_base_url}/node-unreachable"
}
}
tags = local.alert_tags
}
# KubeNodeReadinessFlapping
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "node_readiness_flapping" {
name = "${var.environment_short_prefix}-alert-node-readiness-flapping"
resource_group_name = var.resource_group_name
location = var.location
description = "A node's readiness status is changing frequently, indicating instability."
severity = 2
evaluation_frequency = "PT5M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubeEvents
| where TimeGenerated > ago(15m)
| where ObjectKind == "Node" and Reason has "NodeNotReady"
| summarize FlapCount = count() by Computer
| project FlapCount
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.node_readiness_flapping_count
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "KubeNodeReadinessFlapping"
alert_type = "Cause"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/node-flapping"
}
}
tags = local.alert_tags
}
# Node Pressure Events
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "node_pressure_events" {
name = "${var.environment_short_prefix}-alert-node-pressure-events"
resource_group_name = var.resource_group_name
location = var.location
description = "Node is reporting pressure for Memory, Disk, or PIDs."
severity = 1
evaluation_frequency = "PT5M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubeEvents
| where TimeGenerated > ago(15m)
| where Reason has_any ("MemoryPressure", "DiskPressure", "PIDPressure")
| summarize by Computer, Reason, Message
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "NodePressureEvent"
alert_type = "Symptom"
severity_level = "P1"
runbook_url = "${var.runbook_base_url}/node-pressure"
}
}
tags = local.alert_tags
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 2.2 Pod & Container Infrastructure Health
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Pod Restart Rate High - Warning
resource "azurerm_monitor_metric_alert" "pod_restart_alert_warning" {
name = "${var.environment_short_prefix}-alert-pod-restart-rate-warning"
resource_group_name = var.resource_group_name
scopes = [data.azurerm_kubernetes_cluster.aks.id]
description = "Pods are restarting frequently (Warning)."
severity = 2
auto_mitigate = true
window_size = "PT30M"
frequency = "PT10M"
criteria {
metric_namespace = "Microsoft.ContainerService/managedClusters"
metric_name = "kube_pod_container_status_restarts_total"
aggregation = "Total"
operator = "GreaterThan"
threshold = var.pod_restart_warning_threshold
}
action {
action_group_id = azurerm_monitor_action_group.standard.id
}
tags = local.alert_tags
}
# KubeContainerWaiting
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "kube_container_waiting" {
name = "${var.environment_short_prefix}-alert-kube-container-waiting"
resource_group_name = var.resource_group_name
location = var.location
description = "A container has been in a waiting state for an extended period."
severity = 2
evaluation_frequency = "PT30M"
window_duration = "PT1H"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubePodInventory
| where TimeGenerated > ago(2h) // look back further than the alert window so waits longer than the threshold are measurable
| where PodStatus == "Pending" or ContainerStatusReason == "ContainerCreating" // ContainerCreating is a container status reason, not a PodStatus
| summarize WaitDurationMinutes = todouble(datetime_diff('minute', max(TimeGenerated), min(TimeGenerated))) by PodName, Namespace
| project WaitDurationMinutes
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.pod_container_waiting_minutes_threshold
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "KubeContainerWaiting"
alert_type = "Symptom"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/pod-lifecycle"
}
}
tags = local.alert_tags
}
# PodUnavailableCritical
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "pod_unavailable_critical" {
name = "${var.environment_short_prefix}-alert-pod-unavailable-critical"
resource_group_name = var.resource_group_name
location = var.location
description = "A critical pod is in a non-running state (e.g., Pending, Failed, Unknown)."
severity = 0
evaluation_frequency = "PT2M"
window_duration = "PT10M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
auto_mitigation_enabled = false
criteria {
query = <<-QUERY
KubePodInventory
| where TimeGenerated > ago(10m)
| where PodStatus !in ("Running", "Succeeded", "Completed")
// Add a filter for critical pods by uncommenting and editing the line below
// | where Namespace == "app" and PodName startswith "app-critical-component"
| summarize by PodName, PodStatus, Namespace
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "PodUnavailableCritical"
alert_type = "Symptom"
severity_level = "P0"
runbook_url = "${var.runbook_base_url}/pod-unavailable"
}
}
tags = local.alert_tags
}
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 2.3 Cluster Resource Management
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# KubeCPUQuotaOvercommit
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "kube_cpu_quota_overcommit" {
name = "${var.environment_short_prefix}-alert-kube-cpu-quota-overcommit"
resource_group_name = var.resource_group_name
location = var.location
description = "Total CPU requests across all namespaces exceed the cluster's allocatable capacity."
enabled = true
severity = 2
evaluation_frequency = "PT15M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
let requests = Perf
| where TimeGenerated > ago(15m)
| where ObjectName == "K8SContainer" and CounterName == "cpuRequestNanoCores" // requests live in the Perf table, not KubePodInventory
| summarize LatestRequest = max(CounterValue) by InstanceName
| summarize PodRequests = sum(LatestRequest);
let capacity = Perf
| where TimeGenerated > ago(15m)
| where ObjectName == "K8SNode" and CounterName == "cpuAllocatableNanoCores" // allocatable capacity also comes from Perf
| summarize LatestAllocatable = max(CounterValue) by Computer
| summarize TotalCapacity = sum(LatestAllocatable);
requests
| extend joinKey = 1
| join kind=inner (capacity | extend joinKey = 1) on joinKey
| project Ratio = todouble(PodRequests) / todouble(TotalCapacity)
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.cluster_cpu_overcommit_ratio_threshold
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "KubeCPUQuotaOvercommit"
alert_type = "Cause"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/cluster-capacity"
}
}
tags = local.alert_tags
}
# KubeMemoryQuotaOvercommit
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "kube_memory_quota_overcommit" {
name = "${var.environment_short_prefix}-alert-kube-memory-quota-overcommit"
resource_group_name = var.resource_group_name
location = var.location
description = "Total Memory requests across all namespaces exceed the cluster's allocatable capacity."
enabled = true
severity = 2
evaluation_frequency = "PT15M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
let requests = Perf
| where TimeGenerated > ago(15m)
| where ObjectName == "K8SContainer" and CounterName == "memoryRequestBytes" // requests live in the Perf table, not KubePodInventory
| summarize LatestRequest = max(CounterValue) by InstanceName
| summarize PodRequests = sum(LatestRequest);
let capacity = Perf
| where TimeGenerated > ago(15m)
| where ObjectName == "K8SNode" and CounterName == "memoryAllocatableBytes" // allocatable capacity also comes from Perf
| summarize LatestAllocatable = max(CounterValue) by Computer
| summarize TotalCapacity = sum(LatestAllocatable);
requests
| extend joinKey = 1
| join kind=inner (capacity | extend joinKey = 1) on joinKey
| project Ratio = todouble(PodRequests) / todouble(TotalCapacity)
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.cluster_memory_overcommit_ratio_threshold
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "KubeMemoryQuotaOvercommit"
alert_type = "Cause"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/cluster-capacity"
}
}
tags = local.alert_tags
}
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "alerts_platform.tf", "size": 15756, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# alerts_platform.tf
#
# Contains alerts related to the Kubernetes Platform, including:
# 1. Control Plane components (API Server, etcd, Scheduler, Controller Manager).
# 2. Platform Workload health (Deployments, StatefulSets, Jobs, etc.).
# =========================================
# 3. Control Plane Health Alerts
# =========================================
# API Server Latency High - Warning
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "api_server_latency_warning" {
name = "${var.environment_short_prefix}-alert-api-server-latency-warning"
resource_group_name = var.resource_group_name
location = var.location
description = "Kubernetes API server response latency is high (Warning)."
severity = 2
evaluation_frequency = "PT5M"
window_duration = "PT10M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
# CORRECTED: The KQL query now only projects the value. The threshold logic is handled here.
query = <<-QUERY
InsightsMetrics
| where TimeGenerated > ago(10m)
| where Namespace == "kube-system" and Name == "apiserver_request_latencies_bucket"
| extend LatencyInMs = Val / 1000 // Convert microseconds to ms
| summarize P95Latency = percentile(LatencyInMs, 95)
| project P95Latency
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.api_server_latency_warning_ms
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "APIServerLatencyHighWarning"
alert_type = "Cause"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/control-plane-health"
}
}
tags = local.alert_tags
}
# API Server Latency High - Critical
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "api_server_latency_critical" {
name = "${var.environment_short_prefix}-alert-api-server-latency-critical"
resource_group_name = var.resource_group_name
location = var.location
description = "Kubernetes API server response latency is critically high."
severity = 1
evaluation_frequency = "PT2M"
window_duration = "PT5M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
InsightsMetrics
| where TimeGenerated > ago(5m)
| where Namespace == "kube-system" and Name == "apiserver_request_latencies_bucket"
| extend LatencyInMs = Val / 1000 // Convert microseconds to ms
| summarize P95Latency = percentile(LatencyInMs, 95)
| project P95Latency
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.api_server_latency_critical_ms
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "APIServerLatencyHighCritical"
alert_type = "Cause"
severity_level = "P1"
runbook_url = "${var.runbook_base_url}/control-plane-health"
}
}
tags = local.alert_tags
}
# Etcd Latency High - Warning
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "etcd_latency_warning" {
name = "${var.environment_short_prefix}-alert-etcd-latency-warning"
resource_group_name = var.resource_group_name
location = var.location
description = "etcd request latency is high (Warning)."
severity = 2
evaluation_frequency = "PT5M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
InsightsMetrics
| where TimeGenerated > ago(15m)
| where Namespace == "kube-system" and Name == "etcd_request_latencies_bucket"
| extend LatencyInMs = Val / 1000 // Convert microseconds to ms
| summarize P95Latency = percentile(LatencyInMs, 95)
| project P95Latency
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.etcd_latency_warning_ms
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "EtcdLatencyHighWarning"
alert_type = "Cause"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/control-plane-health"
}
}
tags = local.alert_tags
}
# Etcd Latency High - Critical
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "etcd_latency_critical" {
name = "${var.environment_short_prefix}-alert-etcd-latency-critical"
resource_group_name = var.resource_group_name
location = var.location
description = "etcd request latency is critically high."
severity = 1
evaluation_frequency = "PT2M"
window_duration = "PT10M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
InsightsMetrics
| where TimeGenerated > ago(10m)
| where Namespace == "kube-system" and Name == "etcd_request_latencies_bucket"
| extend LatencyInMs = Val / 1000 // Convert microseconds to ms
| summarize P95Latency = percentile(LatencyInMs, 95)
| project P95Latency
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = var.etcd_latency_critical_ms
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "EtcdLatencyHighCritical"
alert_type = "Cause"
severity_level = "P1"
runbook_url = "${var.runbook_base_url}/control-plane-health"
}
}
tags = local.alert_tags
}
# Etcd Health Issues
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "etcd_health" {
name = "${var.environment_short_prefix}-alert-etcd-health"
resource_group_name = var.resource_group_name
location = var.location
description = "etcd cluster is reporting health issues, which is critical for cluster state."
severity = 0 # Critical P0
evaluation_frequency = "PT2M"
window_duration = "PT5M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
auto_mitigation_enabled = false
criteria {
query = <<-QUERY
ContainerLogV2
| where TimeGenerated > ago(5m)
| where PodName hasprefix "etcd" // PodName exists in the ContainerLogV2 schema, not the legacy ContainerLog table
| extend Log = tostring(LogMessage)
| where Log has_any ("error", "failed", "unhealthy", "leader election")
| summarize by Log, PodName
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "EtcdHealthIssues"
alert_type = "Symptom"
severity_level = "P0"
runbook_url = "${var.runbook_base_url}/control-plane-health"
}
}
tags = local.alert_tags
}
# =========================================
# 4. Platform Workload Health Alerts
# =========================================
# KubePodCrashLooping
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "crashloop_backoff" {
name = "${var.environment_short_prefix}-alert-crashloop-backoff"
resource_group_name = var.resource_group_name
location = var.location
description = "Pods are in a CrashLoopBackOff state, indicating a recurring startup failure."
severity = 1
evaluation_frequency = "PT5M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubePodInventory
| where TimeGenerated > ago(15m)
| where ContainerStatusReason == "CrashLoopBackOff" // surfaced per container; PodStatus only carries the pod phase (Running, Pending, ...)
| summarize by Name, Namespace
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "KubePodCrashLooping"
alert_type = "Symptom"
severity_level = "P1"
runbook_url = "${var.runbook_base_url}/pod-failures"
}
}
tags = local.alert_tags
}
# KubeContainerOOMKilledCount
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "oom_killed" {
name = "${var.environment_short_prefix}-alert-oom-killed"
resource_group_name = var.resource_group_name
location = var.location
description = "A container was OOMKilled (Out of Memory), indicating memory limits are too low."
severity = 1
evaluation_frequency = "PT5M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubeEvents
| where TimeGenerated > ago(15m)
| where Reason == "OOMKilling"
| summarize by Name, Namespace, Message
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "KubeContainerOOMKilled"
alert_type = "Symptom"
severity_level = "P1"
runbook_url = "${var.runbook_base_url}/pod-oomkilled"
}
}
tags = local.alert_tags
}
# KubeDeploymentReplicasMismatch
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "deployment_replica_mismatch" {
name = "${var.environment_short_prefix}-alert-deployment-replica-mismatch"
resource_group_name = var.resource_group_name
location = var.location
description = "The number of available replicas for a deployment does not match the desired state."
severity = 2
evaluation_frequency = "PT10M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
InsightsMetrics
| where TimeGenerated > ago(15m)
| where Name == 'kube_deployment_spec_replicas'
// InsightsMetrics.Namespace is the metric namespace, not the Kubernetes namespace; read the latter from Tags
| extend deployment_name = tostring(todynamic(Tags).deployment), k8s_namespace = tostring(todynamic(Tags).namespace)
| summarize desired_replicas = max(Val) by deployment_name, k8s_namespace
| join kind=inner (
InsightsMetrics
| where TimeGenerated > ago(15m)
| where Name == 'kube_deployment_status_replicas_available'
| extend deployment_name = tostring(todynamic(Tags).deployment), k8s_namespace = tostring(todynamic(Tags).namespace)
| summarize available_replicas = max(Val) by deployment_name, k8s_namespace
) on deployment_name, k8s_namespace
| where desired_replicas > available_replicas
| summarize MismatchedCount = count()
| project MismatchedCount
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = 0
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "KubeDeploymentReplicasMismatch"
alert_type = "Symptom"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/workload-mismatch"
}
}
tags = local.alert_tags
}
# KubeJobFailed
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "kube_job_failed" {
name = "${var.environment_short_prefix}-alert-kube-job-failed"
resource_group_name = var.resource_group_name
location = var.location
description = "A Kubernetes job has failed to complete successfully."
severity = 2
evaluation_frequency = "PT10M"
window_duration = "PT20M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubePodInventory
| where TimeGenerated > ago(20m)
| where ControllerKind == "Job" and PodStatus == "Failed"
| summarize by ControllerName, Namespace
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "KubeJobFailed"
alert_type = "Symptom"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/job-failures"
}
}
tags = local.alert_tags
}
# KubeStatefulSetReplicasMismatch
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "kube_statefulset_replica_mismatch" {
name = "${var.environment_short_prefix}-alert-statefulset-replica-mismatch"
resource_group_name = var.resource_group_name
location = var.location
description = "The number of ready replicas for a StatefulSet does not match the desired state."
severity = 2
evaluation_frequency = "PT10M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
InsightsMetrics
| where TimeGenerated > ago(15m)
| where Name == 'kube_statefulset_replicas'
// InsightsMetrics.Namespace is the metric namespace, not the Kubernetes namespace; read the latter from Tags
| extend statefulset_name = tostring(todynamic(Tags).statefulset), k8s_namespace = tostring(todynamic(Tags).namespace)
| summarize desired_replicas = max(Val) by statefulset_name, k8s_namespace
| join kind=inner (
InsightsMetrics
| where TimeGenerated > ago(15m)
| where Name == 'kube_statefulset_status_replicas_ready'
| extend statefulset_name = tostring(todynamic(Tags).statefulset), k8s_namespace = tostring(todynamic(Tags).namespace)
| summarize ready_replicas = max(Val) by statefulset_name, k8s_namespace
) on statefulset_name, k8s_namespace
| where desired_replicas > ready_replicas
| summarize MismatchedCount = count()
| project MismatchedCount
QUERY
time_aggregation_method = "Maximum"
operator = "GreaterThan"
threshold = 0
}
action {
action_groups = [azurerm_monitor_action_group.standard.id]
custom_properties = {
alert_name = "KubeStatefulSetReplicasMismatch"
alert_type = "Symptom"
severity_level = "P2"
runbook_url = "${var.runbook_base_url}/workload-mismatch"
}
}
tags = local.alert_tags
}
# FailedPodScheduling
resource "azurerm_monitor_scheduled_query_rules_alert_v2" "failed_pod_scheduling" {
name = "${var.environment_short_prefix}-alert-failed-pod-scheduling"
resource_group_name = var.resource_group_name
location = var.location
description = "A pod has failed to schedule, indicating resource pressure or misconfiguration."
severity = 1
evaluation_frequency = "PT5M"
window_duration = "PT15M"
scopes = [data.azurerm_log_analytics_workspace.main.id]
criteria {
query = <<-QUERY
KubeEvents
| where TimeGenerated > ago(15m)
| where Reason == "FailedScheduling"
| summarize by Name, Namespace, Message
QUERY
time_aggregation_method = "Count"
threshold = 0
operator = "GreaterThan"
}
action {
action_groups = [azurerm_monitor_action_group.critical.id]
custom_properties = {
alert_name = "FailedPodScheduling"
alert_type = "Symptom"
severity_level = "P1"
runbook_url = "${var.runbook_base_url}/pod-scheduling"
}
}
tags = local.alert_tags
}
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "locals.tf", "size": 427, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# locals.tf
#
# Defines local variables used throughout the module.
# In this .tfvars-driven pattern, its primary role is to construct a consistent set of tags.
locals {
# Common tags to be applied to all resources created by this module.
alert_tags = merge(var.tags, {
Component = "Monitoring"
Environment = var.environment_name
Timezone = "Pacific/Auckland"
DesignDoc = "AKS-Observability"
})
}
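# For reference, merge() gives later arguments precedence on key collisions. With
# hypothetical inputs, local.alert_tags evaluates as sketched below:

```hcl
# Given var.tags = { Owner = "SRE Team" } and var.environment_name = "prod",
# local.alert_tags resolves to:
#
#   {
#     Owner       = "SRE Team"
#     Component   = "Monitoring"
#     Environment = "prod"
#     Timezone    = "Pacific/Auckland"
#     DesignDoc   = "AKS-Observability"
#   }
#
# A user-supplied tag named "Component" (or any other module-set key) would be
# overridden, since the second argument to merge() wins.
```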
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "main.tf", "size": 6043, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# main.tf
#
# Core module logic including:
# 1. Data sources to fetch information about the existing environment (RG, LAW, AKS).
# 2. Action Groups for routing standard and critical alert notifications.
# 3. Alert Processing Rules for alert suppression during maintenance and after hours.
# =========================================
# Data Sources
# =========================================
data "azurerm_resource_group" "main" {
name = var.resource_group_name
}
data "azurerm_log_analytics_workspace" "main" {
name = var.log_analytics_workspace_name
resource_group_name = var.resource_group_name
}
data "azurerm_kubernetes_cluster" "aks" {
name = var.aks_cluster_name
resource_group_name = var.resource_group_name
}
data "azurerm_client_config" "current" {}
# =========================================
# Action Groups
# =========================================
# Standard alerts action group for warning-level notifications (P2)
resource "azurerm_monitor_action_group" "standard" {
name = "${var.environment_short_prefix}-ag-standard"
resource_group_name = var.resource_group_name
short_name = "std-${var.environment_short_prefix}"
enabled = true
# Dynamically create email receivers for the SRE team list
dynamic "email_receiver" {
for_each = var.alert_email_sre
content {
name = "sre-team-${index(var.alert_email_sre, email_receiver.value)}"
email_address = email_receiver.value
use_common_alert_schema = true
}
}
dynamic "webhook_receiver" {
for_each = var.teams_webhook_standard != "" ? [1] : []
content {
name = "teams-standard"
service_uri = var.teams_webhook_standard
use_common_alert_schema = true
}
}
tags = local.alert_tags
}
# Critical alerts action group for high-priority notifications (P0, P1)
resource "azurerm_monitor_action_group" "critical" {
name = "${var.environment_short_prefix}-ag-critical"
resource_group_name = var.resource_group_name
short_name = "crit-${var.environment_short_prefix}"
enabled = true
# Dynamically create email receivers for the primary on-call list
dynamic "email_receiver" {
for_each = var.alert_email_oncall_primary
content {
name = "oncall-primary-${index(var.alert_email_oncall_primary, email_receiver.value)}"
email_address = email_receiver.value
use_common_alert_schema = true
}
}
# Dynamically create email receivers for the secondary on-call list
dynamic "email_receiver" {
for_each = var.alert_email_oncall_secondary
content {
name = "oncall-secondary-${index(var.alert_email_oncall_secondary, email_receiver.value)}"
email_address = email_receiver.value
use_common_alert_schema = true
}
}
# Dynamically create email receivers for the manager escalation list
dynamic "email_receiver" {
for_each = var.alert_email_manager
content {
name = "manager-escalation-${index(var.alert_email_manager, email_receiver.value)}"
email_address = email_receiver.value
use_common_alert_schema = true
}
}
dynamic "webhook_receiver" {
for_each = var.teams_webhook_critical != "" ? [1] : []
content {
name = "teams-critical"
service_uri = var.teams_webhook_critical
use_common_alert_schema = true
}
}
tags = local.alert_tags
}
# =========================================
# Alert Processing Rules - Flood Prevention
# =========================================
# Maintenance window suppression
resource "azurerm_monitor_alert_processing_rule_suppression" "maintenance_window" {
name = "${var.environment_short_prefix}-suppress-maintenance"
resource_group_name = var.resource_group_name
scopes = [data.azurerm_resource_group.main.id]
description = "Suppress non-critical alerts during the planned maintenance window."
enabled = var.enable_maintenance_window_suppression
condition {
severity {
operator = "NotEquals"
values = ["Sev0"] # Suppress everything except Critical P0 alerts
}
}
schedule {
# This effective_from date can remain static; it just marks when the rule becomes active.
effective_from = "2025-01-01T00:00:00Z"
time_zone = var.maintenance_window_timezone
# 'daily' and 'weekly' blocks are evaluated as independent recurrences, so pairing
# a daily time window with a weekly day filter would suppress alerts every day.
# A single 'weekly' block carries both the day filter and the time window.
recurrence {
weekly {
days_of_week = var.maintenance_window_days_of_week
start_time = var.maintenance_window_start_time
end_time = var.maintenance_window_end_time
}
}
}
tags = local.alert_tags
}
# Low severity suppression during non-business hours for QA environments ONLY
resource "azurerm_monitor_alert_processing_rule_suppression" "after_hours_low_severity" {
# This resource is created only when the current environment name appears in the configured list.
count = contains(var.after_hours_suppression_environments, var.environment_name) ? 1 : 0
name = "${var.environment_short_prefix}-suppress-afterhours"
resource_group_name = var.resource_group_name
scopes = [data.azurerm_resource_group.main.id]
description = "Suppress low severity alerts outside business hours."
enabled = var.enable_after_hours_suppression
condition {
severity {
operator = "Equals"
values = ["Sev2", "Sev3", "Sev4"]
}
}
schedule {
effective_from = "2025-01-01T00:00:00Z"
time_zone = var.after_hours_suppression_timezone
recurrence {
# NOTE: the API may reject an overnight window where start_time is later than
# end_time; if so, split it into two daily blocks (e.g. 18:00-23:59:59 and 00:00-08:00).
daily {
start_time = var.after_hours_start_time
end_time = var.after_hours_end_time
}
}
}
tags = local.alert_tags
}
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "outputs.tf", "size": 5055, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# outputs.tf
#
# Defines the output values from this module.
# These outputs can be used by other resources in the root Terraform project.
# =========================================
# Output: Action Groups
# =========================================
output "action_group_ids" {
description = "The IDs of the created action groups for standard and critical alerts."
value = {
standard = azurerm_monitor_action_group.standard.id
critical = azurerm_monitor_action_group.critical.id
}
}
# =========================================
# Output: Alert Processing Rules
# =========================================
output "alert_processing_rule_ids" {
description = "The IDs of the created alert processing (suppression) rules."
value = {
maintenance_suppression = azurerm_monitor_alert_processing_rule_suppression.maintenance_window.id
# Note: after_hours_suppression is conditional and may not exist for prod environments.
after_hours_suppression = length(azurerm_monitor_alert_processing_rule_suppression.after_hours_low_severity) > 0 ? azurerm_monitor_alert_processing_rule_suppression.after_hours_low_severity[0].id : null
}
}
# =========================================
# Output: Alert Rule IDs
# =========================================
output "metric_alert_ids" {
description = "A map of all metric-based alert rule IDs created by this module."
value = {
# Azure Level
subscription_quota_usage = azurerm_monitor_metric_alert.subscription_quota_usage.id
# Node Level
node_cpu_warning = azurerm_monitor_metric_alert.node_cpu_warning.id
node_cpu_critical = azurerm_monitor_metric_alert.node_cpu_critical.id
node_memory_warning = azurerm_monitor_metric_alert.node_memory_warning.id
node_memory_critical = azurerm_monitor_metric_alert.node_memory_critical.id
# Pod Level
pod_restart_rate_warning = azurerm_monitor_metric_alert.pod_restart_alert_warning.id
# Note: a critical restart alert resource would need to be added to alerts_infra.tf before it can be output here.
}
}
output "log_alert_ids" {
description = "A map of all log-based (KQL) alert rule IDs created by this module."
value = {
# Azure Level
cost_anomalies = azurerm_monitor_scheduled_query_rules_alert_v2.cost_anomalies.id
nsg_blocks = azurerm_monitor_scheduled_query_rules_alert_v2.nsg_blocks.id
# Infrastructure - Node Level
node_status_issues = azurerm_monitor_scheduled_query_rules_alert_v2.node_status_issues.id
node_readiness_flapping = azurerm_monitor_scheduled_query_rules_alert_v2.node_readiness_flapping.id
node_pressure_events = azurerm_monitor_scheduled_query_rules_alert_v2.node_pressure_events.id
# Infrastructure - Pod & Container Level
kube_container_waiting = azurerm_monitor_scheduled_query_rules_alert_v2.kube_container_waiting.id
pod_unavailable_critical = azurerm_monitor_scheduled_query_rules_alert_v2.pod_unavailable_critical.id
# Infrastructure - Cluster Resource Management
cluster_cpu_overcommit = azurerm_monitor_scheduled_query_rules_alert_v2.kube_cpu_quota_overcommit.id
cluster_memory_overcommit = azurerm_monitor_scheduled_query_rules_alert_v2.kube_memory_quota_overcommit.id
# Platform - Control Plane
api_server_latency_warning = azurerm_monitor_scheduled_query_rules_alert_v2.api_server_latency_warning.id
api_server_latency_critical = azurerm_monitor_scheduled_query_rules_alert_v2.api_server_latency_critical.id
etcd_latency_warning = azurerm_monitor_scheduled_query_rules_alert_v2.etcd_latency_warning.id
etcd_latency_critical = azurerm_monitor_scheduled_query_rules_alert_v2.etcd_latency_critical.id
etcd_health = azurerm_monitor_scheduled_query_rules_alert_v2.etcd_health.id
# Platform - Workload Health
crashloop_backoff = azurerm_monitor_scheduled_query_rules_alert_v2.crashloop_backoff.id
oom_killed = azurerm_monitor_scheduled_query_rules_alert_v2.oom_killed.id
deployment_replica_mismatch = azurerm_monitor_scheduled_query_rules_alert_v2.deployment_replica_mismatch.id
job_failed = azurerm_monitor_scheduled_query_rules_alert_v2.kube_job_failed.id
job_stale = azurerm_monitor_scheduled_query_rules_alert_v2.kube_job_stale.id
statefulset_replica_mismatch = azurerm_monitor_scheduled_query_rules_alert_v2.kube_statefulset_replica_mismatch.id
failed_pod_scheduling = azurerm_monitor_scheduled_query_rules_alert_v2.failed_pod_scheduling.id
}
}
# =========================================
# Output: Summary
# =========================================
output "alert_summary" {
description = "A summary of the monitoring configuration deployed by this module."
value = {
module_pattern = ".tfvars driven for maximum flexibility"
coverage_scope = "Azure Resources, AKS Infrastructure, and Kubernetes Platform"
total_alerts_defined = "30+" # Approximate count of alert resources defined by this module.
}
}
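# The outputs above are meant to be consumed from a root configuration. A minimal
# sketch follows; the module path, names, and values are illustrative, not part of
# this repository:

```hcl
module "aks_monitoring" {
  source = "./modules/aks-monitoring" # hypothetical path

  environment_name             = "qa"
  environment_short_prefix     = "q"
  location                     = "australiaeast"
  resource_group_name          = "qa-app-aks-rg"
  log_analytics_workspace_name = "qa-app-law"
  aks_cluster_name             = "qa-app-aks"
}

# Downstream resources can attach to the same action groups via the output map.
output "critical_action_group_id" {
  value = module.aks_monitoring.action_group_ids.critical
}
```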
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "terraform.tfvars.example", "size": 5020, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": false}
ENCODING: utf-8
# -----------------------------------------------------------------------------
# EXAMPLE TERRAFORM VARIABLES
#
# Copy this file to a new file named <environment>.tfvars (e.g., prod.tfvars)
# and update the values to match your target environment.
#
# USAGE:
# terraform plan -var-file="prod.tfvars"
# terraform apply -var-file="prod.tfvars"
# -----------------------------------------------------------------------------
# -------------------------------------
# General Settings
# -------------------------------------
# The full name of the environment (e.g., "prod", "qa"). Used for naming and tagging.
environment_name = "prod"
# A short prefix for the environment (e.g., "p", "q"). Used for short names in resources.
environment_short_prefix = "p"
# The Azure region where the resources are deployed.
location = "australiaeast"
# The name of the resource group where the AKS cluster and Log Analytics Workspace reside.
resource_group_name = "prod-app-aks-rg"
# The name of the existing Log Analytics Workspace to scope alerts to.
log_analytics_workspace_name = "prod-app-law"
# The name of the existing AKS cluster to scope alerts to.
aks_cluster_name = "prod-app-aks"
# The base URL for your team's runbooks or troubleshooting guides.
runbook_base_url = "https://your-wiki.com/runbooks"
# A map of tags to apply to all created resources.
tags = {
BillingCode = "BC-12345"
Owner = "SRE Team"
}
# -------------------------------------
# Notification Settings
# -------------------------------------
# A list of email addresses for the SRE team (for standard alerts).
alert_email_sre = ["[email protected]", "[email protected]"]
# A list of email addresses for the primary on-call rotation (for critical alerts).
alert_email_oncall_primary = ["[email protected]", "[email protected]"]
# A list of email addresses for the secondary on-call rotation (for critical alerts).
alert_email_oncall_secondary = ["[email protected]"]
# A list of email addresses for manager escalation (for critical alerts).
alert_email_manager = ["[email protected]"]
# (Optional) The webhook URL for standard Teams notifications.
teams_webhook_standard = ""
# (Optional) The webhook URL for critical Teams notifications.
teams_webhook_critical = ""
# -------------------------------------
# Maintenance Window Settings
# -------------------------------------
# Set to false to disable the maintenance window suppression rule entirely.
enable_maintenance_window_suppression = true
# The IANA timezone name for the schedule.
# Example for Auckland: "Pacific/Auckland"
# Example for US East Coast: "America/New_York"
maintenance_window_timezone = "Pacific/Auckland"
# The start and end time for the maintenance window in 24-hour HH:MM:SS format.
maintenance_window_start_time = "02:00:00"
maintenance_window_end_time = "04:00:00"
# A list of days for the maintenance.
# Example for every Saturday and Sunday: ["Saturday", "Sunday"]
maintenance_window_days_of_week = ["Sunday"]
# -------------------------------------
# After-Hours Suppression Settings
# -------------------------------------
# Set to false to disable the after-hours suppression rule entirely.
enable_after_hours_suppression = true
# A list of environments where this rule should apply.
# For example, to enable for both 'qa' and 'dev', use: ["qa", "dev"]
# To disable for all, use an empty list: []
after_hours_suppression_environments = ["qa"]
# The IANA timezone name for the schedule.
after_hours_suppression_timezone = "Pacific/Auckland"
# The start and end time for the after-hours suppression window.
after_hours_start_time = "18:00:00"
after_hours_end_time = "08:00:00"
# -------------------------------------
# Alert Thresholds
# -------------------------------------
# These values are based on the "Production" environment in the design document.
# --- Azure Level Thresholds ---
azure_subscription_quota_threshold = 80
azure_cost_anomaly_percentage_threshold = 50
nsg_blocked_connections_threshold = 100
# --- Node Thresholds ---
node_cpu_warning_threshold = 75
node_cpu_critical_threshold = 85
node_memory_warning_threshold = 80
node_memory_critical_threshold = 90
node_disk_warning_threshold = 75
node_disk_critical_threshold = 90
node_readiness_flapping_count = 3
# --- Pod & Container Thresholds ---
pod_cpu_warning_threshold = 65
pod_cpu_critical_threshold = 85
pod_memory_warning_threshold = 70
pod_memory_critical_threshold = 85
pod_restart_warning_threshold = 3
pod_restart_critical_threshold = 8
pod_container_waiting_minutes_threshold = 60
# --- Control Plane Thresholds (in Milliseconds) ---
api_server_latency_warning_ms = 100
api_server_latency_critical_ms = 300
etcd_latency_warning_ms = 50
etcd_latency_critical_ms = 200
# --- Cluster & Workload Thresholds ---
cluster_cpu_overcommit_ratio_threshold = 1.5
cluster_memory_overcommit_ratio_threshold = 1.5
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "variables.tf", "size": 9198, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
#------------------------------------------------------------------------------
# General Settings
#------------------------------------------------------------------------------
variable "environment_name" {
description = "The name of the environment (e.g., 'qa', 'prod'). Used for naming and tagging."
type = string
}
variable "environment_short_prefix" {
description = "A short prefix for the environment (e.g., 'q', 'p'). Used for short names in resources."
type = string
}
variable "location" {
description = "The Azure region where resources are deployed."
type = string
}
variable "resource_group_name" {
description = "The name of the resource group where the AKS cluster resides."
type = string
}
variable "log_analytics_workspace_name" {
description = "The name of the Log Analytics Workspace to scope alerts to."
type = string
}
variable "aks_cluster_name" {
description = "The name of the AKS cluster to scope alerts to."
type = string
}
variable "runbook_base_url" {
description = "The base URL for runbooks to be included in alert notifications."
type = string
default = ""
}
variable "tags" {
description = "A map of tags to apply to all resources."
type = map(string)
default = {}
}
#------------------------------------------------------------------------------
# Notification Settings
#------------------------------------------------------------------------------
variable "alert_email_sre" {
description = "A list of email addresses for the SRE team."
type = list(string)
default = []
}
variable "alert_email_oncall_primary" {
description = "A list of email addresses for the primary on-call."
type = list(string)
default = []
}
variable "alert_email_oncall_secondary" {
description = "A list of email addresses for the secondary on-call."
type = list(string)
default = []
}
variable "alert_email_manager" {
description = "A list of email addresses for manager escalation."
type = list(string)
default = []
}
variable "teams_webhook_standard" {
description = "The webhook URL for standard Teams notifications."
type = string
default = ""
}
variable "teams_webhook_critical" {
description = "The webhook URL for critical Teams notifications."
type = string
default = ""
}
#------------------------------------------------------------------------------
# Maintenance Window Settings
#------------------------------------------------------------------------------
variable "enable_maintenance_window_suppression" {
description = "If true, the maintenance window alert suppression rule will be created."
type = bool
default = true
}
variable "maintenance_window_timezone" {
description = "The IANA timezone name for the maintenance window schedule (e.g., 'Pacific/Auckland', 'UTC')."
type = string
default = "Pacific/Auckland"
}
variable "maintenance_window_start_time" {
description = "The start time for the maintenance window in HH:MM:SS format."
type = string
default = "02:00:00"
}
variable "maintenance_window_end_time" {
description = "The end time for the maintenance window in HH:MM:SS format."
type = string
default = "04:00:00"
}
variable "maintenance_window_days_of_week" {
description = "A list of days of the week for the maintenance window. Allowed values are 'Sunday', 'Monday', etc."
type = list(string)
default = ["Sunday"]
}
#------------------------------------------------------------------------------
# After-Hours Suppression Settings
#------------------------------------------------------------------------------
variable "enable_after_hours_suppression" {
description = "If true, the after-hours alert suppression rule will be created for specified environments."
type = bool
default = true
}
variable "after_hours_suppression_environments" {
description = "A list of environment names (e.g., ['qa', 'dev']) where the after-hours suppression rule should be active."
type = list(string)
default = ["qa"]
}
variable "after_hours_suppression_timezone" {
description = "The IANA timezone name for the after-hours suppression schedule."
type = string
default = "Pacific/Auckland"
}
variable "after_hours_start_time" {
description = "The start time for after-hours suppression (e.g., start of the evening)."
type = string
default = "18:00:00"
}
variable "after_hours_end_time" {
description = "The end time for after-hours suppression (e.g., start of the morning)."
type = string
default = "08:00:00"
}
#------------------------------------------------------------------------------
# Alert Threshold Variables
# Using QA/less-sensitive values from the design doc as safe defaults.
#------------------------------------------------------------------------------
# --- Azure Level Thresholds ---
variable "azure_subscription_quota_threshold" {
description = "The subscription resource quota usage percentage to trigger an alert."
type = number
default = 85
}
variable "azure_cost_anomaly_percentage_threshold" {
description = "The percentage increase in cost over the baseline to trigger an anomaly alert."
type = number
default = 50
}
variable "nsg_blocked_connections_threshold" {
description = "The number of blocked connections by an NSG in 30 minutes to trigger an alert."
type = number
default = 100
}
# --- Node Thresholds ---
variable "node_cpu_warning_threshold" {
description = "CPU percentage threshold for a node warning alert."
type = number
default = 80
}
variable "node_cpu_critical_threshold" {
description = "CPU percentage threshold for a node critical alert."
type = number
default = 90
}
variable "node_memory_warning_threshold" {
description = "Memory percentage threshold for a node warning alert."
type = number
default = 85
}
variable "node_memory_critical_threshold" {
description = "Memory percentage threshold for a node critical alert."
type = number
default = 95
}
variable "node_disk_warning_threshold" {
description = "Disk percentage threshold for a node warning alert."
type = number
default = 80
}
variable "node_disk_critical_threshold" {
description = "Disk percentage threshold for a node critical alert."
type = number
default = 95
}
variable "node_readiness_flapping_count" {
description = "The number of readiness status changes in 15 minutes to be considered 'flapping'."
type = number
default = 3
}
# --- Pod & Container Thresholds ---
variable "pod_cpu_warning_threshold" {
description = "Pod CPU usage percentage vs request for a warning alert."
type = number
default = 70
}
variable "pod_cpu_critical_threshold" {
description = "Pod CPU usage percentage vs request for a critical alert."
type = number
default = 90
}
variable "pod_memory_warning_threshold" {
description = "Pod memory usage percentage vs request for a warning alert."
type = number
default = 75
}
variable "pod_memory_critical_threshold" {
description = "Pod memory usage percentage vs request for a critical alert."
type = number
default = 90
}
variable "pod_restart_warning_threshold" {
description = "Number of pod restarts in a 30-minute window to trigger a warning."
type = number
default = 5
}
variable "pod_restart_critical_threshold" {
description = "Number of pod restarts in a 30-minute window to trigger a critical alert."
type = number
default = 10
}
variable "pod_container_waiting_minutes_threshold" {
description = "The number of minutes a container can be in a 'Waiting' state before an alert is fired."
type = number
default = 60
}
# --- Control Plane Thresholds (in Milliseconds) ---
variable "api_server_latency_warning_ms" {
description = "P95 API server latency in milliseconds to trigger a warning."
type = number
default = 200
}
variable "api_server_latency_critical_ms" {
description = "P95 API server latency in milliseconds to trigger a critical alert."
type = number
default = 500
}
variable "etcd_latency_warning_ms" {
description = "P95 etcd latency in milliseconds to trigger a warning."
type = number
default = 100
}
variable "etcd_latency_critical_ms" {
description = "P95 etcd latency in milliseconds to trigger a critical alert."
type = number
default = 500
}
# --- Cluster & Workload Thresholds ---
variable "cluster_cpu_overcommit_ratio_threshold" {
description = "The ratio of total CPU requests to allocatable capacity to trigger an overcommit alert (e.g., 1.5 for 150%)."
type = number
default = 1.5
}
variable "cluster_memory_overcommit_ratio_threshold" {
description = "The ratio of total memory requests to allocatable capacity to trigger an overcommit alert (e.g., 1.5 for 150%)."
type = number
default = 1.5
}
=== FILE_SEPARATOR ===
FILE_METADATA: {"path": "versions.tf", "size": 499, "mtime": 1758483035.6279178, "mode": 33188, "encoding": "utf-8", "checksum": null, "mime_type": null, "is_binary": false, "error": null, "ends_with_newline": true}
ENCODING: utf-8
# versions.tf
#
# Specifies the version constraints for Terraform and the required providers.
# This ensures that the module is used with compatible versions, preventing
# unexpected errors or breaking changes.
terraform {
required_version = "~> 1.3"
required_providers {
azurerm = {
source = "hashicorp/azurerm"
# Updated to reflect the current major version for access to the latest features.
# Allows versions >= 4.0.0 and < 5.0.0
version = "~> 4.0"
}
}
}
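# As a quick reference for the pessimistic constraint operator used above (the
# version numbers below are illustrative):

```hcl
# version = "~> 4.0"    allows >= 4.0.0, < 5.0.0  (any 4.x release)
# version = "~> 4.2"    allows >= 4.2.0, < 5.0.0
# version = "~> 4.2.1"  allows >= 4.2.1, < 4.3.0  (patch releases only)
#
# The same rule applies to required_version = "~> 1.3": >= 1.3.0, < 2.0.0.
```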