Created
March 19, 2025 09:06
-
-
Save davidlu1001/c64dfeb819dce17c16782072c9ee1e56 to your computer and use it in GitHub Desktop.
failoverMonitor.ps1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # FailoverMonitor.ps1 | |
| # Automatic Failover Monitoring and Execution Script | |
| # This script monitors Windows Events forwarded from servers and triggers | |
| # DNS failover when error patterns are detected that meet threshold criteria. | |
| # | |
| # Required dependencies: | |
| # - dnsFailover.ps1 in the same directory | |
| # - Windows Event Forwarding properly configured | |
| # - Appropriate permissions for DNS and IIS operations | |
[CmdletBinding()]
param (
    # One-shot mode: import the DNS failover functions, load the configuration,
    # set up the event subscription, then exit.
    [Parameter()] [switch]$Initialize,

    # One-shot mode: feed a simulated error event through the processing pipeline, then exit.
    [Parameter()] [switch]$TestError,

    # Source server name used for the simulated error (-TestError).
    [Parameter()] [string]$Server = $env:COMPUTERNAME,

    # Error type used for the simulated error (-TestError).
    [Parameter()] [string]$ErrorType = "COMException",

    # Location of the JSON configuration file.
    [Parameter()] [string]$ConfigPath = "$PSScriptRoot\failover_config.json",

    # Location of the JSON file that tracks recent errors and failovers.
    [Parameter()] [string]$StateFilePath = "$PSScriptRoot\error_state.json",

    # Location of the log file written by Write-Log.
    [Parameter()] [string]$LogFilePath = "$PSScriptRoot\FailoverMonitor.log",

    # With -TestError: request a failover regardless of threshold and cooldown.
    [Parameter()] [switch]$Force
)

# Script version (reported in the start-up banner)
$script:Version = "1.0.0"

# Strict mode turns reads of unset variables and missing properties into errors,
# and 'Stop' makes non-terminating cmdlet errors catchable by try/catch.
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'
| #region Functions | |
# Function for handling log messages.
# Formats "[timestamp] [LEVEL] message", optionally echoes it to the console in a
# level-specific colour, and appends it to the file named by the script-level
# $LogFilePath. File errors are reported with Write-Warning and never thrown.
#   -Message    Text to log (mandatory, non-empty).
#   -Level      INFO | WARNING | ERROR | SUCCESS | DEBUG (default INFO).
#   -NoConsole  Suppress console output; the file is still written.
function Write-Log {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$Message,
        [Parameter(Mandatory = $false)]
        [ValidateSet("INFO", "WARNING", "ERROR", "SUCCESS", "DEBUG")]
        [string]$Level = "INFO",
        [Parameter(Mandatory = $false)]
        [switch]$NoConsole
    )

    $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
    $logMessage = "[$timestamp] [$Level] $Message"

    # Write to console with appropriate color if not suppressed.
    # DEBUG messages are only shown when verbose output is enabled.
    if (-not $NoConsole) {
        $colors = @{
            INFO    = 'Cyan'
            WARNING = 'Yellow'
            ERROR   = 'Red'
            SUCCESS = 'Green'
            DEBUG   = 'Gray'
        }
        if ($Level -ne 'DEBUG' -or $VerbosePreference -eq 'Continue') {
            Write-Host $logMessage -ForegroundColor $colors[$Level]
        }
    }

    # Append to log file
    try {
        # Split-Path returns an empty string for a bare file name; Test-Path rejects
        # an empty path, so only probe/create the directory when there is one.
        $logDir = Split-Path -Path $LogFilePath -Parent
        if ($logDir -and -not (Test-Path -Path $logDir -PathType Container)) {
            New-Item -Path $logDir -ItemType Directory -Force | Out-Null
        }
        Add-Content -Path $LogFilePath -Value $logMessage -ErrorAction Stop
    }
    catch {
        Write-Warning "Failed to write to log file: $_"
    }
}
# Function to rotate log files.
# When the file at -LogPath is larger than -MaxSizeMB it is renamed to
# "<base>_<yyyyMMdd-HHmmss><ext>" in the same directory, and all but the newest
# -FilesToKeep rotated files are deleted. Failures are logged, not thrown.
#   -LogPath      Log file to check.
#   -MaxSizeMB    Size limit in megabytes (default 10).
#   -FilesToKeep  Number of rotated files to retain (default 5).
function Rotate-LogFile {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$LogPath,
        [Parameter(Mandatory = $false)]
        [int]$MaxSizeMB = 10,
        [Parameter(Mandatory = $false)]
        [int]$FilesToKeep = 5
    )

    if (-not (Test-Path $LogPath)) {
        return
    }

    $logFile = Get-Item $LogPath
    if ($logFile.Length -le ($MaxSizeMB * 1MB)) {
        return
    }

    Write-Log "Log file size limit reached. Rotating logs..." -Level "DEBUG"

    # Take directory and base name from the resolved item: this also works for a
    # relative -LogPath and keeps dotted names ("my.app.log" -> "my.app") intact.
    $directory = $logFile.DirectoryName
    $baseName = $logFile.BaseName
    $extension = if ($logFile.Extension) { $logFile.Extension } else { ".log" }
    $timestamp = Get-Date -Format "yyyyMMdd-HHmmss"
    $rotatedName = "$($baseName)_$($timestamp)$extension"

    try {
        # Rename-Item expects a name (not a path) for -NewName
        Rename-Item -LiteralPath $logFile.FullName -NewName $rotatedName -Force
        Write-Log "Log file rotated to: $(Join-Path $directory $rotatedName)" -Level "DEBUG"

        # Clean up old rotated files. The "<base>_*" pattern matches only files
        # produced by this rotation scheme, never the live log or unrelated logs
        # that merely share a prefix.
        $oldLogs = @(Get-ChildItem -Path $directory -Filter "$($baseName)_*$extension" -File |
                Sort-Object LastWriteTime -Descending |
                Select-Object -Skip $FilesToKeep)
        foreach ($old in $oldLogs) {
            Remove-Item -LiteralPath $old.FullName -Force
            Write-Log "Removed old log file: $($old.Name)" -Level "DEBUG"
        }
    }
    catch {
        Write-Log "Failed to rotate log file: $_" -Level "ERROR"
    }
}
# Function to read the configuration file.
# Loads the JSON file at the script-level $ConfigPath and returns it as an object
# in which every known setting is present and non-empty (missing or empty values
# are replaced by the built-in defaults). When required settings were missing the
# completed configuration is written back; when the file does not exist a default
# one is created. Read/parse errors are logged and re-thrown.
function Get-FailoverConfig {
    [CmdletBinding()]
    param()

    # Built-in defaults for every setting the script reads
    $defaults = [ordered]@{
        dnsServer                   = "dns1.company.com"
        lookupZone                  = "company.local"
        dnsName                     = "LendingWebServer"
        errorThreshold              = 3
        windowMinutes               = 10
        cooldownMinutes             = 30
        processInactiveServerErrors = $false
        Env                         = "Prod"
        mailSettings                = [PSCustomObject]@{
            smtpServer = "smtp.company.com"
            from       = "[email protected]"
            to         = @("[email protected]")
            enableSsl  = $false
            port       = 25
        }
    }

    try {
        if (Test-Path $ConfigPath) {
            $config = Get-Content $ConfigPath -Raw | ConvertFrom-Json

            # Validate required properties. -notcontains is case-insensitive, which
            # matches how PowerShell resolves property names; @() keeps .Count valid
            # under strict mode when nothing is missing.
            $existingNames = @($config.PSObject.Properties | ForEach-Object { $_.Name })
            $requiredProps = @('dnsServer', 'lookupZone', 'dnsName', 'errorThreshold',
                'windowMinutes', 'cooldownMinutes', 'Env')
            $missingProps = @($requiredProps | Where-Object { $existingNames -notcontains $_ })
            if ($missingProps.Count -gt 0) {
                Write-Log "Configuration file is missing required properties: $($missingProps -join ', ')" -Level "WARNING"
            }

            # Make sure every known property exists (optional ones included) so that
            # reading it below, and in callers, cannot fail under Set-StrictMode.
            foreach ($name in @($defaults.Keys)) {
                if ($existingNames -notcontains $name) {
                    Add-Member -InputObject $config -MemberType NoteProperty -Name $name -Value $null
                }
            }

            # Apply default values for null/empty properties
            foreach ($name in @($defaults.Keys)) {
                if (-not $config.$name) {
                    $config.$name = $defaults[$name]
                }
            }

            # Save updated config if there were missing properties
            if ($missingProps.Count -gt 0) {
                $config | ConvertTo-Json -Depth 5 | Set-Content $ConfigPath
            }
            return $config
        }
        else {
            # Create default configuration (same object shape as the loaded case)
            $config = [PSCustomObject]$defaults

            # Create directory if it doesn't exist
            $configDir = Split-Path -Path $ConfigPath -Parent
            if ($configDir -and -not (Test-Path -Path $configDir -PathType Container)) {
                New-Item -Path $configDir -ItemType Directory -Force | Out-Null
            }
            $config | ConvertTo-Json -Depth 5 | Set-Content $ConfigPath
            Write-Log "Created default configuration file at $ConfigPath" -Level "INFO"
            return $config
        }
    }
    catch {
        Write-Log "Error reading configuration: $_" -Level "ERROR"
        throw
    }
}
# Function to get DNS Failover script path.
# Looks for dnsFailover.ps1 next to this script and then in the parent directory.
# Returns the first path that exists; logs an error and throws when neither does.
function Get-DNSFailoverScriptPath {
    [CmdletBinding()]
    param()

    $candidates = @(Join-Path $PSScriptRoot "dnsFailover.ps1")

    # Split-Path yields an empty string when there is no parent directory;
    # Join-Path would reject that, so only add the fallback when a parent exists.
    $parentDir = Split-Path $PSScriptRoot -Parent
    if ($parentDir) {
        $candidates += Join-Path $parentDir "dnsFailover.ps1"
    }

    foreach ($candidate in $candidates) {
        if (Test-Path $candidate) {
            return $candidate
        }
    }

    Write-Log "DNS Failover script not found at expected locations. Please specify path in config." -Level "ERROR"
    # Name every location that was probed, not just the last one
    throw "DNS Failover script not found. Expected at: $($candidates -join ' or ')"
}
# Import the Get-ActiveHost function from dnsFailover.ps1.
# Resolves the script via Get-DNSFailoverScriptPath, loads it with Import-Module and
# verifies that a Get-ActiveHost command is available afterwards. Any failure is
# logged and re-thrown to the caller.
# NOTE(review): importing a .ps1 presumably also runs that script's top-level
# code - confirm dnsFailover.ps1 is safe to load this way.
function Import-DNSFailoverFunctions {
    [CmdletBinding()]
    param()

    try {
        Import-Module (Get-DNSFailoverScriptPath) -Force -DisableNameChecking

        # The rest of this script depends on Get-ActiveHost being present
        $activeHostCommand = Get-Command "Get-ActiveHost" -ErrorAction SilentlyContinue
        if ($null -eq $activeHostCommand) {
            Write-Log "The Get-ActiveHost function was not found in the imported script." -Level "ERROR"
            throw "Required function 'Get-ActiveHost' not found in DNS Failover script"
        }

        Write-Log "Successfully imported functions from DNS Failover script" -Level "DEBUG"
    }
    catch {
        Write-Log "Failed to import DNS Failover functions: $_" -Level "ERROR"
        throw
    }
}
# Check if a server is the active server using the dnsFailover functions.
# Asks Get-ActiveHost for the current active host and compares it with -ServerName,
# ignoring case and everything after the first dot in either name.
#   -ServerName  Server to test.
#   -Config      Configuration object providing dnsName, dnsServer and lookupZone.
# Returns $true on a match; $false otherwise, including when the lookup fails.
function Test-IsActiveServer {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$ServerName,
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$Config
    )

    try {
        $activeHost = Get-ActiveHost -dnsName $Config.dnsName -dnsServer $Config.dnsServer -lookupZone $Config.lookupZone

        Write-Log "Active server check - Current: $ServerName, Active: $activeHost" -Level "DEBUG"

        # Strip the domain part (first dot onwards) from both names before comparing
        $domainSuffix = '\..*$'
        $wanted = $ServerName -replace $domainSuffix, ''
        $current = $activeHost -replace $domainSuffix, ''
        return ($wanted -ieq $current)
    }
    catch {
        Write-Log "Error checking if server is active: $_" -Level "ERROR"
        # A failed lookup is treated as "not active"
        return $false
    }
}
# Get or initialize the error state tracking file.
# Reads the JSON file at the script-level $StateFilePath and returns it with every
# field the script uses guaranteed to exist: errors, last_failover, active_server,
# last_trigger, last_from_server, failover_count_24h. When the file is missing it
# is created with empty defaults. On any error the problem is logged and a default
# in-memory state is returned instead (nothing is thrown).
function Get-ErrorState {
    [CmdletBinding()]
    param()

    $defaults = [ordered]@{
        errors             = @()
        last_failover      = $null
        active_server      = $null
        last_trigger       = $null
        last_from_server   = $null
        failover_count_24h = 0
    }

    try {
        if (Test-Path $StateFilePath) {
            $state = Get-Content $StateFilePath -Raw | ConvertFrom-Json

            # Add any field that an older or hand-edited state file lacks; assigning
            # to a property that does not exist on the object would fail later.
            foreach ($name in @($defaults.Keys)) {
                if (-not (Get-Member -InputObject $state -Name $name -MemberType Properties)) {
                    $state | Add-Member -MemberType NoteProperty -Name $name -Value $defaults[$name]
                }
            }

            # "errors": null would break array appends in callers
            if ($null -eq $state.errors) {
                $state.errors = @()
            }
            return $state
        }

        # Create directory if it doesn't exist
        $stateDir = Split-Path -Path $StateFilePath -Parent
        if ($stateDir -and -not (Test-Path -Path $stateDir -PathType Container)) {
            New-Item -Path $stateDir -ItemType Directory -Force | Out-Null
        }

        # Initialize new state file (same object shape as the loaded case)
        $state = [PSCustomObject]$defaults
        $state | ConvertTo-Json -Depth 5 | Set-Content $StateFilePath
        Write-Log "Initialized new error state file" -Level "DEBUG"
        return $state
    }
    catch {
        Write-Log "Error getting error state: $_" -Level "ERROR"
        # Return a default state object if there's an error
        return [PSCustomObject]$defaults
    }
}
# Save the current error state.
# Serialises -State to JSON (depth 5) and writes it to the script-level
# $StateFilePath. A failure is logged as an error and is not propagated.
function Save-ErrorState {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$State
    )

    try {
        $json = ConvertTo-Json -InputObject $State -Depth 5
        Set-Content -Path $StateFilePath -Value $json
        Write-Log "Error state saved successfully" -Level "DEBUG"
    }
    catch {
        Write-Log "Failed to save error state: $_" -Level "ERROR"
    }
}
# Update error occurrences and check thresholds.
# Appends one error entry to the persisted state, drops entries older than
# Config.windowMinutes, saves the state, and counts the remaining entries that
# match -ErrorType and -SourceServer (entries from an inactive server only count
# when Config.processInactiveServerErrors is set).
#   -ErrorType / -SourceServer  Identify the error being recorded.
#   -Config        Configuration object (windowMinutes, errorThreshold, ...).
#   -EventMessage  Event text; stored truncated to 500 characters.
#   -EventId       Event identifier.
# Returns a hashtable with Count, IsActive and RelevantErrors. On failure the error
# is logged and Count = 0 / IsActive = $false is returned.
function Update-ErrorOccurrence {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$ErrorType,
        [Parameter(Mandatory = $true)]
        [string]$SourceServer,
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$Config,
        [Parameter(Mandatory = $false)]
        [string]$EventMessage = "",
        [Parameter(Mandatory = $false)]
        [int]$EventId = 0
    )

    # Timestamps are written with ToString('o'). Depending on the PowerShell version
    # ConvertFrom-Json hands them back either as a string or as a DateTime, so accept
    # both and parse strings culture-independently.
    $toDateTime = {
        param($value)
        if ($value -is [DateTime]) { return $value }
        return [DateTime]::Parse([string]$value,
            [System.Globalization.CultureInfo]::InvariantCulture,
            [System.Globalization.DateTimeStyles]::RoundtripKind)
    }

    try {
        # Get current state
        $state = Get-ErrorState
        $now = Get-Date

        # Check if server is active
        $isActive = Test-IsActiveServer -ServerName $SourceServer -Config $Config

        # Add new error entry
        $storedMessage = if ($EventMessage.Length -gt 500) { $EventMessage.Substring(0, 500) + "..." } else { $EventMessage }
        $errorEntry = [PSCustomObject]@{
            type             = $ErrorType
            timestamp        = $now.ToString('o')
            server           = $SourceServer
            is_active_server = $isActive
            event_id         = $EventId
            event_message    = $storedMessage
        }

        # Treat a null error list as empty so the append always yields an array
        $knownErrors = if ($null -ne $state.errors) { @($state.errors) } else { @() }
        $allErrors = $knownErrors + $errorEntry

        # Clean up old errors outside the time window
        $cutoffTime = $now.AddMinutes(-$Config.windowMinutes)
        $state.errors = @($allErrors | Where-Object { (& $toDateTime $_.timestamp) -gt $cutoffTime })

        # Save updated state
        Save-ErrorState -State $state

        # Filter errors based on configuration
        $relevantErrors = @($state.errors | Where-Object {
                $_.type -eq $ErrorType -and
                $_.server -eq $SourceServer -and
                ($_.is_active_server -or $Config.processInactiveServerErrors)
            })
        $count = $relevantErrors.Count
        $serverType = if ($isActive) { "active" } else { "inactive" }
        Write-Log "Detected '$ErrorType' error (EventID: $EventId) from $serverType server: $SourceServer. Current count: $count/$($Config.errorThreshold) in $($Config.windowMinutes)min window" -Level "INFO"

        return @{
            Count          = $count
            IsActive       = $isActive
            RelevantErrors = $relevantErrors
        }
    }
    catch {
        Write-Log "Error updating error occurrence: $_" -Level "ERROR"
        return @{ Count = 0; IsActive = $false; RelevantErrors = @() }
    }
}
# Check if the system is in cooldown period after a failover.
# Compares the persisted last_failover time plus Config.cooldownMinutes with now.
#   -Config  Configuration object providing cooldownMinutes.
# Returns a hashtable: @{ InCooldown = $false } or, while cooling down,
# InCooldown = $true together with RemainingMinutes (rounded up), CooldownEnd and
# LastFailover. If the check itself fails the error is logged and the system is
# reported as NOT in cooldown.
function Test-FailoverCooldown {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$Config
    )

    try {
        $state = Get-ErrorState

        # If no previous failover, no cooldown applies
        if (-not $state.last_failover) {
            return @{ InCooldown = $false }
        }

        # last_failover is written with ToString('o'); ConvertFrom-Json may return it
        # as a string or as a DateTime depending on the PowerShell version. Parse
        # strings with the invariant culture so the result does not depend on locale.
        $lastFailover = if ($state.last_failover -is [DateTime]) {
            $state.last_failover
        }
        else {
            [DateTime]::Parse([string]$state.last_failover,
                [System.Globalization.CultureInfo]::InvariantCulture,
                [System.Globalization.DateTimeStyles]::RoundtripKind)
        }

        # Check if within cooldown period
        $cooldownEnd = $lastFailover.AddMinutes($Config.cooldownMinutes)
        $now = Get-Date
        if ($now -lt $cooldownEnd) {
            return @{
                InCooldown       = $true
                RemainingMinutes = [Math]::Ceiling(($cooldownEnd - $now).TotalMinutes)
                CooldownEnd      = $cooldownEnd
                LastFailover     = $lastFailover
            }
        }
        return @{ InCooldown = $false }
    }
    catch {
        Write-Log "Error checking failover cooldown: $_" -Level "ERROR"
        return @{ InCooldown = $false } # Default to not in cooldown if error
    }
}
# Record a failover event.
# Stores the failover time, new active server, trigger text and previous server in
# the persisted state and updates the daily failover counter.
# Note: despite its name, failover_count_24h is reset when the calendar date
# changes, not on a rolling 24-hour window.
#   -Trigger     Description of what caused the failover.
#   -FromServer  Host that was active before the failover.
#   -ToServer    Host that is active after the failover.
# Errors are logged and not propagated.
function Record-Failover {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$Trigger,
        [Parameter(Mandatory = $true)]
        [string]$FromServer,
        [Parameter(Mandatory = $true)]
        [string]$ToServer
    )

    try {
        $state = Get-ErrorState
        $now = Get-Date

        # A state object loaded from an older file may lack some fields; assigning
        # to a missing property of a custom object fails, so add them first.
        # (Hashtable states accept new keys directly.)
        if ($state -isnot [System.Collections.IDictionary]) {
            foreach ($name in 'last_failover', 'active_server', 'last_trigger', 'last_from_server', 'failover_count_24h') {
                if (-not $state.PSObject.Properties[$name]) {
                    Add-Member -InputObject $state -MemberType NoteProperty -Name $name -Value $null
                }
            }
        }

        # Check if this is a new day compared to last failover
        $resetDailyCount = $true
        if ($state.last_failover) {
            # Accept both a DateTime (already converted by ConvertFrom-Json) and the
            # round-trip string; parse strings independently of the current culture.
            $lastFailover = if ($state.last_failover -is [DateTime]) {
                $state.last_failover
            }
            else {
                [DateTime]::Parse([string]$state.last_failover,
                    [System.Globalization.CultureInfo]::InvariantCulture,
                    [System.Globalization.DateTimeStyles]::RoundtripKind)
            }
            if ($lastFailover.Date -eq $now.Date) {
                $resetDailyCount = $false
            }
        }

        $state.last_failover = $now.ToString('o')
        $state.active_server = $ToServer
        $state.last_trigger = $Trigger
        $state.last_from_server = $FromServer

        # Update daily failover counter
        $previousCount = 0
        if (-not $resetDailyCount -and $state.failover_count_24h) {
            $previousCount = [int]$state.failover_count_24h
        }
        $state.failover_count_24h = $previousCount + 1

        Save-ErrorState -State $state
        Write-Log "Failover recorded: $FromServer -> $ToServer, Trigger: $Trigger, 24h count: $($state.failover_count_24h)" -Level "INFO"
    }
    catch {
        Write-Log "Error recording failover: $_" -Level "ERROR"
    }
}
# Send email notification about failover.
# Builds an HTML message describing the failover and sends it with Send-MailMessage
# using Config.mailSettings (smtpServer, from, to; optional port, enableSsl and
# credential with username/password).
#   -OldServer / -NewServer  Hosts before and after the failover.
#   -ErrorType / -EventId    What triggered the failover.
#   -EventMessage            Event text included in the mail body.
#   -Config                  Configuration object (Env, mailSettings).
# Returns $true when the mail was handed to Send-MailMessage without error,
# otherwise logs the error and returns $false.
function Send-FailoverNotification {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$OldServer,
        [Parameter(Mandatory = $true)]
        [string]$NewServer,
        [Parameter(Mandatory = $true)]
        [string]$ErrorType,
        [Parameter(Mandatory = $true)]
        [int]$EventId,
        [Parameter(Mandatory = $false)]
        [string]$EventMessage = "No detailed message available",
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$Config
    )

    try {
        $mail = $Config.mailSettings

        # Reads an optional mail setting without tripping Set-StrictMode when the
        # key/property is absent; works for hashtable and object settings alike.
        $getSetting = {
            param([string]$name)
            if ($mail -is [System.Collections.IDictionary]) {
                if ($mail.Contains($name)) { return $mail[$name] }
                return $null
            }
            $property = $mail.PSObject.Properties[$name]
            if ($property) { return $property.Value }
            return $null
        }

        # System.Net.WebUtility is always loaded, unlike System.Web.HttpUtility.
        # Everything interpolated into the HTML body is encoded.
        $safeOldServer = [System.Net.WebUtility]::HtmlEncode($OldServer)
        $safeNewServer = [System.Net.WebUtility]::HtmlEncode($NewServer)
        $safeErrorType = [System.Net.WebUtility]::HtmlEncode($ErrorType)
        $safeEnv = [System.Net.WebUtility]::HtmlEncode([string]$Config.Env)
        $safeMessage = [System.Net.WebUtility]::HtmlEncode($EventMessage)

        $subject = "AUTOMATIC FAILOVER EXECUTED: $OldServer -> $NewServer"
        $body = @"
<html>
<body style="font-family: Arial, sans-serif;">
<h2 style="color: #c00;">Automatic Failover Has Been Executed</h2>
<p><strong>Time:</strong> $(Get-Date -Format "yyyy-MM-dd HH:mm:ss")</p>
<p><strong>Trigger:</strong> $safeErrorType error threshold reached (Event ID: $EventId)</p>
<p><strong>From Server:</strong> $safeOldServer</p>
<p><strong>To Server:</strong> $safeNewServer</p>
<p><strong>Environment:</strong> $safeEnv</p>
<h3>Event Details:</h3>
<pre style="background-color: #f0f0f0; padding: 10px; border: 1px solid #ddd;">$safeMessage</pre>
<p>This is an automated message from the Failover Monitoring System.</p>
</body>
</html>
"@

        $mailParams = @{
            SmtpServer = & $getSetting 'smtpServer'
            From       = & $getSetting 'from'
            To         = & $getSetting 'to'
            Subject    = $subject
            Body       = $body
            BodyAsHtml = $true
        }

        # Add optional email configuration parameters if they exist
        $port = & $getSetting 'port'
        if ($port) {
            $mailParams['Port'] = $port
        }
        if ((& $getSetting 'enableSsl') -eq $true) {
            $mailParams['UseSsl'] = $true
        }
        $credentialSettings = & $getSetting 'credential'
        if ($credentialSettings) {
            # Handle credentials - might need adjustment based on how credentials are stored
            $securePassword = ConvertTo-SecureString $credentialSettings.password -AsPlainText -Force
            $mailParams['Credential'] = New-Object System.Management.Automation.PSCredential($credentialSettings.username, $securePassword)
        }

        # Send the email
        Send-MailMessage @mailParams
        Write-Log "Failover notification email sent successfully" -Level "SUCCESS"
        return $true
    }
    catch {
        Write-Log "Failed to send failover notification email: $_" -Level "ERROR"
        return $false
    }
}
# Execute the DNS failover operation.
# Determines the current active host, runs dnsFailover.ps1 with
# -Env/-Ops complete-cycle/-dnsServer/-lookupZone, re-reads the active host,
# records the failover and sends the notification mail.
#   -Config        Configuration object (dnsName, dnsServer, lookupZone, Env, ...).
#   -ErrorType / -EventId / -EventMessage  Details of the triggering event.
# Returns a single boolean: $true when the failover script ran without a non-zero
# exit code (even if the active host did not change - that case is logged as a
# warning and still recorded), $false on any failure.
function Invoke-DNSFailover {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$Config,
        [Parameter(Mandatory = $true)]
        [string]$ErrorType,
        [Parameter(Mandatory = $true)]
        [int]$EventId,
        [Parameter(Mandatory = $false)]
        [string]$EventMessage = "No detailed message available"
    )

    try {
        # Get current active host before failover
        $params = @{
            dnsName    = $Config.dnsName
            dnsServer  = $Config.dnsServer
            lookupZone = $Config.lookupZone
        }
        $activeHost = Get-ActiveHost @params
        Write-Log "Current active host before failover: $activeHost" -Level "INFO"
        if (-not $activeHost) {
            Write-Log "Unable to determine current active host. Aborting failover." -Level "ERROR"
            return $false
        }

        # Get DNS Failover script path
        $dnsFailoverScript = Get-DNSFailoverScriptPath

        # Build parameters for DNS failover script. A hashtable is required here:
        # splatting an array binds its elements positionally, so "-Env" would be
        # passed as a value instead of naming a parameter.
        $scriptParams = @{
            Env        = $Config.Env
            Ops        = "complete-cycle"
            dnsServer  = $Config.dnsServer
            lookupZone = $Config.lookupZone
        }
        $paramText = (@('Env', 'Ops', 'dnsServer', 'lookupZone') |
                ForEach-Object { "-$_ $($scriptParams[$_])" }) -join ' '

        # Execute the failover script
        Write-Log "Executing DNS failover with parameters: $paramText" -Level "INFO"

        # Clear any exit code left behind by an earlier native command so that only
        # the failover script's own exit status is evaluated below.
        $global:LASTEXITCODE = 0

        # Anything the script writes to the pipeline is logged rather than leaked
        # into this function's return value.
        & $dnsFailoverScript @scriptParams | ForEach-Object {
            Write-Log "dnsFailover: $_" -Level "DEBUG"
        }
        if ($LASTEXITCODE -ne 0) {
            Write-Log "DNS failover script execution failed with exit code: $LASTEXITCODE" -Level "ERROR"
            return $false
        }

        # Get new active host after failover
        $newActiveHost = Get-ActiveHost @params
        Write-Log "New active host after failover: $newActiveHost" -Level "SUCCESS"

        # If the host didn't change, something went wrong
        if ($newActiveHost -eq $activeHost) {
            Write-Log "Failover appears to have failed - active host did not change" -Level "WARNING"
            # Continue anyway to record the attempt and send notification
        }

        # Record the failover
        Record-Failover -Trigger "$ErrorType threshold reached (EventID: $EventId)" -FromServer $activeHost -ToServer $newActiveHost

        # Send notification; its boolean result must not become part of our output
        $null = Send-FailoverNotification -OldServer $activeHost -NewServer $newActiveHost -ErrorType $ErrorType -EventId $EventId -EventMessage $EventMessage -Config $Config

        Write-Log "Failover process completed: $activeHost -> $newActiveHost" -Level "SUCCESS"
        return $true
    }
    catch {
        Write-Log "Error executing DNS failover: $_" -Level "ERROR"
        return $false
    }
}
# Process an error event and determine if failover is needed.
# Steps: load the configuration, stop if a failover cooldown is active, record the
# error, stop if it came from an inactive server that is not being processed, then
# run the DNS failover once the error count reaches Config.errorThreshold.
# -ForceFailover bypasses the cooldown, inactive-server and threshold checks.
#   -ErrorType / -SourceServer  Identify the error.
#   -EventMessage / -EventId    Event details passed on to logging and notification.
# Returns the failover result when a failover was attempted, otherwise $false
# (also on any internal error, which is logged).
function Process-ErrorEvent {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$ErrorType,
        [Parameter(Mandatory = $true)]
        [string]$SourceServer,
        [Parameter(Mandatory = $false)]
        [string]$EventMessage = "No message provided",
        [Parameter(Mandatory = $false)]
        [int]$EventId = 0,
        [Parameter(Mandatory = $false)]
        [switch]$ForceFailover
    )

    try {
        $config = Get-FailoverConfig

        # Guard 1: cooldown after a recent failover
        $cooldownCheck = Test-FailoverCooldown -Config $config
        if ($cooldownCheck.InCooldown -and -not $ForceFailover) {
            Write-Log "System is in cooldown period. $($cooldownCheck.RemainingMinutes) minutes remaining until next failover can be triggered" -Level "WARNING"
            return $false
        }

        # Record the error and obtain the current count
        $errorStatus = Update-ErrorOccurrence -ErrorType $ErrorType -SourceServer $SourceServer -Config $config -EventMessage $EventMessage -EventId $EventId

        # Guard 2: errors from an inactive server are ignored unless configured or forced
        $shouldConsider = $errorStatus.IsActive -or $config.processInactiveServerErrors -or $ForceFailover
        if (-not $shouldConsider) {
            Write-Log "Ignoring error from inactive server $SourceServer (configure 'processInactiveServerErrors=true' to change this behavior)" -Level "INFO"
            return $false
        }

        # Guard 3: threshold not reached and not forced
        $thresholdReached = $errorStatus.Count -ge $config.errorThreshold
        if (-not ($thresholdReached -or $ForceFailover)) {
            Write-Log "Error threshold not yet reached: $($errorStatus.Count)/$($config.errorThreshold) errors in $($config.windowMinutes) minute window" -Level "INFO"
            return $false
        }

        $triggerReason = "threshold reached"
        if ($ForceFailover) {
            $triggerReason = "forced failover"
        }
        Write-Log "Triggering failover due to $triggerReason : $($errorStatus.Count) errors of type $ErrorType detected in $($config.windowMinutes) minute window" -Level "WARNING"

        # Execute failover
        $failoverResult = Invoke-DNSFailover -Config $config -ErrorType $ErrorType -EventId $EventId -EventMessage $EventMessage
        if ($failoverResult) {
            Write-Log "Failover executed successfully" -Level "SUCCESS"
        }
        else {
            Write-Log "Failover execution failed" -Level "ERROR"
        }
        return $failoverResult
    }
    catch {
        Write-Log "Error processing error event: $_" -Level "ERROR"
        return $false
    }
}
# Configure Windows Event Forwarding for COMException monitoring.
# Ensures the Windows Event Collector service (Wecsvc) is running, runs
# "wecutil qc", writes the subscription definition to event_subscription.xml next
# to this script and creates or updates the "ApplicationErrorMonitoring"
# subscription. If the configuration lists monitoredServers, a best-effort attempt
# is made to apply that list.
# Returns $true on success; on failure the error is logged and $false is returned.
function Initialize-EventSubscription {
    [CmdletBinding()]
    param()

    # Runs wecutil with the given arguments, returns its stdout lines and throws on
    # a non-zero exit code (native commands do not raise PowerShell errors by
    # themselves, so failures would otherwise go unnoticed).
    $invokeWecutil = {
        param([string[]]$Arguments)
        $output = @(& wecutil @Arguments)
        if ($LASTEXITCODE -ne 0) {
            throw "wecutil $($Arguments -join ' ') failed with exit code $LASTEXITCODE"
        }
        return $output
    }

    # Sends wecutil output to the log instead of this function's return value
    $logOutput = {
        process {
            if ("$_".Trim()) { Write-Log "wecutil: $_" -Level "DEBUG" }
        }
    }

    try {
        # Ensure the Event Collector service is running
        $service = Get-Service -Name Wecsvc -ErrorAction SilentlyContinue
        if (-not $service) {
            Write-Log "Windows Event Collector service (Wecsvc) not found" -Level "ERROR"
            return $false
        }
        if ($service.Status -ne 'Running') {
            Write-Log "Starting Windows Event Collector service" -Level "INFO"
            Start-Service -Name Wecsvc
        }

        # Configure event collector
        Write-Log "Configuring Windows Event Collector" -Level "INFO"
        & $invokeWecutil -Arguments @('qc', '-quiet') | & $logOutput

        # Create subscription XML.
        # The event log XPath subset has no contains() function and EventData is not
        # a child of System, so the subscription selects by level only; the
        # COMException text match is done by the event watcher on the collector.
        $subscriptionName = "ApplicationErrorMonitoring"
        $subscriptionXml = @"
<Subscription xmlns="http://schemas.microsoft.com/2006/03/windows/events/subscription">
  <SubscriptionId>$subscriptionName</SubscriptionId>
  <SubscriptionType>SourceInitiated</SubscriptionType>
  <Description>Monitor for COMException errors</Description>
  <Enabled>true</Enabled>
  <Uri>http://schemas.microsoft.com/wbem/wsman/1/windows/EventLog</Uri>
  <ConfigurationMode>Custom</ConfigurationMode>
  <Delivery Mode="Push">
    <Batching>
      <MaxItems>1</MaxItems>
      <MaxLatencyTime>1000</MaxLatencyTime>
    </Batching>
    <PushSettings>
      <Heartbeat Interval="900000"/>
    </PushSettings>
  </Delivery>
  <Query>
    <![CDATA[
      <QueryList>
        <Query Id="0">
          <Select Path="Application">*[System[(Level=1 or Level=2 or Level=3)]]</Select>
        </Query>
      </QueryList>
    ]]>
  </Query>
  <ReadExistingEvents>false</ReadExistingEvents>
  <TransportName>HTTP</TransportName>
  <ContentFormat>RenderedText</ContentFormat>
  <Locale Language="en-US"/>
  <LogFile>ForwardedEvents</LogFile>
  <AllowedSourceDomainComputers>O:NSG:NSD:(A;;GA;;;DC)(A;;GA;;;NS)</AllowedSourceDomainComputers>
</Subscription>
"@
        $subscriptionPath = Join-Path $PSScriptRoot "event_subscription.xml"
        $subscriptionXml | Out-File -FilePath $subscriptionPath -Encoding utf8

        # Create or update the subscription
        $existingSubscription = & $invokeWecutil -Arguments @('es') | Where-Object { $_ -eq $subscriptionName }
        if ($existingSubscription) {
            Write-Log "Updating existing subscription: $subscriptionName" -Level "INFO"
            # With /c: the subscription id comes from the file and is not given on
            # the command line.
            & $invokeWecutil -Arguments @('ss', "/c:$subscriptionPath") | & $logOutput
        }
        else {
            Write-Log "Creating new subscription: $subscriptionName" -Level "INFO"
            & $invokeWecutil -Arguments @('cs', $subscriptionPath) | & $logOutput
        }

        # Get configuration to determine servers to monitor
        $config = Get-FailoverConfig

        # monitoredServers is optional; look it up without tripping strict mode
        $monitoredServers = @()
        if ($config -is [System.Collections.IDictionary]) {
            if ($config.Contains('monitoredServers') -and $config['monitoredServers']) {
                $monitoredServers = @($config['monitoredServers'])
            }
        }
        elseif ($config.PSObject.Properties['monitoredServers'] -and $config.monitoredServers) {
            $monitoredServers = @($config.monitoredServers)
        }

        if ($monitoredServers.Count -gt 0) {
            Write-Log "Adding monitored servers to subscription" -Level "INFO"

            # Unique temp file for the computer list; always removed afterwards
            $computerListPath = [System.IO.Path]::GetTempFileName()
            try {
                $monitoredServers | Out-File -FilePath $computerListPath -Encoding utf8

                # NOTE(review): wecutil documents /cf as ContentFormat, not a
                # computer-list file, and for a source-initiated subscription the
                # allowed sources are governed by AllowedSourceDomainComputers.
                # The call is kept as best effort - confirm the intended option.
                try {
                    & $invokeWecutil -Arguments @('ss', $subscriptionName, "/cf:$computerListPath") | & $logOutput
                }
                catch {
                    Write-Log "Could not apply monitored server list to subscription: $_" -Level "WARNING"
                }
            }
            finally {
                Remove-Item $computerListPath -Force -ErrorAction SilentlyContinue
            }
        }
        else {
            Write-Log "No monitored servers defined in configuration. You need to add servers manually or update config." -Level "WARNING"
        }

        Write-Log "Event subscription setup complete" -Level "SUCCESS"
        return $true
    }
    catch {
        Write-Log "Error initializing event subscription: $_" -Level "ERROR"
        return $false
    }
}
# Setup event watcher for forwarded events.
# Creates an EventLogWatcher on the ForwardedEvents log (stored in
# $script:EventWatcher) and registers a handler that passes every event whose
# description contains "COMException" to Process-ErrorEvent.
# Returns $true when the watcher is enabled; logs the error and returns $false
# otherwise.
function Start-EventWatcher {
    [CmdletBinding()]
    param()

    try {
        # Create event query for ForwardedEvents log (addressed by log name)
        $query = New-Object System.Diagnostics.Eventing.Reader.EventLogQuery(
            "ForwardedEvents",
            [System.Diagnostics.Eventing.Reader.PathType]::LogName
        )

        # Create watcher
        $script:EventWatcher = New-Object System.Diagnostics.Eventing.Reader.EventLogWatcher($query)

        # The -Action block does not run in this script's scope, so "$using:" is not
        # available there and script-level functions are not guaranteed to resolve.
        # Everything the handler needs is handed over through -MessageData.
        # NOTE(review): invoking the function's script block is expected to execute
        # it in this script's session state while the script is running - verify on
        # the target PowerShell version.
        $handlerData = @{
            LogFilePath  = $LogFilePath
            ProcessEvent = ${function:Process-ErrorEvent}
        }

        # Register event handler
        Register-ObjectEvent -InputObject $script:EventWatcher -EventName EventRecordWritten -MessageData $handlerData -Action {
            # $Event is the automatic variable of the action; do not overwrite it
            $context = $Event.MessageData
            $record = $EventArgs.EventRecord

            # Only process if we have a valid event
            if (-not $record) { return }

            try {
                # Raw event records expose the rendered text via FormatDescription()
                $eventMessage = $record.FormatDescription()

                # Check for COMException pattern
                if ($eventMessage -like "*COMException*") {
                    # Call our error processing function
                    & $context.ProcessEvent -ErrorType "COMException" -SourceServer $record.MachineName -EventMessage $eventMessage -EventId $record.Id | Out-Null
                }
            }
            catch {
                # Log the error but don't crash the event handler
                $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
                $errorMessage = "[$timestamp] [ERROR] Error in event handler: $_"
                Add-Content -Path $context.LogFilePath -Value $errorMessage
            }
        } | Out-Null

        # Enable the watcher
        $script:EventWatcher.Enabled = $true
        Write-Log "Event watcher started. Now monitoring for COMException events." -Level "SUCCESS"
        return $true
    }
    catch {
        Write-Log "Error starting event watcher: $_" -Level "ERROR"
        return $false
    }
}
# Stop the event watcher.
# Disables and disposes the watcher held in $script:EventWatcher, removes the event
# subscriptions registered on it and clears the variable. Does nothing when no
# watcher exists. Errors are logged and not propagated.
function Stop-EventWatcher {
    [CmdletBinding()]
    param()

    # Under Set-StrictMode a direct read of a never-assigned $script:EventWatcher
    # raises an error, so look the variable up defensively.
    $watcherVariable = Get-Variable -Name EventWatcher -Scope Script -ErrorAction SilentlyContinue
    if (-not $watcherVariable -or -not $watcherVariable.Value) {
        return
    }
    $watcher = $watcherVariable.Value

    try {
        $watcher.Enabled = $false

        # Remove the handler registered for this watcher so the subscription does
        # not linger in the session.
        Get-EventSubscriber |
            Where-Object { [object]::ReferenceEquals($_.SourceObject, $watcher) } |
            Unregister-Event

        $watcher.Dispose()
        $script:EventWatcher = $null
        Write-Log "Event watcher stopped" -Level "INFO"
    }
    catch {
        Write-Log "Error stopping event watcher: $_" -Level "ERROR"
    }
}
# Run a test by simulating an error event.
# Builds a canned event message and passes it through Process-ErrorEvent with
# event id 9999, exactly as a forwarded event would be handled.
#   -Server         Name reported as the source server.
#   -ErrorType      Error type to simulate.
#   -ForceFailover  Forwarded to Process-ErrorEvent.
# Returns the Process-ErrorEvent result, or $false when the simulation itself fails.
function Test-ErrorSimulation {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$Server,
        [Parameter(Mandatory = $true)]
        [string]$ErrorType,
        [Parameter(Mandatory = $false)]
        [switch]$ForceFailover
    )

    try {
        Write-Log "Running test error simulation for $ErrorType on server $Server" -Level "INFO"

        # Simulated event text
        $simulatedMessage = @"
Test $ErrorType error generated for testing purposes.
Process: TestProcess.exe
Error Details: System.Runtime.InteropServices.COMException (0x80004005): Error HRESULT E_FAIL has been returned from a call to a COM component.
at System.Runtime.InteropServices.Marshal.ThrowExceptionForHR(Int32 errorCode)
at TestComponent.TestMethod()
"@

        # Run the simulated error through the regular processing path
        $triggered = Process-ErrorEvent -ErrorType $ErrorType -SourceServer $Server -EventMessage $simulatedMessage -EventId 9999 -ForceFailover:$ForceFailover

        $summary = if ($triggered) {
            @{ Text = "Test error simulation successfully triggered failover"; Level = "SUCCESS" }
        }
        else {
            @{ Text = "Test error simulation did not trigger failover"; Level = "INFO" }
        }
        Write-Log $summary.Text -Level $summary.Level

        return $triggered
    }
    catch {
        Write-Log "Error during test error simulation: $_" -Level "ERROR"
        return $false
    }
}
# Main function to run the monitor as a service.
# Rotates the log if needed, imports the DNS failover functions, loads the
# configuration, sets up the event subscription and starts the event watcher.
#   -AsService  Return right after start-up instead of blocking. Without it the
#               function loops (rotating the log once a minute) until interrupted
#               and then stops the watcher.
# Produces no pipeline output. A failed subscription setup is logged as a warning;
# a watcher that cannot be started is treated as fatal. Fatal errors are logged,
# the watcher is stopped and the error is re-thrown.
function Start-FailoverMonitor {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $false)]
        [switch]$AsService
    )

    try {
        # Rotate log file if needed
        Rotate-LogFile -LogPath $LogFilePath

        Write-Log "=======================================================" -Level "INFO"
        Write-Log "Starting Failover Monitor v$script:Version" -Level "INFO"
        Write-Log "=======================================================" -Level "INFO"

        # Import DNS Failover functions
        Import-DNSFailoverFunctions

        # Load the configuration once up front so a broken or missing file surfaces
        # at start-up (the value itself is not needed here)
        $null = Get-FailoverConfig
        Write-Log "Configuration loaded" -Level "DEBUG"

        # Initialize event subscription. Capture the result so it does not leak into
        # this function's output; an existing subscription may still deliver events,
        # so a failure here is only a warning.
        $subscriptionReady = Initialize-EventSubscription
        if (-not $subscriptionReady) {
            Write-Log "Event subscription setup did not complete successfully; continuing with the existing collector configuration" -Level "WARNING"
        }

        # Start event watcher. Without a watcher nothing would ever be processed, so
        # do not claim that the monitor is running.
        $watcherStarted = Start-EventWatcher
        if (-not $watcherStarted) {
            throw "Event watcher could not be started"
        }

        if ($AsService) {
            # When running as a service, just return - the event registration keeps it active
            Write-Log "Failover Monitor service is now running" -Level "INFO"
            return
        }

        # If running interactively, keep script alive
        Write-Log "Failover Monitor is now running. Press Ctrl+C to stop." -Level "INFO"
        try {
            while ($true) {
                Start-Sleep -Seconds 60
                # Rotate log file periodically
                Rotate-LogFile -LogPath $LogFilePath
            }
        }
        finally {
            Stop-EventWatcher
            Write-Log "Failover Monitor stopped" -Level "INFO"
        }
    }
    catch {
        Write-Log "Critical error in Failover Monitor: $_" -Level "ERROR"
        # Cleanup must never hide the error that brought us here
        try {
            Stop-EventWatcher
        }
        catch {
            Write-Log "Error while stopping event watcher during shutdown: $_" -Level "ERROR"
        }
        throw
    }
}
| #endregion Functions | |
#region Main Execution
# Dispatch on the mode switches. -Initialize and -TestError are one-shot modes that
# end the process with exit code 0 (completed) or 1 (an exception occurred);
# without either switch the monitor is started.
if ($Initialize) {
    Write-Log "Initializing Failover Monitor" -Level "INFO"
    $exitCode = 0
    try {
        # Import DNS Failover functions
        Import-DNSFailoverFunctions

        # Initialize configuration
        $config = Get-FailoverConfig

        # Initialize event subscription
        if (Initialize-EventSubscription) {
            Write-Log "Initialization completed successfully" -Level "SUCCESS"
        }
        else {
            Write-Log "Initialization completed with warnings" -Level "WARNING"
        }
    }
    catch {
        Write-Log "Initialization failed: $_" -Level "ERROR"
        $exitCode = 1
    }
    exit $exitCode
}
elseif ($TestError) {
    Write-Log "Running error simulation test" -Level "INFO"
    $exitCode = 0
    try {
        # Import DNS Failover functions
        Import-DNSFailoverFunctions

        # Run test
        if (Test-ErrorSimulation -Server $Server -ErrorType $ErrorType -ForceFailover:$Force) {
            Write-Log "Test completed successfully and triggered failover" -Level "SUCCESS"
        }
        else {
            Write-Log "Test completed but did not trigger failover" -Level "INFO"
        }
    }
    catch {
        Write-Log "Test failed: $_" -Level "ERROR"
        $exitCode = 1
    }
    exit $exitCode
}
else {
    # Start the monitor
    Start-FailoverMonitor
}
#endregion Main Execution
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment