Last active
April 1, 2025 04:41
-
-
Save davidlu1001/fb5490654922dd811cc22625bc8e223b to your computer and use it in GitHub Desktop.
autoFailoverMonitorEmail.ps1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# autoFailoverMonitor.ps1
# This script monitors the active server for COMException errors in the Event Log
# and automatically triggers a complete failover cycle when the threshold is met.
# Enhanced with simplified email reporting capabilities.
[CmdletBinding()]
param(
    # --- Environment / DNS targeting ---
    [Parameter()] [ValidateSet("Dev", "Prod")] [string]$Env = "Dev",
    [Parameter()] [string]$dnsServer,
    [Parameter()] [string]$lookupZone,

    # --- File locations (all default to the folder this script lives in) ---
    [Parameter()] [string]$CompleteFailoverScriptPath = "$PSScriptRoot\completeFailoverCycle.ps1",
    [Parameter()] [string]$logFilePath = "$PSScriptRoot\AutoFailoverMonitor.log",
    [Parameter()] [string]$stateFilePath = "$PSScriptRoot\AutoFailoverState.json",

    # --- Failover trigger tuning ---
    [Parameter()] [int]$ErrorThreshold = 1,
    [Parameter()] [int]$TimeWindowMinutes = 5,
    [Parameter()] [int]$CooldownPeriodMinutes = 45,

    # --- Run modes ---
    [Parameter()] [switch]$RunAsService,
    [Parameter()] [switch]$ForceFailover,
    [Parameter()] [switch]$Initialize,
    [Parameter()] [switch]$TestMode,
    [Parameter()] [switch]$SimulateError,
    [Parameter()] [switch]$Help,

    # --- Email reporting parameters ---
    [Parameter()] [string]$SmtpServer,
    [Parameter()] [int]$SmtpPort = 25,
    [Parameter()] [string]$EmailFrom,
    [Parameter()] [string[]]$EmailTo,
    [Parameter()] [string[]]$EmailCc,
    [Parameter()] [switch]$EnableSSL,
    [Parameter()] [string]$SmtpUsername,
    [Parameter()] [System.Security.SecureString]$SmtpPassword,
    [Parameter()] [switch]$SendEmailReport,
    [Parameter()] [switch]$GenerateReportOnly,

    # --- Service-loop housekeeping and DNS TTL values ---
    [Parameter()] [int]$LogRotationCheckIntervalMinutes = 60,
    [Parameter()] [int]$DefaultTTLMinutes = 3,
    [Parameter()] [int]$ReducedTTLMinutes = 1
)

# System.Web is loaded up front so HTML-encoding helpers are available to the script
Add-Type -AssemblyName System.Web
# Function to show help information
function Show-Help {
    <#
    .SYNOPSIS
        Prints the usage text for this script to the host and terminates the script with exit code 0.
    .NOTES
        The text is a literal (single-quoted) here-string, so nothing inside it is expanded.
        -SmtpPassword is documented as SecureString to match the script's param block, and the
        default file locations are described relative to the script folder ($PSScriptRoot).
        The e-mail addresses in the examples are placeholders.
    #>
    $helpText = @'
Automatic Failover Monitor Help
===============================
Description:
This script monitors the active server for COMException errors in the Event Log
and automatically triggers a complete failover cycle when threshold is met.
Enhanced with email reporting capabilities.
Syntax:
.\autoFailoverMonitor.ps1 [-Env <String>] [-dnsServer <String>] [-lookupZone <String>]
[-CompleteFailoverScriptPath <String>] [-logFilePath <String>]
[-stateFilePath <String>] [-ErrorThreshold <Int>]
[-TimeWindowMinutes <Int>] [-CooldownPeriodMinutes <Int>]
[-DefaultTTLMinutes <Int>] [-ReducedTTLMinutes <Int>]
[-LogRotationCheckIntervalMinutes <Int>]
[-RunAsService] [-ForceFailover] [-Initialize] [-TestMode]
[-SimulateError] [-Help]
[-SmtpServer <String>] [-SmtpPort <Int>] [-EmailFrom <String>]
[-EmailTo <String[]>] [-EmailCc <String[]>] [-EnableSSL]
[-SmtpUsername <String>] [-SmtpPassword <SecureString>]
[-SendEmailReport] [-GenerateReportOnly]
Parameters:
-Env <String>
Specifies the environment to operate in.
Valid values: Dev, Prod
Default: Dev
-dnsServer <String>
Specifies the DNS server to use for operations.
-lookupZone <String>
Specifies the DNS lookup zone.
-CompleteFailoverScriptPath <String>
Path to the completeFailoverCycle.ps1 script.
Default: <script folder>\completeFailoverCycle.ps1
-logFilePath <String>
Path to the log file.
Default: <script folder>\AutoFailoverMonitor.log
-stateFilePath <String>
Path to the state file that tracks error occurrences and cooldown period.
Default: <script folder>\AutoFailoverState.json
-ErrorThreshold <Int>
Number of COMException errors that must occur within the time window to trigger failover.
Default: 1
-TimeWindowMinutes <Int>
Time window in minutes within which errors are counted.
Default: 5
-CooldownPeriodMinutes <Int>
Cooldown period in minutes after a failover during which no new failover will be triggered.
Default: 45
-DefaultTTLMinutes <Int>
Standard DNS TTL value in minutes to use during normal operations.
Default: 3
-ReducedTTLMinutes <Int>
Reduced DNS TTL value in minutes to use when errors are detected.
Default: 1
-LogRotationCheckIntervalMinutes <Int>
Interval in minutes between log rotation checks when running as a service.
Default: 60
-RunAsService [Switch]
If specified, the script will run as a continuous monitoring service.
-ForceFailover [Switch]
If specified, forces a failover regardless of error count or cooldown period.
-Initialize [Switch]
If specified, initializes the monitoring environment (creates state file, etc.)
-TestMode [Switch]
If specified, runs in test mode without actually triggering failover.
-SimulateError [Switch]
If specified, simulates COMException errors for testing purposes.
-Help [Switch]
Shows this help message.
Email Reporting Parameters:
-SmtpServer <String>
SMTP server address for sending email reports.
-SmtpPort <Int>
SMTP server port.
Default: 25
-EmailFrom <String>
Email address to send reports from.
-EmailTo <String[]>
Email addresses to send reports to (comma-separated for multiple).
-EmailCc <String[]>
Email addresses to carbon copy on reports (comma-separated for multiple).
-EnableSSL [Switch]
If specified, enables SSL for SMTP connection.
-SmtpUsername <String>
Username for SMTP authentication.
-SmtpPassword <SecureString>
Password for SMTP authentication (for example from Read-Host -AsSecureString).
-SendEmailReport [Switch]
If specified, enables automatic email reports for failover events.
-GenerateReportOnly [Switch]
If specified, only generates a report from existing logs without monitoring.
Examples:
# Show help
.\autoFailoverMonitor.ps1 -Help
# Initialize the monitoring environment
.\autoFailoverMonitor.ps1 -Initialize
# Run the monitor once to check for errors and trigger failover if needed
.\autoFailoverMonitor.ps1 -Env Prod -dnsServer "dns1.company.com" -lookupZone "company.local"
# Run the monitor as a continuous service with email reporting
.\autoFailoverMonitor.ps1 -Env Prod -RunAsService -SendEmailReport -SmtpServer "smtp.company.com" -EmailFrom "failover-monitor@company.com" -EmailTo "ops-team@company.com"
# Run as service with custom log rotation interval
.\autoFailoverMonitor.ps1 -Env Prod -RunAsService -LogRotationCheckIntervalMinutes 120
# Force a failover and send email report
.\autoFailoverMonitor.ps1 -ForceFailover -SendEmailReport
# Generate a report from existing logs without monitoring
.\autoFailoverMonitor.ps1 -GenerateReportOnly -SmtpServer "smtp.company.com" -EmailFrom "failover-monitor@company.com" -EmailTo "ops-team@company.com"
Notes:
- This script requires the completeFailoverCycle.ps1 script to be accessible
- Appropriate permissions are required for reading Event Logs and executing the failover script
- All operations are logged to the specified log file
- The script maintains state to track error occurrences and enforce cooldown periods
- Email reporting provides notifications at the start and completion of failover operations
- Email reporting requires valid SMTP server settings
'@
    Write-Host $helpText
    # Help is a terminal action: stop the whole script, not just this function
    exit 0
}
# -Help short-circuits everything else (Show-Help exits the script)
if ($Help.IsPresent) { Show-Help }

# From here on: strict variable/property checking, and every error is terminating
Set-StrictMode -Version Latest
$ErrorActionPreference = 'Stop'

# Pin outbound secure connections to TLS 1.2
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12

#region Supporting Functions
# Per-run log capture: a temp file on disk plus an in-memory list of the same lines
$script:tempLogPath = [System.IO.Path]::GetTempFileName()
$script:tempLogContent = @()
# Function to write log messages
function Write-Log {
    <#
    .SYNOPSIS
        Writes one timestamped line to the console, the in-memory run log, the main log file
        ($logFilePath) and the per-run temp log ($script:tempLogPath).
    .PARAMETER Message
        Text to log.
    .PARAMETER Level
        INFO, WARNING, ERROR or SUCCESS; selects the console colour and is embedded in the line.
    .NOTES
        File failures are downgraded to Write-Warning so logging never aborts the caller.
        The two file writes are independent, so a locked main log does not lose the temp-log copy.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$Message,
        [Parameter(Mandatory = $false)]
        [ValidateSet("INFO", "WARNING", "ERROR", "SUCCESS")]
        [string]$Level = "INFO"
    )
    $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
    $logMessage = "[$timestamp] [$Level] $Message"

    # Write to console with appropriate color
    $levelColors = @{ INFO = 'Cyan'; WARNING = 'Yellow'; ERROR = 'Red'; SUCCESS = 'Green' }
    Write-Host $logMessage -ForegroundColor $levelColors[$Level]

    # Append to temp log content for this execution
    $script:tempLogContent += $logMessage

    # Append to the main log file
    try {
        # Split-Path returns an empty string for a bare file name; in that case the file lives in
        # the current location and there is no directory to create (Test-Path '' would throw).
        $logDir = Split-Path -Path $logFilePath -Parent
        if ($logDir -and -not (Test-Path -Path $logDir -PathType Container)) {
            New-Item -Path $logDir -ItemType Directory -Force | Out-Null
        }
        Add-Content -Path $logFilePath -Value $logMessage -ErrorAction Stop
    }
    catch {
        Write-Warning "Failed to write to log file: $_"
    }

    # Append to the per-run temp log, independently of the main log outcome
    try {
        Add-Content -Path $script:tempLogPath -Value $logMessage -ErrorAction Stop
    }
    catch {
        Write-Warning "Failed to write to temporary log file: $_"
    }
}
# Function to rotate log files
function Rotate-LogFile {
    <#
    .SYNOPSIS
        Renames the log to "<name>_<yyyyMMdd-HHmmss><ext>" once it exceeds MaxSizeMB and prunes
        older rotated copies.
    .PARAMETER LogPath
        Path of the live log file.
    .PARAMETER MaxSizeMB
        Size limit in megabytes; the file is rotated only when it is strictly larger.
    .PARAMETER FilesToKeep
        Number of most recently written rotated files to retain.
    .NOTES
        Does nothing when LogPath is missing or is not a file. Failures during rotation or
        cleanup are reported with Write-Warning and do not propagate.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$LogPath,
        [Parameter(Mandatory = $false)]
        [int]$MaxSizeMB = 10,
        [Parameter(Mandatory = $false)]
        [int]$FilesToKeep = 5
    )
    # Only a real file can be rotated (a directory has no Length)
    if (-not (Test-Path -Path $LogPath -PathType Leaf)) {
        return
    }
    $logFile = Get-Item -Path $LogPath
    if ($logFile.Length -le ($MaxSizeMB * 1MB)) {
        return
    }

    Write-Log "Log file size limit reached. Rotating logs..." -Level "INFO"

    # Take the directory and base name from the resolved item: this works for bare file names
    # (no parent in the string) and for names that contain extra dots.
    $directory = $logFile.DirectoryName
    $baseName = $logFile.BaseName
    $extension = if ($logFile.Extension) { $logFile.Extension } else { ".log" }
    $timestamp = Get-Date -Format "yyyyMMdd-HHmmss"
    $newName = Join-Path $directory "$($baseName)_$($timestamp)$extension"

    try {
        # A single rename instead of copy + delete: no window where two full copies exist and
        # nothing written between the two steps can be lost.
        Move-Item -Path $LogPath -Destination $newName -Force -ErrorAction Stop
        Write-Host "Log file rotated to: $newName"

        # Clean up old rotated files. The "_" in the filter restricts the match to files this
        # function produced, so unrelated logs sharing the prefix are never deleted.
        $oldLogs = Get-ChildItem -Path $directory -Filter "$($baseName)_*$extension" -File |
            Where-Object { $_.Name -ne $logFile.Name } |
            Sort-Object LastWriteTime -Descending |
            Select-Object -Skip $FilesToKeep
        foreach ($old in $oldLogs) {
            Remove-Item -Path $old.FullName -Force -ErrorAction Stop
            Write-Host "Removed old log file: $($old.Name)"
        }
    }
    catch {
        Write-Warning "Failed to rotate log file: $_"
    }
}
# Function to validate the CompleteFailoverCycle script exists
function Test-CompleteFailoverScript {
    <#
    .SYNOPSIS
        Checks that $CompleteFailoverScriptPath points at an existing file.
    .OUTPUTS
        [bool] $true when the file exists; otherwise logs two ERROR lines and returns $false.
    #>
    # -PathType Leaf: a directory at that path is not a runnable script and must not pass
    if (Test-Path -Path $CompleteFailoverScriptPath -PathType Leaf) {
        return $true
    }
    Write-Log "CompleteFailoverCycle script not found at: $CompleteFailoverScriptPath" -Level "ERROR"
    Write-Log "Please provide the correct path using -CompleteFailoverScriptPath parameter" -Level "ERROR"
    return $false
}
# Function to extract hostname from FQDN
function Get-HostnameFromFQDN {
    # Returns the text before the first dot of the given name; a name without
    # any dot is returned unchanged.
    param (
        [Parameter(Mandatory = $true)]
        [string]$FQDN
    )
    # Everything from a dot to the end of the string is treated as the domain suffix
    $domainSuffixPattern = '\..*$'
    $shortName = [regex]::Replace($FQDN, $domainSuffixPattern, '')
    return $shortName
}
# Function to get current active server
function Get-CurrentActiveServer {
    <#
    .SYNOPSIS
        Runs dnsFailover_v2.ps1 (expected next to $CompleteFailoverScriptPath) with -Ops check and
        extracts the active host name from its output.
    .OUTPUTS
        [string] the host name, or $null when the helper script is missing or no known message
        pattern is found. With -TestMode or -SimulateError a placeholder name is returned instead
        of $null when parsing fails or an error occurs.
    .NOTES
        The helper is invoked with the call operator and a splatted hashtable rather than
        Invoke-Expression on a concatenated string, so parameter values and paths containing
        quotes, spaces or "$(...)" are passed literally and never evaluated as code.
        $LASTEXITCODE is intentionally not read: it is unset until something sets an exit code,
        and reading an unset variable throws under Set-StrictMode -Version Latest.
    #>

    # Private helper: placeholder host for simulation/test runs only
    function Get-FallbackTestServer {
        if ($Env -eq "Dev") {
            # Dev has two candidate servers; -Maximum is exclusive, so this yields 1 or 2
            $serverNumber = Get-Random -Minimum 1 -Maximum 3
            $fallbackHost = "SERVER$serverNumber.$Env.example.com"
        }
        else {
            $fallbackHost = "SERVER1.$Env.example.com"
        }
        Write-Log "Using default test server: $fallbackHost" -Level "WARNING"
        return $fallbackHost
    }

    try {
        # Determine the path to the dnsFailover script
        $dnsFailoverScriptPath = Join-Path (Split-Path $CompleteFailoverScriptPath -Parent) "dnsFailover_v2.ps1"
        if (-not (Test-Path $dnsFailoverScriptPath)) {
            Write-Log "DNS Failover script not found at: $dnsFailoverScriptPath" -Level "ERROR"
            return $null
        }
        Write-Log "Attempting to determine current active host..." -Level "INFO"

        # Build the helper's arguments; optional ones are added only when supplied
        $scriptParams = @{ Env = $Env; Ops = 'check' }
        if ($dnsServer) { $scriptParams['dnsServer'] = $dnsServer }
        if ($lookupZone) { $scriptParams['lookupZone'] = $lookupZone }
        $paramSummary = ($scriptParams.GetEnumerator() | Sort-Object Key | ForEach-Object { "-$($_.Key) `"$($_.Value)`"" }) -join ' '
        Write-Log "Running command: & '$dnsFailoverScriptPath' $paramSummary" -Level "INFO"

        # Capture every output stream of the helper as text (no temp file needed)
        $rawOutput = & $dnsFailoverScriptPath @scriptParams *>&1
        $output = ($rawOutput | Out-String -Width 4096)

        # Log the output for debugging
        Write-Log "Script output: $output" -Level "INFO"

        # Message patterns, tried in priority order; the first match wins
        $hostPatterns = @(
            "Active host check completed: ([^\s]+)",
            "Active host is reachable: ([^\s]+)",
            "Current active host(?: before failover)?: ([^\s]+)",
            "host(?:name)?\s+obtained: ([^\s]+)",
            "Pinging\s+([^\s]+)\s+",
            "Active host is: ([^\s]+)"
        )
        $activeHost = $null
        foreach ($pattern in $hostPatterns) {
            if ($output -match $pattern) {
                $activeHost = $Matches[1]
                break
            }
        }

        # Simulation/test runs get a placeholder when the output could not be parsed
        if (-not $activeHost -and ($TestMode -or $SimulateError)) {
            $activeHost = Get-FallbackTestServer
        }

        if ($activeHost) {
            Write-Log "Current active host: $activeHost" -Level "INFO"
            return $activeHost
        }
        Write-Log "Could not determine active host from script output" -Level "ERROR"
        return $null
    }
    catch {
        Write-Log "Error getting current active host: $_" -Level "ERROR"
        if ($TestMode -or $SimulateError) {
            return Get-FallbackTestServer
        }
        return $null
    }
}
# Function to update DNS TTL
function Update-DnsTTL {
    <#
    .SYNOPSIS
        Sets the TTL of a CNAME record and verifies the change, retrying on failure.
    .PARAMETER DnsServer
        DNS server to run the DnsServer cmdlets against.
    .PARAMETER LookupZone
        Zone that contains the record.
    .PARAMETER DnsName
        Name of the CNAME record.
    .PARAMETER TTLMinutes
        Desired TTL in minutes.
    .PARAMETER RetryAttempts
        Maximum number of attempts.
    .PARAMETER RetryWaitSeconds
        Pause between attempts.
    .PARAMETER DnsVerificationWaitSeconds
        Pause between applying the change and reading the record back.
    .OUTPUTS
        [bool] $true when the record already has, or was verified to have, the requested TTL;
        $false otherwise.
    .NOTES
        The record is modified in place with Set-DnsServerResourceRecord (old object / new
        object). Deleting and re-creating it would leave the name unresolvable if the re-create
        step failed, and a retry could then no longer read the alias back.
    #>
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$DnsServer,
        [Parameter(Mandatory = $true)]
        [string]$LookupZone,
        [Parameter(Mandatory = $true)]
        [string]$DnsName,
        [Parameter(Mandatory = $true)]
        [int]$TTLMinutes,
        [Parameter(Mandatory = $false)]
        [int]$RetryAttempts = 2,
        [Parameter(Mandatory = $false)]
        [int]$RetryWaitSeconds = 5,
        [Parameter(Mandatory = $false)]
        [int]$DnsVerificationWaitSeconds = 5
    )
    try {
        Write-Log "Attempting to update TTL for $DnsName to $TTLMinutes minutes" -Level "INFO"
        for ($attempt = 1; $attempt -le $RetryAttempts; $attempt++) {
            try {
                # Get current CNAME record
                $dnsRecord = Get-DnsServerResourceRecord -Name $DnsName -RRType CName -ZoneName $LookupZone -ComputerName $DnsServer -ErrorAction Stop |
                    Select-Object -First 1
                if ($null -eq $dnsRecord) {
                    Write-Log "DNS record not found: $DnsName" -Level "ERROR"
                    return $false
                }
                $currentAlias = $dnsRecord.RecordData.HostNameAlias
                $currentTTL = $dnsRecord.TimeToLive.TotalMinutes
                Write-Log "Attempt $attempt : Current record - Alias: $currentAlias, TTL: $currentTTL minutes" -Level "INFO"

                # Only update if the TTL actually needs to change
                if ([Math]::Abs($currentTTL - $TTLMinutes) -lt 0.1) {
                    Write-Log "TTL already set to $TTLMinutes minutes, no update needed" -Level "INFO"
                    return $true
                }

                # Clone the record, change only the TTL, and swap old for new in one operation
                $updatedRecord = $dnsRecord.Clone()
                $updatedRecord.TimeToLive = [System.TimeSpan]::FromMinutes($TTLMinutes)
                Write-Log "Updating DNS record TTL in place" -Level "INFO"
                Set-DnsServerResourceRecord -OldInputObject $dnsRecord -NewInputObject $updatedRecord -ZoneName $LookupZone -ComputerName $DnsServer -ErrorAction Stop | Out-Null

                # Allow time for the change to be visible before verifying
                Start-Sleep -Seconds $DnsVerificationWaitSeconds

                # Verify the update
                $verifyRecord = Get-DnsServerResourceRecord -Name $DnsName -RRType CName -ZoneName $LookupZone -ComputerName $DnsServer -ErrorAction Stop |
                    Select-Object -First 1
                $actualTTL = $verifyRecord.TimeToLive.TotalMinutes
                if ([Math]::Abs($actualTTL - $TTLMinutes) -lt 0.1) {
                    Write-Log "Successfully updated TTL from $currentTTL to $actualTTL minutes" -Level "SUCCESS"
                    return $true
                }

                Write-Log "TTL verification failed. Current: $actualTTL, Expected: $TTLMinutes" -Level "WARNING"
                if ($attempt -lt $RetryAttempts) {
                    Write-Log "Will retry TTL update (attempt $attempt of $RetryAttempts)" -Level "WARNING"
                    Start-Sleep -Seconds $RetryWaitSeconds
                }
                else {
                    Write-Log "Maximum retry attempts reached" -Level "ERROR"
                    return $false
                }
            }
            catch {
                Write-Log "Error updating TTL (attempt $attempt of $RetryAttempts): $_" -Level "ERROR"
                if ($attempt -lt $RetryAttempts) {
                    Write-Log "Retrying in $RetryWaitSeconds seconds..." -Level "WARNING"
                    Start-Sleep -Seconds $RetryWaitSeconds
                }
                else {
                    Write-Log "Failed to update TTL after all retry attempts" -Level "ERROR"
                    return $false
                }
            }
        }
        # Reached only when RetryAttempts is less than 1: nothing was attempted
        return $false
    }
    catch {
        Write-Log "Failed to update DNS TTL: $_" -Level "ERROR"
        return $false
    }
}
# Function to get or initialize state
function Get-State {
    <#
    .SYNOPSIS
        Loads the monitor state from $stateFilePath, creating the file with default values when
        it does not exist.
    .OUTPUTS
        [PSCustomObject] with the properties Errors, LastFailoverTime, FailoverCount,
        LastReportTime, ReportsSent, TTLStatus, LastTTLChange and CurrentFailoverID.
        Every path returns the same object type, so callers can treat a freshly created, a
        loaded and a fallback state identically.
    .NOTES
        Properties missing from an existing file are back-filled with defaults and written back.
        An unreadable or unparsable file is copied to "<file>.bak" and default state is returned.
    #>
    $defaultState = [PSCustomObject][ordered]@{
        Errors            = @()
        LastFailoverTime  = $null
        FailoverCount     = 0
        LastReportTime    = $null
        ReportsSent       = 0
        TTLStatus         = "Standard"
        LastTTLChange     = $null
        CurrentFailoverID = $null
    }

    # Check if state file exists
    if (-not (Test-Path $stateFilePath)) {
        # Create parent directory if it doesn't exist
        $stateDir = Split-Path -Path $stateFilePath -Parent
        if (-not (Test-Path -Path $stateDir -PathType Container)) {
            try {
                New-Item -Path $stateDir -ItemType Directory -Force | Out-Null
            }
            catch {
                Write-Log "Failed to create state directory: $_" -Level "ERROR"
                return $defaultState
            }
        }
        # Create new state file
        try {
            $defaultState | ConvertTo-Json -Depth 5 | Set-Content -Path $stateFilePath
            Write-Log "Initialized new state file at $stateFilePath" -Level "INFO"
        }
        catch {
            Write-Log "Failed to create state file: $_" -Level "WARNING"
        }
        return $defaultState
    }

    # Try to read existing state file
    try {
        $stateContent = Get-Content -Path $stateFilePath -Raw -ErrorAction Stop
        $state = $stateContent | ConvertFrom-Json -ErrorAction Stop

        # Back-fill any property an older or hand-edited file is missing
        $missingProperties = @()
        foreach ($prop in $defaultState.PSObject.Properties.Name) {
            if (-not (Get-Member -InputObject $state -Name $prop -MemberType Properties)) {
                $missingProperties += $prop
                Add-Member -InputObject $state -MemberType NoteProperty -Name $prop -Value $defaultState.$prop
            }
        }
    }
    catch {
        Write-Log "Error reading state file: $_" -Level "ERROR"
        # Try to create backup of corrupted state file
        try {
            $backupPath = "$stateFilePath.bak"
            Copy-Item -Path $stateFilePath -Destination $backupPath -Force
            Write-Log "Created backup of corrupted state file at $backupPath" -Level "WARNING"
        }
        catch {
            Write-Log "Failed to backup corrupted state file: $_" -Level "ERROR"
        }
        # Return default state
        return $defaultState
    }

    if ($missingProperties.Count -gt 0) {
        Write-Log "Added missing properties to state file: $($missingProperties -join ', ')" -Level "WARNING"
        # Persisting the repaired file is best-effort: a write failure here must not throw away
        # the state that was just read successfully.
        try {
            $state | ConvertTo-Json -Depth 5 | Set-Content -Path $stateFilePath
        }
        catch {
            Write-Log "Failed to write repaired state file: $_" -Level "WARNING"
        }
    }
    return $state
}
# Function to save state
function Save-State {
    <#
    .SYNOPSIS
        Serialises the state object to JSON and writes it to $stateFilePath.
    .PARAMETER State
        State object to persist.
    .NOTES
        The JSON is written to "<state file>.tmp" first and then moved over the real file, so an
        interrupted write cannot leave a truncated state file behind. Failures are logged and
        not re-thrown.
    #>
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$State
    )
    $pendingPath = "$stateFilePath.tmp"
    try {
        $State | ConvertTo-Json -Depth 5 | Set-Content -Path $pendingPath -ErrorAction Stop
        Move-Item -Path $pendingPath -Destination $stateFilePath -Force -ErrorAction Stop
        Write-Log "State saved successfully" -Level "INFO"
    }
    catch {
        Write-Log "Error saving state: $_" -Level "ERROR"
        # Do not leave a half-written temp file next to the state file
        Remove-Item -Path $pendingPath -Force -ErrorAction SilentlyContinue
    }
}
# Function to check if within cooldown period
function Test-CooldownPeriod {
    <#
    .SYNOPSIS
        Tells whether the last recorded failover is less than $CooldownPeriodMinutes old.
    .PARAMETER State
        State object; only its LastFailoverTime property is read.
    .OUTPUTS
        [bool] $true while the cooldown is active (a WARNING with the remaining minutes is
        logged); $false when no failover is recorded, the cooldown has elapsed, or the stored
        value cannot be interpreted as a date.
    .NOTES
        LastFailoverTime may be a string or an already-typed DateTime, because ConvertFrom-Json
        on PowerShell 6+ converts ISO 8601 strings to DateTime. A DateTime is used directly:
        passing it to [DateTime]::Parse would round-trip it through a string, which can swap day
        and month on non-US cultures and drops sub-second precision.
    #>
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$State
    )
    $recorded = $State.LastFailoverTime
    if ($null -eq $recorded -or ($recorded -is [string] -and [string]::IsNullOrWhiteSpace($recorded))) {
        return $false
    }
    try {
        if ($recorded -is [DateTime]) {
            # Compare like with like: Get-Date below is local time
            $lastFailover = if ($recorded.Kind -eq [System.DateTimeKind]::Utc) { $recorded.ToLocalTime() } else { $recorded }
        }
        else {
            $lastFailover = [DateTime]::Parse($recorded)
        }
        $cooldownEndTime = $lastFailover.AddMinutes($CooldownPeriodMinutes)
        $now = Get-Date
        if ($now -lt $cooldownEndTime) {
            $minutesRemaining = [math]::Ceiling(($cooldownEndTime - $now).TotalMinutes)
            Write-Log "Currently in cooldown period. $minutesRemaining minutes remaining before next possible failover." -Level "WARNING"
            return $true
        }
    }
    catch {
        Write-Log "Error calculating cooldown period: $_" -Level "ERROR"
        # If there's an error parsing the date, assume we're not in cooldown
        return $false
    }
    return $false
}
#endregion | |
#region Event Monitoring | |
# Function to simulate COMException events for testing
function New-SimulatedCOMExceptionEvents {
    # Builds $Count fake event records (hashtables with TimeCreated, EventID, Level, Message,
    # LogName). Record i is dated 2*i minutes before now, carries event ID 1000+i and a randomly
    # picked COMException message followed by a fixed stack trace. Each record is logged.
    param (
        [Parameter(Mandatory = $false)]
        [int]$Count = 3
    )
    Write-Log "Simulating $Count COMException events for testing" -Level "INFO"
    $simulatedEvents = @()
    $referenceTime = Get-Date

    # Messages modelled on real COMException texts
    $messagePool = @(
        "System.Runtime.InteropServices.COMException: Exception from HRESULT: 0x80010105 (RPC_E_SERVERFAULT)",
        "System.Runtime.InteropServices.COMException: The RPC server is unavailable. (Exception from HRESULT: 0x800706BA)",
        "System.Runtime.InteropServices.COMException: The object invoked has disconnected from its clients. (Exception from HRESULT: 0x80010108)",
        "System.Runtime.InteropServices.COMException: Class not registered (Exception from HRESULT: 0x80040154)",
        "System.Runtime.InteropServices.COMException: The interface is unknown. (Exception from HRESULT: 0x80004002)"
    )

    $index = 0
    while ($index -lt $Count) {
        $stamp = $referenceTime.AddMinutes(-2 * $index)
        $pick = Get-Random -Minimum 0 -Maximum $messagePool.Count
        $chosenMessage = $messagePool[$pick]
        $trace = @"
at MyApp.ServiceProxy.ExecuteRequest()
at MyApp.Controller.ProcessCommand()
at MyApp.Program.Main()
"@
        $record = @{
            TimeCreated = $stamp
            EventID     = 1000 + $index
            Level       = "Error"
            Message     = $chosenMessage + "`r`n" + $trace
            LogName     = "Application"
        }
        $simulatedEvents += $record
        Write-Log "Simulated Event ID: $($record.EventID), Time: $($record.TimeCreated), Error: $($messagePool[$pick])" -Level "INFO"
        $index++
    }
    return $simulatedEvents
}
# Function to check Event Log for COMExceptions
function Get-COMExceptionEvents {
    <#
    .SYNOPSIS
        Collects events whose message contains "COMException" (case-insensitive) from the
        Application and System logs of a server, for the last MinutesAgo minutes.
    .PARAMETER ComputerName
        Server whose event logs are queried.
    .PARAMETER MinutesAgo
        Size of the look-back window in minutes.
    .OUTPUTS
        Hashtables with TimeCreated, EventID, Level, Message and LogName.
        NOTE: PowerShell unrolls collections on output, so a caller receives $null when nothing
        was found and a single bare hashtable for exactly one event; wrap the call in @(...)
        when a count is needed.
    .NOTES
        With -SimulateError the result of New-SimulatedCOMExceptionEvents is returned instead.
        Each log is queried with Get-WinEvent -ComputerName first; if that fails for a reason
        other than "no events", the same query is retried inside a PowerShell remoting session.
        The session is always removed, even when the remote call throws.
    #>
    param (
        [Parameter(Mandatory = $true)]
        [string]$ComputerName,
        [Parameter(Mandatory = $true)]
        [int]$MinutesAgo
    )
    # If in simulation mode, return simulated events
    if ($SimulateError) {
        return New-SimulatedCOMExceptionEvents
    }
    $noEventsText = "No events were found that match the specified selection criteria"
    try {
        $startTime = (Get-Date).AddMinutes(-$MinutesAgo)
        $events = @()
        Write-Log "Checking for COMException events on $ComputerName in the last $MinutesAgo minutes" -Level "INFO"

        # Test if the computer is reachable
        if (-not (Test-Connection -ComputerName $ComputerName -Count 1 -Quiet -ErrorAction SilentlyContinue)) {
            Write-Log "Server $ComputerName is not reachable, cannot query event log" -Level "ERROR"
            return @()
        }

        # Define logs to search - Application is primary but also check System
        $logsToSearch = @('Application', 'System')
        foreach ($logName in $logsToSearch) {
            $filter = @{
                LogName   = $logName
                StartTime = $startTime
                EndTime   = Get-Date
            }
            try {
                # First attempt - Get-WinEvent with remote computer
                $logEvents = Get-WinEvent -FilterHashtable $filter -ComputerName $ComputerName -ErrorAction Stop
                # Filter for COMException - case insensitive
                $matchingEvents = $logEvents | Where-Object { $_.Message -match "(?i)COMException" }
                # $winEvent rather than $event: the latter is a PowerShell automatic variable
                foreach ($winEvent in $matchingEvents) {
                    $events += @{
                        TimeCreated = $winEvent.TimeCreated
                        EventID     = $winEvent.Id
                        Level       = $winEvent.LevelDisplayName
                        Message     = $winEvent.Message
                        LogName     = $logName
                    }
                    Write-Log "Found COMException in $logName log: Event ID: $($winEvent.Id), Time: $($winEvent.TimeCreated)" -Level "INFO"
                }
            }
            catch {
                # Get-WinEvent reports "nothing matched" as an error; that is a normal outcome
                if ($_.Exception.Message -match $noEventsText) {
                    Write-Log "No events found in $logName log matching criteria" -Level "INFO"
                    continue
                }
                Write-Log "Error querying $logName event log with Get-WinEvent: $_" -Level "WARNING"

                # Second attempt - PowerShell session
                $session = $null
                try {
                    $scriptBlock = {
                        param($filterStart, $filterEnd, $logName)
                        $filter = @{
                            LogName   = $logName
                            StartTime = $filterStart
                            EndTime   = $filterEnd
                        }
                        try {
                            $found = Get-WinEvent -FilterHashtable $filter -ErrorAction Stop
                            return $found | Where-Object { $_.Message -match "(?i)COMException" }
                        }
                        catch {
                            if ($_.Exception.Message -match "No events were found that match the specified selection criteria") {
                                return @()
                            }
                            throw $_
                        }
                    }
                    $session = New-PSSession -ComputerName $ComputerName -ErrorAction Stop
                    $sessionEvents = Invoke-Command -Session $session -ScriptBlock $scriptBlock -ArgumentList $startTime, (Get-Date), $logName
                    foreach ($winEvent in $sessionEvents) {
                        $events += @{
                            TimeCreated = $winEvent.TimeCreated
                            EventID     = $winEvent.Id
                            Level       = $winEvent.LevelDisplayName
                            Message     = $winEvent.Message
                            LogName     = $logName
                        }
                        Write-Log "Found COMException in $logName log via PS Session: Event ID: $($winEvent.Id), Time: $($winEvent.TimeCreated)" -Level "INFO"
                    }
                }
                catch {
                    if ($_.Exception.Message -match $noEventsText) {
                        Write-Log "No events found in $logName log matching criteria (via PS Session)" -Level "INFO"
                    }
                    else {
                        Write-Log "Error querying $logName event log with PowerShell session: $_" -Level "WARNING"
                    }
                }
                finally {
                    # Release the remoting session on every path, including a failed Invoke-Command
                    if ($null -ne $session) {
                        Remove-PSSession -Session $session -ErrorAction SilentlyContinue
                    }
                }
            }
        }

        # Log results
        if ($events.Count -gt 0) {
            Write-Log "Found $($events.Count) COMException events across all logs" -Level "WARNING"
        }
        else {
            Write-Log "No COMException events found in the specified time period" -Level "INFO"
        }
        return $events
    }
    catch {
        Write-Log "Error in event monitoring: $_" -Level "ERROR"
        # Emits nothing on failure (an empty array is unrolled to no output)
        return @()
    }
}
# Function to update error state
function Update-ErrorState {
    <#
    .SYNOPSIS
        Drops stored errors older than $TimeWindowMinutes, appends the new ones, saves the state
        and returns how many errors are now inside the window.
    .PARAMETER State
        State object whose Errors property is rewritten.
    .PARAMETER NewErrors
        Event records exposing TimeCreated (DateTime), EventID, Level, LogName and Message.
        May be passed by parameter or through the pipeline; piped items are accumulated so none
        are lost. $null and empty input are accepted.
    .OUTPUTS
        [int] number of error entries kept in the state.
    .NOTES
        Stored entries whose TimeCreated is missing or unparsable are dropped and counted in a
        WARNING. Stored timestamps may be strings or DateTime values (ConvertFrom-Json on
        PowerShell 6+ converts ISO 8601 strings to DateTime); a DateTime is used as-is instead
        of being re-parsed through a string. Messages longer than 500 characters are truncated
        and suffixed with "...".
        Loop variables avoid the name $error, which is PowerShell's automatic error collection.
    #>
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$State,
        [Parameter(ValueFromPipeline = $true)]
        [object[]]$NewErrors = @()
    )
    begin {
        $incoming = New-Object 'System.Collections.Generic.List[object]'
    }
    process {
        # Runs once per piped item, or once for a directly supplied (or defaulted) argument
        if ($null -eq $NewErrors) {
            Write-Log "No new errors to update (null converted to empty array)" -Level "WARNING"
        }
        else {
            foreach ($item in $NewErrors) {
                if ($null -ne $item) { $incoming.Add($item) }
            }
        }
    }
    end {
        if ($incoming.Count -eq 0) {
            Write-Log "No events found or events returned empty collection - using empty array" -Level "WARNING"
        }
        $now = Get-Date
        $cutoffTime = $now.AddMinutes(-$TimeWindowMinutes)

        # Keep only stored errors that are still inside the time window
        $updatedErrors = New-Object 'System.Collections.Generic.List[object]'
        $invalidEntries = 0
        $oldEntries = 0
        foreach ($storedEntry in $State.Errors) {
            try {
                $rawTime = $storedEntry.TimeCreated
                if ($rawTime -is [DateTime]) {
                    $entryTime = if ($rawTime.Kind -eq [System.DateTimeKind]::Utc) { $rawTime.ToLocalTime() } else { $rawTime }
                }
                else {
                    $entryTime = [DateTime]::Parse($rawTime)
                }
                if ($entryTime -gt $cutoffTime) {
                    $updatedErrors.Add($storedEntry)
                }
                else {
                    $oldEntries++
                }
            }
            catch {
                $invalidEntries++
            }
        }
        if ($invalidEntries -gt 0) {
            Write-Log "Found $invalidEntries invalid date entries in error state" -Level "WARNING"
        }
        if ($oldEntries -gt 0) {
            Write-Log "Removed $oldEntries old entries from error state" -Level "INFO"
        }

        # Add new errors
        foreach ($newEntry in $incoming) {
            # Cast so a missing message becomes "" instead of failing on .Length
            $messageText = [string]$newEntry.Message
            if ($messageText.Length -gt 500) {
                $messageText = $messageText.Substring(0, 500) + "..."
            }
            $updatedErrors.Add(@{
                    # Round-trip ("o") format keeps the timestamp culture-independent in the JSON
                    TimeCreated = $newEntry.TimeCreated.ToString('o')
                    EventID     = $newEntry.EventID
                    Level       = $newEntry.Level
                    LogName     = $newEntry.LogName
                    Message     = $messageText
                })
        }

        # Update and persist the state
        $State.Errors = $updatedErrors.ToArray()
        Save-State -State $State

        # Return count of errors in the current time window
        return $updatedErrors.Count
    }
}
#endregion | |
#region Email Reporting | |
# Function to validate email settings
function Test-EmailSettings {
    <#
    .SYNOPSIS
        Checks whether email reporting is enabled and fully configured.
    .OUTPUTS
        [bool] $false when neither -SendEmailReport nor -GenerateReportOnly is set (silently),
        or when a required setting is missing (an ERROR naming the parameter is logged);
        $true otherwise.
    .NOTES
        $SmtpPassword is a SecureString. Converting it to text always yields the type name, so a
        string whitespace test cannot detect an empty password; its Length is checked instead.
    #>
    if (-not $SendEmailReport -and -not $GenerateReportOnly) {
        # Email reporting is not enabled
        return $false
    }

    # Check required parameters
    if ([string]::IsNullOrWhiteSpace($SmtpServer)) {
        Write-Log "SMTP server is required for email reporting. Use -SmtpServer parameter." -Level "ERROR"
        return $false
    }
    if ([string]::IsNullOrWhiteSpace($EmailFrom)) {
        Write-Log "Sender email address is required. Use -EmailFrom parameter." -Level "ERROR"
        return $false
    }
    # At least one recipient that is not null/blank is needed
    $usableRecipients = @($EmailTo | Where-Object { -not [string]::IsNullOrWhiteSpace($_) })
    if ($usableRecipients.Count -eq 0) {
        Write-Log "Recipient email address(es) required. Use -EmailTo parameter." -Level "ERROR"
        return $false
    }

    # If auth is specified, check credentials
    if (-not [string]::IsNullOrWhiteSpace($SmtpUsername)) {
        if ($null -eq $SmtpPassword -or $SmtpPassword.Length -eq 0) {
            Write-Log "SMTP password is required when username is provided. Use -SmtpPassword parameter." -Level "ERROR"
            return $false
        }
    }
    return $true
}
# Builds the plain-text body of a failover monitor e-mail report.
#
# Parameters:
#   EventType             - Event being reported. Known values are mapped to a
#                           status line; any other value is reported as
#                           "System Status Report".
#   Environment           - Environment name shown in the header and summary.
#   FailoverID            - Optional; adds a "Failover ID" line when non-empty.
#   ErrorCount            - Listed together with ErrorThreshold when greater
#                           than zero and this is not an initial notification.
#   ErrorThreshold        - Threshold shown next to ErrorCount.
#   IsInitialNotification - Forces the status "Failover Operation Started" and
#                           adds the "another notification will follow" note.
#   Reason                - Optional; adds a "Reason" line when non-empty.
#
# Reads $script:tempLogContent (the log lines collected for this execution) to
# find the start banner and to fill the EXECUTION LOG section.
#
# Returns: a single string; every report line is separated by a line feed.
function New-EmailReport {
    param (
        [Parameter(Mandatory = $true)]
        [string]$EventType,
        [Parameter(Mandatory = $true)]
        [string]$Environment,
        [Parameter(Mandatory = $false)]
        [string]$FailoverID = $null,
        [Parameter(Mandatory = $false)]
        [int]$ErrorCount = 0,
        [Parameter(Mandatory = $false)]
        [int]$ErrorThreshold = 0,
        [Parameter(Mandatory = $false)]
        [switch]$IsInitialNotification,
        [Parameter(Mandatory = $false)]
        [string]$Reason = ""
    )

    $reportTime = Get-Date -Format "yyyy-MM-dd HH:mm:ss"

    # Determine overall status; an initial notification overrides the event type
    if ($IsInitialNotification) {
        $overallStatus = "Failover Operation Started"
    }
    else {
        $overallStatus = switch ($EventType) {
            "FailoverInProgress" { "Failover In Progress" }
            "FailoverCompleted" { "Failover Completed Successfully" }
            "FailoverFailed" { "Failover Failed" }
            "ForcedFailover" { "Manual Failover Executed" }
            "Cooldown" { "In Cooldown Period" }
            "ErrorThresholdTriggered" { "Error Threshold Triggered" }
            "ReportingError" { "Reporting Error Occurred" }
            default { "System Status Report" }
        }
    }

    # The report is assembled as a list of lines and joined once at the end, so
    # every field is guaranteed to sit on its own line regardless of which
    # optional fields are present.
    $banner = "========================================================"
    $rule = "--------------------------------------------------------"
    $lines = New-Object 'System.Collections.Generic.List[string]'

    # Header
    $lines.Add($banner)
    $lines.Add("FAILOVER MONITOR REPORT - $Environment")
    $lines.Add($banner)
    $lines.Add("Status: $overallStatus")
    $lines.Add("Report Time: $reportTime")
    if (-not [string]::IsNullOrEmpty($FailoverID)) {
        $lines.Add("Failover ID: $FailoverID")
    }
    $lines.Add("")

    # Summary section
    $lines.Add("EVENT SUMMARY")
    $lines.Add($rule)
    $lines.Add("Environment: $Environment")
    $lines.Add("Event Type: $EventType")
    if (-not [string]::IsNullOrEmpty($Reason)) {
        $lines.Add("Reason: $Reason")
    }
    if (-not $IsInitialNotification -and $ErrorCount -gt 0) {
        $lines.Add("Error Count/Threshold: $ErrorCount/$ErrorThreshold")
    }

    # The execution start time is the leading [timestamp] of the start banner
    # log line; the dot in the version number is escaped so it matches literally.
    $scriptStartTime = "Unknown"
    $startLogEntry = $script:tempLogContent |
        Where-Object { $_ -match "Auto Failover Monitor v1\.2" } |
        Select-Object -First 1
    if ($startLogEntry) {
        $scriptStartTime = $startLogEntry -replace '^\[([^\]]+)\].*', '$1'
    }
    $lines.Add("Execution Start Time: $scriptStartTime")
    $lines.Add("Report Generation Time: $reportTime")
    $lines.Add("")

    # Notification type info (initial notifications only)
    if ($IsInitialNotification) {
        $lines.Add("Notification Type: Initial Failover Notification")
        $lines.Add("Note: A failover operation has been initiated. You will receive another notification when the operation completes.")
        $lines.Add("")
    }

    # Execution log section
    $lines.Add("EXECUTION LOG")
    $lines.Add($rule)
    $logEntries = $script:tempLogContent
    if ($logEntries.Count -gt 0) {
        foreach ($entry in $logEntries) {
            $lines.Add([string]$entry)
        }
    }
    else {
        $lines.Add("No log entries found for this execution.")
    }
    $lines.Add("")

    # Footer
    $lines.Add($banner)
    $lines.Add("This is an automated report from the Auto Failover Monitoring system.")
    $lines.Add("For issues, contact IT Support.")
    $lines.Add("Generated: $reportTime")
    $lines.Add($banner)

    return ($lines -join "`n")
}
# Sends a plain-text failover report by e-mail using the script-level SMTP
# settings ($SmtpServer, $SmtpPort, $EmailFrom, $EmailTo, $EmailCc,
# $SmtpUsername, $SmtpPassword, $EnableSSL).
#
# Parameters:
#   EventType             - Event being reported; selects subject and priority.
#   Environment           - Environment name used as the subject prefix.
#   FailoverID            - Optional; appended to the subject and passed to the body.
#   ErrorCount            - Passed through to the report body.
#   ErrorThreshold        - Passed through to the report body.
#   IsInitialNotification - Marks the "failover started" notification.
#   Reason                - Passed through to the report body.
#
# Throttling: a report is skipped (and $true returned) when another report was
# sent within the last 5 minutes. Initial notifications and the completion
# events (FailoverCompleted / FailoverFailed) bypass the throttle; otherwise the
# completion mail promised by the initial notification would be suppressed
# whenever the failover finishes in under 5 minutes.
#
# Returns: $true when the mail was sent (or deliberately skipped by the
# throttle), $false on validation, preparation or delivery errors.
function Send-FailoverReport {
    param (
        [Parameter(Mandatory = $true)]
        [string]$EventType,
        [Parameter(Mandatory = $true)]
        [string]$Environment,
        [Parameter(Mandatory = $false)]
        [string]$FailoverID = $null,
        [Parameter(Mandatory = $false)]
        [int]$ErrorCount = 0,
        [Parameter(Mandatory = $false)]
        [int]$ErrorThreshold = 0,
        [Parameter(Mandatory = $false)]
        [switch]$IsInitialNotification,
        [Parameter(Mandatory = $false)]
        [string]$Reason = ""
    )

    # Check email settings
    if (-not (Test-EmailSettings)) {
        Write-Log "Email settings validation failed. Report will not be sent." -Level "ERROR"
        return $false
    }

    # Start/completion notifications must always go out, see header comment
    $bypassThrottle = $IsInitialNotification -or (@("FailoverCompleted", "FailoverFailed") -contains $EventType)

    # Prevent sending too frequent reports
    $state = Get-State
    if ($state.LastReportTime) {
        try {
            $lastReportTime = [DateTime]::Parse($state.LastReportTime)
            $minimumInterval = [TimeSpan]::FromMinutes(5) # Minimum 5 minutes between emails
            if ((-not $bypassThrottle) -and (((Get-Date) - $lastReportTime) -lt $minimumInterval)) {
                Write-Log "Skipping report as one was sent recently (within $($minimumInterval.TotalMinutes) minutes)" -Level "INFO"
                return $true # Return as if it was successful
            }
        }
        catch {
            # An unparsable timestamp must not block reporting
            Write-Log "Error parsing last report time: $_" -Level "WARNING"
        }
    }

    try {
        # Generate email subject based on event type and notification type
        $subjectPrefix = "[$Environment]"
        $subject = if ($IsInitialNotification) {
            "$subjectPrefix ALERT: Failover Operation Started"
        }
        else {
            switch ($EventType) {
                "ForcedFailover" { "$subjectPrefix ALERT: Manual Failover Executed" }
                "FailoverInProgress" { "$subjectPrefix ALERT: Failover Operation In Progress" }
                "ErrorThresholdTriggered" { "$subjectPrefix ALERT: Error Threshold Triggered Failover" }
                "FailoverCompleted" { "$subjectPrefix INFO: Failover Completed Successfully" }
                "FailoverFailed" { "$subjectPrefix ALERT: Failover Failed" }
                "Cooldown" { "$subjectPrefix WARNING: In Cooldown Period - Issues Detected" }
                # Error events raised by the monitor itself must not be labelled INFO
                "CriticalError" { "$subjectPrefix ALERT: Critical Monitoring Error" }
                "MonitoringError" { "$subjectPrefix ALERT: Monitoring Error" }
                "MaxAttemptsReached" { "$subjectPrefix WARNING: Maximum Failover Attempts Reached" }
                default { "$subjectPrefix INFO: Failover Monitor Report" }
            }
        }

        # Add failover ID to subject if available
        if (-not [string]::IsNullOrEmpty($FailoverID)) {
            $subject += " [ID: $FailoverID]"
        }

        # Generate text body
        $textBody = New-EmailReport -EventType $EventType -Environment $Environment -FailoverID $FailoverID `
            -ErrorCount $ErrorCount -ErrorThreshold $ErrorThreshold `
            -IsInitialNotification:$IsInitialNotification -Reason $Reason

        # Set email priority: anything that needs operator attention is High
        $highPriorityEvents = @(
            "ErrorThresholdTriggered", "ForcedFailover", "FailoverFailed", "FailoverInProgress",
            "CriticalError", "MonitoringError", "MaxAttemptsReached"
        )
        $priority = if ($IsInitialNotification -or ($highPriorityEvents -contains $EventType)) { "High" } else { "Normal" }

        # Create mail message
        $mailParams = @{
            SmtpServer  = $SmtpServer
            Port        = $SmtpPort
            From        = $EmailFrom
            Subject     = $subject
            Body        = $textBody
            BodyAsHtml  = $false # Plain text email
            Priority    = $priority
            ErrorAction = "Stop" # Turn delivery failures into catchable exceptions
        }

        # Handle To recipients
        if ($EmailTo) {
            # Force array type regardless of item count
            $toRecipients = @($EmailTo)
            $mailParams.Add("To", $toRecipients)
        }
        else {
            Write-Log "No recipients specified in EmailTo parameter" -Level "ERROR"
            return $false
        }

        # Add Cc recipients if specified
        if ($EmailCc) {
            $ccRecipients = @($EmailCc)
            $mailParams.Add("Cc", $ccRecipients)
        }

        # Add credentials if specified
        if (-not [string]::IsNullOrWhiteSpace($SmtpUsername) -and $null -ne $SmtpPassword) {
            try {
                # PSCredential only accepts a SecureString. The declared type of
                # $SmtpPassword is not visible from this function, so a plain
                # string is converted instead of making the constructor fail.
                $securePassword = if ($SmtpPassword -is [System.Security.SecureString]) {
                    $SmtpPassword
                }
                else {
                    ConvertTo-SecureString -String ([string]$SmtpPassword) -AsPlainText -Force
                }
                $credentials = New-Object System.Management.Automation.PSCredential ($SmtpUsername, $securePassword)
                $mailParams.Add("Credential", $credentials)
            }
            catch {
                Write-Log "Error creating credentials for SMTP: $_" -Level "ERROR"
                return $false
            }
        }

        # Add SSL if enabled
        if ($EnableSSL) {
            $mailParams.Add("UseSsl", $true)
            # TLS config for PowerShell v5: add TLS 1.2 to the enabled protocols
            # instead of replacing the process-wide setting.
            [Net.ServicePointManager]::SecurityProtocol = [Net.ServicePointManager]::SecurityProtocol -bor [Net.SecurityProtocolType]::Tls12
        }

        # Send email with enhanced error handling
        try {
            Send-MailMessage @mailParams

            # Update last report time in state
            $state = Get-State
            $state.LastReportTime = (Get-Date).ToString('o')
            $state.ReportsSent++
            Save-State -State $state

            Write-Log "Email report sent successfully to $($toRecipients -join ', ')" -Level "SUCCESS"
            return $true
        }
        catch [System.Net.Mail.SmtpException] {
            Write-Log "SMTP server error: $_" -Level "ERROR"
            Write-Log "Mail parameters: Server=$SmtpServer, Port=$SmtpPort, SSL=$EnableSSL" -Level "ERROR"
            return $false
        }
        catch [System.Net.WebException] {
            Write-Log "Network connectivity issue with mail server: $_" -Level "ERROR"
            return $false
        }
        catch {
            Write-Log "Error sending email report: $_" -Level "ERROR"
            return $false
        }
    }
    catch {
        Write-Log "Error preparing email report: $_" -Level "ERROR"
        return $false
    }
}
# Sends the "failover started" e-mail for the given failover operation.
#
# Parameters:
#   FailoverID - Identifier of the failover operation being announced.
#   Reason     - Why the failover was triggered; included in the report.
#
# Returns: $false when -SendEmailReport is not set, otherwise whatever
# Send-FailoverReport returns for the notification.
function Send-InitialFailoverNotification {
    param (
        [Parameter(Mandatory = $true)]
        [string]$FailoverID,
        [Parameter(Mandatory = $true)]
        [string]$Reason
    )

    if ($SendEmailReport) {
        Write-Log "Sending initial failover notification" -Level "INFO"

        # The event is reported as "FailoverInProgress" and flagged as the
        # initial notification
        $reportArguments = @{
            EventType             = "FailoverInProgress"
            Environment           = $Env
            FailoverID            = $FailoverID
            Reason                = $Reason
            IsInitialNotification = $true
        }
        return (Send-FailoverReport @reportArguments)
    }

    # E-mail reporting is disabled
    return $false
}
# Sends the e-mail that reports the outcome of a failover operation.
#
# Parameters:
#   FailoverID - Identifier of the failover operation that finished.
#   Success    - $true reports "FailoverCompleted", $false "FailoverFailed".
#
# Returns: $false when -SendEmailReport is not set, otherwise whatever
# Send-FailoverReport returns for the notification.
function Send-CompletionFailoverNotification {
    param (
        [Parameter(Mandatory = $true)]
        [string]$FailoverID,
        [Parameter(Mandatory = $true)]
        [bool]$Success
    )

    # E-mail reporting is disabled
    if (-not $SendEmailReport) {
        return $false
    }

    Write-Log "Sending failover completion notification" -Level "INFO"

    # Map the outcome to the event type understood by Send-FailoverReport
    $completionEvent = "FailoverFailed"
    if ($Success) {
        $completionEvent = "FailoverCompleted"
    }

    return (Send-FailoverReport -EventType $completionEvent -Environment $Env -FailoverID $FailoverID)
}
# Generates and sends a report for the current execution via Send-FailoverReport.
#
# Parameters:
#   EventType      - Event to report (default "Monitoring").
#   ErrorCount     - Number of errors to show in the report.
#   ThresholdCount - Threshold shown next to the error count.
#
# Returns: $true when Send-FailoverReport reported success, $false when
# reporting is disabled (neither -SendEmailReport nor -GenerateReportOnly) or
# the report could not be sent.
function Invoke-FailoverReporting {
    param (
        [Parameter(Mandatory = $false)]
        [string]$EventType = "Monitoring",
        [Parameter(Mandatory = $false)]
        [int]$ErrorCount = 0,
        [Parameter(Mandatory = $false)]
        [int]$ThresholdCount = 0
    )

    Write-Log "Starting failover reporting process" -Level "INFO"

    $reportingEnabled = $SendEmailReport -or $GenerateReportOnly
    if (-not $reportingEnabled) {
        Write-Log "Email reporting not enabled, skipping report generation" -Level "INFO"
        return $false
    }

    # Send the report for the script's environment
    $reportArguments = @{
        EventType      = $EventType
        Environment    = $Env
        ErrorCount     = $ErrorCount
        ErrorThreshold = $ThresholdCount
    }
    $sent = Send-FailoverReport @reportArguments

    if (-not $sent) {
        Write-Log "Failed to send email report" -Level "ERROR"
        return $false
    }

    Write-Log "Email report sent successfully" -Level "SUCCESS"
    return $true
}
#endregion | |
#region Failover Management | |
# Runs the complete failover cycle script and records the outcome.
#
# Parameters:
#   Reason - Text describing why the failover was triggered; logged and
#            included in the initial notification.
#
# Behaviour:
#   * A new GUID identifies the operation and is stored in the state as
#     CurrentFailoverID while it runs.
#   * With -TestMode nothing is executed; a successful failover is simulated.
#   * Otherwise $CompleteFailoverScriptPath is invoked with -Env, the optional
#     -dnsServer / -lookupZone and the TTL parameters; all of its output streams
#     are captured through a temporary file and logged.
#   * The run counts as successful when the script reported exit code 0, or
#     when its output contains one of the known success markers.
#   * On success the state gets a new LastFailoverTime, an incremented
#     FailoverCount and an empty error list.
#
# Returns: $true on success, $false otherwise. Results of the notification
# helpers are discarded so they do not leak into this function's output.
function Invoke-CompleteFailoverCycle {
    param (
        [Parameter(Mandatory = $false)]
        [string]$Reason = "Automatic failover triggered by COMException threshold"
    )

    Write-Log "Triggering complete failover cycle: $Reason" -Level "WARNING"

    if (-not (Test-CompleteFailoverScript)) {
        Write-Log "Cannot proceed with failover - CompleteFailoverCycle script not found" -Level "ERROR"
        return $false
    }

    # Generate a unique ID for this failover operation
    $failoverID = [guid]::NewGuid().ToString()
    Write-Log "Starting failover operation with ID: $failoverID" -Level "WARNING"

    # Store the failover ID in state
    $state = Get-State
    $state.CurrentFailoverID = $failoverID
    Save-State -State $state

    # If in test mode, simulate a successful failover
    if ($TestMode) {
        Write-Log "TEST MODE: Simulating failover operation (no actual failover performed)" -Level "WARNING"

        # Send initial notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending initial failover notification" -Level "INFO"
            $null = Send-InitialFailoverNotification -FailoverID $failoverID -Reason "TEST MODE: $Reason"
        }

        Start-Sleep -Seconds 5 # Simulate some processing time

        # Update state with last failover time
        $state = Get-State
        $state.LastFailoverTime = (Get-Date).ToString('o')
        $state.FailoverCount++
        $state.Errors = @() # Clear errors after successful failover
        $state.CurrentFailoverID = $null # Clear current failover ID
        Save-State -State $state

        Write-Log "TEST MODE: Simulated failover completed successfully" -Level "SUCCESS"
        Write-Log "Completed failover operation with ID: $failoverID" -Level "SUCCESS"

        # Send completion notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending failover completion notification" -Level "INFO"
            $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $true
        }
        return $true
    }

    $tempFile = $null
    try {
        # Send initial notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending initial failover notification" -Level "INFO"
            $null = Send-InitialFailoverNotification -FailoverID $failoverID -Reason $Reason
        }

        # Create temp file for output
        $tempFile = [System.IO.Path]::GetTempFileName()

        # Arguments are passed by splatting rather than through a command
        # string, so values containing quotes or spaces cannot break or alter
        # the invocation.
        $scriptArgs = @{
            Env               = $Env
            DefaultTTLMinutes = $DefaultTTLMinutes
            ReducedTTLMinutes = $ReducedTTLMinutes
        }
        if ($dnsServer) {
            $scriptArgs['dnsServer'] = $dnsServer
        }
        if ($lookupZone) {
            $scriptArgs['lookupZone'] = $lookupZone
        }

        # Human-readable form of the invocation, used for logging only
        $scriptCmd = "& '$CompleteFailoverScriptPath'"
        $scriptCmd += " -Env '$Env'"
        if ($dnsServer) {
            $scriptCmd += " -dnsServer '$dnsServer'"
        }
        if ($lookupZone) {
            $scriptCmd += " -lookupZone '$lookupZone'"
        }
        $scriptCmd += " -DefaultTTLMinutes $DefaultTTLMinutes -ReducedTTLMinutes $ReducedTTLMinutes"
        $scriptCmd += " *>&1 | Tee-Object -FilePath '$tempFile'"
        Write-Log "Executing failover: $scriptCmd" -Level "INFO"

        # Log execution start time
        $scriptStartTime = Get-Date
        Write-Log "Script execution started at: $scriptStartTime" -Level "DEBUG"

        # Reset LASTEXITCODE before execution so a value left behind by an
        # earlier command is never mistaken for this run's result. $null means
        # "the script reported no exit code" and falls through to the output
        # marker check below.
        $global:LASTEXITCODE = $null
        $null = & $CompleteFailoverScriptPath @scriptArgs *>&1 | Tee-Object -FilePath $tempFile
        $exitCode = $LASTEXITCODE

        # Log completion time
        $scriptEndTime = Get-Date
        $executionDuration = ($scriptEndTime - $scriptStartTime).TotalSeconds
        Write-Log "Script execution completed at: $scriptEndTime (took $executionDuration seconds)" -Level "DEBUG"

        # Read the captured output and log it for debugging
        $output = Get-Content -Path $tempFile -Raw -ErrorAction SilentlyContinue
        Write-Log "Failover script output: $output" -Level "INFO"

        # Check if failover was successful
        $success = $false
        if ($exitCode -eq 0) {
            $success = $true
        }
        else {
            # Check for success indicators in the output even if exit code is non-zero
            if ($output -match "Complete failover cycle finished" -or
                $output -match "Script completed successfully" -or
                $output -match "First failover successful:" -or
                $output -match "Second failover successful:") {
                $success = $true
                Write-Log "Failover script reported non-zero exit code but appears successful" -Level "WARNING"
            }
        }

        if ($success) {
            Write-Log "Complete failover cycle executed successfully" -Level "SUCCESS"
            Write-Log "Completed failover operation with ID: $failoverID" -Level "SUCCESS"

            # Update state with last failover time
            $state = Get-State
            $state.LastFailoverTime = (Get-Date).ToString('o')
            $state.FailoverCount++
            $state.Errors = @() # Clear errors after successful failover
            $state.CurrentFailoverID = $null # Clear current failover ID

            ## Reset TTL back to standard value after successful failover if needed
            #if ($state.TTLStatus -eq "Reduced") {
            #    $dnsName = if ($Env -eq "Dev") { "LendingWebServerDev" } else { "LendingWebServer" }
            #    Write-Log "Resetting TTL back to standard value ($DefaultTTLMinutes minutes) for $dnsName" -Level "INFO"
            #    if ($dnsServer -and $lookupZone) {
            #        if (Update-DnsTTL -DnsServer $dnsServer -LookupZone $lookupZone -DnsName $dnsName -TTLMinutes $DefaultTTLMinutes) {
            #            $state.TTLStatus = "Standard"
            #            $state.LastTTLChange = (Get-Date).ToString('o')
            #            Write-Log "TTL reset to standard value successfully" -Level "SUCCESS"
            #        }
            #    }
            #    else {
            #        Write-Log "DNS server or lookup zone not specified, skipping TTL reset" -Level "WARNING"
            #    }
            #}

            Save-State -State $state

            # Send completion notification if email reporting is enabled
            if ($SendEmailReport) {
                Write-Log "Sending failover completion notification" -Level "INFO"
                $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $true
            }
            return $true
        }
        else {
            Write-Log "Complete failover cycle failed with exit code: $exitCode" -Level "ERROR"
            Write-Log "Failed failover operation with ID: $failoverID" -Level "ERROR"

            # Update current failover ID state
            $state = Get-State
            $state.CurrentFailoverID = $null # Clear current failover ID
            Save-State -State $state

            # Send completion notification if email reporting is enabled
            if ($SendEmailReport) {
                Write-Log "Sending failover completion notification" -Level "INFO"
                $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $false
            }
            return $false
        }
    }
    catch {
        Write-Log "Error executing complete failover cycle: $_" -Level "ERROR"
        Write-Log "Failed failover operation with ID: $failoverID" -Level "ERROR"

        # Update current failover ID state
        $state = Get-State
        $state.CurrentFailoverID = $null # Clear current failover ID
        Save-State -State $state

        # Send completion notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending failover completion notification" -Level "INFO"
            $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $false
        }
        return $false
    }
    finally {
        # Remove the capture file on every exit path, including exceptions
        if ($tempFile -and (Test-Path -Path $tempFile)) {
            Remove-Item -Path $tempFile -Force -ErrorAction SilentlyContinue
        }
    }
}
#endregion | |
#region Setup and Monitoring | |
# Prepares the environment for automatic failover monitoring.
#
# Steps:
#   1. Verifies the complete failover script via Test-CompleteFailoverScript.
#   2. Calls Get-State once (presumably this creates the state file when it is
#      missing - confirm against Get-State).
#   3. Unless -RunAsService is set, registers the scheduled task
#      "AutoFailoverMonitor_<Env>" that runs this script every 5 minutes.
#      Only -Env is forwarded to the task's command line.
#
# A failure to create the scheduled task is logged as a warning and does not
# fail the initialization.
#
# Returns: $false when the failover script is missing, otherwise $true.
function Initialize-Monitoring {
    Write-Log "Initializing Auto Failover Monitoring environment" -Level "INFO"

    # Validate the CompleteFailoverCycle script exists
    if (-not (Test-CompleteFailoverScript)) {
        Write-Log "Initialization failed - CompleteFailoverCycle script not found" -Level "ERROR"
        return $false
    }

    # Initialize state file; the returned state object itself is not needed here
    $null = Get-State
    Write-Log "State file initialized at: $stateFilePath" -Level "INFO"

    # Create scheduled task for periodic monitoring if not running as service
    if (-not $RunAsService) {
        try {
            # Check if scheduled task already exists
            $taskName = "AutoFailoverMonitor_$Env"
            $task = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue
            if ($task) {
                Write-Log "Scheduled task '$taskName' already exists" -Level "INFO"
            }
            else {
                # Create a scheduled task to run every 5 minutes
                $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$PSCommandPath`" -Env `"$Env`""
                $trigger = New-ScheduledTaskTrigger -Once -At (Get-Date) -RepetitionInterval (New-TimeSpan -Minutes 5)
                $settings = New-ScheduledTaskSettingsSet -ExecutionTimeLimit (New-TimeSpan -Minutes 10) -RestartCount 3

                # -ErrorAction Stop routes a failed registration into the catch
                # block instead of logging a false success; Out-Null keeps the
                # task object out of this function's return value.
                Register-ScheduledTask -TaskName $taskName -Action $action -Trigger $trigger -Settings $settings `
                    -Description "Automatic Failover Monitor for $Env environment" -ErrorAction Stop | Out-Null
                Write-Log "Scheduled task '$taskName' created to run every 5 minutes" -Level "SUCCESS"
            }
        }
        catch {
            Write-Log "Error creating scheduled task: $_" -Level "WARNING"
            Write-Log "You may need to manually create a scheduled task to run this script periodically" -Level "WARNING"
        }
    }

    Write-Log "Initialization completed successfully" -Level "SUCCESS"
    return $true
}
# Runs a single monitoring pass.
#
# Flow:
#   1. Resolve the currently active server (Get-CurrentActiveServer).
#   2. Unless -ForceFailover is set, stop early while the cooldown period is
#      active (optionally sending a cooldown report).
#   3. Collect COMException events from the active server for the last
#      $TimeWindowMinutes minutes and merge them into the persisted state.
#   4. Trigger the complete failover cycle when the error count reaches
#      $ErrorThreshold or -ForceFailover is set; otherwise optionally send an
#      "ErrorDetected" report when at least one error was seen.
#
# Returns: a single boolean - $false when the active server cannot be
# determined or the triggered failover failed, $true otherwise. Results of the
# reporting helpers are discarded so they do not leak into the return value.
function Invoke-MonitoringCheck {
    Write-Log "Starting monitoring check for COMException errors" -Level "INFO"

    # Get DNS name based on environment
    $dnsName = if ($Env -eq "Dev") { "LendingWebServerDev" } else { "LendingWebServer" }
    Write-Log "Using DNS name for $Env environment: $dnsName" -Level "INFO"

    # Get current active server
    $activeServer = Get-CurrentActiveServer
    if (-not $activeServer) {
        Write-Log "Cannot proceed without determining active server" -Level "ERROR"
        return $false
    }

    # Extract server name without domain
    $serverName = Get-HostnameFromFQDN -FQDN $activeServer
    Write-Log "Active server short name: $serverName" -Level "INFO"

    # Get current state
    $state = Get-State

    # Check if we're in a cooldown period (unless force failover is specified)
    if (-not $ForceFailover -and (Test-CooldownPeriod -State $state)) {
        Write-Log "Skipping error check due to cooldown period" -Level "INFO"

        # Send cooldown notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending cooldown notification" -Level "INFO"
            $null = Invoke-FailoverReporting -EventType "Cooldown"
        }
        return $true
    }

    # Check for COMException events on the active server
    $events = Get-COMExceptionEvents -ComputerName $serverName -MinutesAgo $TimeWindowMinutes

    # Update error state with new events
    $errorCount = Update-ErrorState -State $state -NewErrors $events
    Write-Log "Current COMException error count: $errorCount/$ErrorThreshold in last $TimeWindowMinutes minutes" -Level "INFO"

    ## Add TTL Management - When first error is detected (or force failover), reduce TTL to 1 minute
    #if (($errorCount -ge 1 -and $state.TTLStatus -eq "Standard") -or $ForceFailover) {
    #    Write-Log "First COM error detected (or force failover), proactively reducing TTL" -Level "WARNING"
    #    if ($dnsServer -and $lookupZone) {
    #        if (Update-DnsTTL -DnsServer $dnsServer -LookupZone $lookupZone -DnsName $dnsName -TTLMinutes $ReducedTTLMinutes) {
    #            $state.TTLStatus = "Reduced"
    #            $state.LastTTLChange = (Get-Date).ToString('o')
    #            Save-State -State $state
    #            Write-Log "TTL reduced to $ReducedTTLMinutes minute(s) successfully" -Level "SUCCESS"
    #        }
    #    }
    #    else {
    #        Write-Log "DNS server or lookup zone not specified, skipping TTL reduction" -Level "WARNING"
    #    }
    #}

    # Check if threshold is exceeded or force failover is specified
    if ($errorCount -ge $ErrorThreshold -or $ForceFailover) {
        $reason = if ($ForceFailover) {
            "Forced failover requested"
        }
        else {
            "COMException threshold reached ($errorCount events in $TimeWindowMinutes minutes)"
        }

        # Trigger failover. Only the last emitted value is the actual result:
        # if the callee emits additional objects, the captured value becomes a
        # multi-element array, which is always truthy and would report a failed
        # failover as successful.
        $cycleOutput = @(Invoke-CompleteFailoverCycle -Reason $reason)
        $result = [bool]($cycleOutput | Select-Object -Last 1)

        if ($result) {
            Write-Log "Automatic failover successfully completed" -Level "SUCCESS"
        }
        else {
            Write-Log "Automatic failover failed" -Level "ERROR"
        }
        return $result
    }

    Write-Log "Error threshold not reached, no action needed" -Level "INFO"

    # Send status report if email reporting is enabled and at least one error was found
    if ($SendEmailReport -and $errorCount -gt 0) {
        Write-Log "Sending error detection report" -Level "INFO"
        $null = Invoke-FailoverReporting -EventType "ErrorDetected" -ErrorCount $errorCount -ThresholdCount $ErrorThreshold
    }
    return $true
}
# Runs the monitor continuously: one monitoring check per interval, forever.
#
# Parameters:
#   LogRotationCheckIntervalMinutes - Minimum time between Rotate-LogFile calls
#                                     (default 60).
#   CheckIntervalSeconds            - Pause between two monitoring checks
#                                     (default 300, i.e. 5 minutes).
#
# Behaviour:
#   * Refuses to start when the failover script is missing or the active server
#     cannot be determined.
#   * At most 10 failovers are counted per 24-hour period; once the limit is
#     reached the monitoring check is skipped until the counter resets.
#   * Errors inside one cycle are logged (and optionally mailed) and the loop
#     continues with the next cycle.
#
# Returns: $false when the service cannot start or stops on a critical error.
# The main loop itself never returns. Results of the reporting helpers are
# discarded so they are not written to the output stream.
function Start-MonitoringService {
    param (
        [Parameter(Mandatory = $false)]
        [int]$LogRotationCheckIntervalMinutes = 60,
        [Parameter(Mandatory = $false)]
        [int]$CheckIntervalSeconds = 300
    )

    Write-Log "Starting Auto Failover Monitoring service" -Level "INFO"

    # Validate script dependencies before entering loop
    if (-not (Test-CompleteFailoverScript)) {
        Write-Log "Cannot start monitoring service - CompleteFailoverCycle script not found" -Level "ERROR"
        return $false
    }

    # Validate email settings if email reporting is enabled
    if ($SendEmailReport) {
        if (-not (Test-EmailSettings)) {
            Write-Log "Email settings validation failed. Email reporting will be disabled." -Level "WARNING"
            # This creates a variable local to this function; functions called
            # from here resolve $SendEmailReport through the caller's scope and
            # therefore see the disabled value.
            $SendEmailReport = $false
        }
    }

    # Interval expressed in minutes for the log messages (5 for the default)
    $intervalMinutes = [math]::Round($CheckIntervalSeconds / 60, 2)

    try {
        # Initial check to get and validate active server
        $activeServer = Get-CurrentActiveServer
        if (-not $activeServer) {
            Write-Log "Cannot start monitoring service - unable to determine active server" -Level "ERROR"
            return $false
        }

        # Initialize failover counter
        $failoverAttempts = 0
        $maxFailoverAttempts = 10 # Maximum number of failover attempts in a 24-hour period
        $failoverCountResetTime = (Get-Date).AddHours(24)

        # Track last log rotation time
        $lastLogRotationCheck = [DateTime]::MinValue

        Write-Log "Monitoring service started successfully" -Level "SUCCESS"

        # If email reporting is enabled, send a startup notification
        if ($SendEmailReport) {
            Write-Log "Sending service startup notification email" -Level "INFO"
            try {
                $null = Invoke-FailoverReporting -EventType "ServiceStart"
            }
            catch {
                Write-Log "Failed to send startup notification: $_" -Level "WARNING"
            }
        }

        # Main monitoring loop
        while ($true) {
            try {
                $now = Get-Date

                # Rotate log file if needed (only check periodically to reduce filesystem operations)
                if (($now - $lastLogRotationCheck).TotalMinutes -ge $LogRotationCheckIntervalMinutes) {
                    Rotate-LogFile -LogPath $logFilePath
                    $lastLogRotationCheck = $now
                }

                # Check if we need to reset the failover counter
                if ($now -gt $failoverCountResetTime) {
                    $failoverAttempts = 0
                    $failoverCountResetTime = $now.AddHours(24)
                    Write-Log "Failover attempt counter reset" -Level "INFO"
                }

                # Run monitoring check if we haven't exceeded the maximum failover attempts
                if ($failoverAttempts -lt $maxFailoverAttempts) {
                    $null = Invoke-MonitoringCheck

                    # Count a failover only when it was recorded during THIS
                    # cycle (at or after $now). A fixed look-back window longer
                    # than the check interval would count the same failover
                    # again on the following cycle.
                    $state = Get-State
                    if ($state.LastFailoverTime) {
                        $lastFailoverTime = [DateTime]::Parse($state.LastFailoverTime)
                        if ($lastFailoverTime -ge $now) {
                            $failoverAttempts++
                            Write-Log "Failover attempt count: $failoverAttempts/$maxFailoverAttempts in current 24-hour period" -Level "WARNING"
                        }
                    }

                    # Clear the temp log for next run and optimize memory usage
                    $script:tempLogContent = @()
                    [System.GC]::Collect()
                }
                else {
                    $maxAttemptsMessage = "Maximum failover attempts ($maxFailoverAttempts) reached for 24-hour period. Monitoring continues but no failovers will be triggered."
                    Write-Log $maxAttemptsMessage -Level "WARNING"

                    # Send email notification about max attempts reached if enabled
                    if ($SendEmailReport) {
                        try {
                            $null = Invoke-FailoverReporting -EventType "MaxAttemptsReached"
                        }
                        catch {
                            Write-Log "Failed to send max attempts notification: $_" -Level "WARNING"
                        }
                    }
                }

                # Sleep between checks
                Write-Log "Sleeping for $intervalMinutes minutes before next check..." -Level "INFO"
                Start-Sleep -Seconds $CheckIntervalSeconds
            }
            catch {
                Write-Log "Error in monitoring cycle: $_" -Level "ERROR"

                # Send email notification about monitoring error if enabled
                if ($SendEmailReport) {
                    try {
                        $null = Invoke-FailoverReporting -EventType "MonitoringError"
                    }
                    catch {
                        Write-Log "Failed to send error notification: $_" -Level "WARNING"
                    }
                }

                Write-Log "Continuing with next check in $intervalMinutes minutes..." -Level "WARNING"
                Start-Sleep -Seconds $CheckIntervalSeconds
            }
        }
    }
    catch {
        $criticalError = "Critical error in monitoring service: $_"
        Write-Log $criticalError -Level "ERROR"

        # Send email notification about critical error if enabled
        if ($SendEmailReport) {
            try {
                $null = Invoke-FailoverReporting -EventType "CriticalError"
            }
            catch {
                Write-Log "Failed to send critical error notification: $_" -Level "WARNING"
            }
        }

        Write-Log "Monitoring service stopped" -Level "ERROR"
        return $false
    }
}
#endregion | |
# Main execution block
# Logs the start banner, dispatches to the execution mode selected by the
# switches (-Initialize, -GenerateReportOnly, -RunAsService, -ForceFailover,
# -TestMode, or a single monitoring check by default), removes the temporary
# log file and exits with 0, or with 1 after an unhandled exception.
$monitorExitCode = 0
try {
    Write-Log "=======================================================" -Level "INFO"
    Write-Log "Auto Failover Monitor v1.2 with Simplified Email Reporting" -Level "INFO"
    Write-Log "Environment: $Env, Error Threshold: $ErrorThreshold/$TimeWindowMinutes min, Cooldown: $CooldownPeriodMinutes min" -Level "INFO"
    Write-Log "TTL Settings: Default: $DefaultTTLMinutes min, Reduced: $ReducedTTLMinutes min" -Level "INFO"

    # Log email settings if email reporting is enabled
    if ($SendEmailReport) {
        Write-Log "Email reporting enabled - Sending to: $($EmailTo -join ', ')" -Level "INFO"
    }
    Write-Log "=======================================================" -Level "INFO"

    # Handle different execution modes
    if ($Initialize) {
        Initialize-Monitoring
    }
    elseif ($GenerateReportOnly) {
        Write-Log "Generate report only mode - creating report from existing logs" -Level "INFO"
        # The result is not used here; the call is kept because the original
        # flow performs it before reporting (presumably for its log output -
        # confirm against Get-CurrentActiveServer).
        $null = Get-CurrentActiveServer
        Invoke-FailoverReporting -EventType "GenerateReportOnly"
    }
    elseif ($RunAsService) {
        Start-MonitoringService
    }
    elseif ($ForceFailover) {
        Write-Log "Force failover mode enabled - will trigger failover regardless of error state" -Level "WARNING"
        Invoke-MonitoringCheck
    }
    elseif ($TestMode) {
        Write-Log "Test mode enabled - no actual failover will be performed" -Level "WARNING"
        Invoke-MonitoringCheck
    }
    else {
        # Run a single monitoring check
        Invoke-MonitoringCheck
    }
}
catch {
    Write-Log "Unhandled exception in Auto Failover Monitor: $_" -Level "ERROR"
    Write-Log $_.ScriptStackTrace -Level "ERROR"

    # Send email report for critical error if email reporting is enabled
    if ($SendEmailReport) {
        Write-Log "Attempting to send error report via email" -Level "INFO"
        try {
            # Invoke-FailoverReporting has no -Environment parameter (it always
            # reports for $Env); passing one makes parameter binding fail and
            # the critical error report is never sent.
            Invoke-FailoverReporting -EventType "CriticalError"
        }
        catch {
            Write-Log "Failed to send error report: $_" -Level "ERROR"
        }
    }
    $monitorExitCode = 1
}

# Clean up temporary log file on both the success and the failure path; the
# path is checked first because Test-Path rejects an empty -Path value.
if (-not [string]::IsNullOrEmpty($script:tempLogPath) -and (Test-Path -Path $script:tempLogPath)) {
    Remove-Item -Path $script:tempLogPath -Force -ErrorAction SilentlyContinue
}
exit $monitorExitCode
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment