Skip to content

Instantly share code, notes, and snippets.

@davidlu1001
Last active April 1, 2025 04:41
Show Gist options
  • Save davidlu1001/fb5490654922dd811cc22625bc8e223b to your computer and use it in GitHub Desktop.
Save davidlu1001/fb5490654922dd811cc22625bc8e223b to your computer and use it in GitHub Desktop.
autoFailoverMonitorEmail.ps1
# autoFailoverMonitor.ps1
# This script monitors the active server for COMException errors in EventLog
# and automatically triggers a complete failover cycle when threshold is met.
# Enhanced with simplified email reporting capabilities.
# Script parameters. Every parameter is optional; the defaults shown below apply when omitted.
[CmdletBinding()]
param(
# Target environment; only "Dev" or "Prod" are accepted.
[Parameter(Mandatory = $false)]
[ValidateSet("Dev", "Prod")]
[string]$Env = "Dev",
# DNS server; forwarded to dnsFailover_v2.ps1 as -dnsServer when supplied.
[Parameter(Mandatory = $false)]
[string]$dnsServer,
# DNS lookup zone; forwarded to dnsFailover_v2.ps1 as -lookupZone when supplied.
[Parameter(Mandatory = $false)]
[string]$lookupZone,
# Path to completeFailoverCycle.ps1. Its parent directory is also where dnsFailover_v2.ps1 is looked up.
[Parameter(Mandatory = $false)]
[string]$CompleteFailoverScriptPath = "$PSScriptRoot\completeFailoverCycle.ps1",
# Log file that Write-Log appends to.
[Parameter(Mandatory = $false)]
[string]$logFilePath = "$PSScriptRoot\AutoFailoverMonitor.log",
# JSON state file read by Get-State and written by Save-State.
[Parameter(Mandatory = $false)]
[string]$stateFilePath = "$PSScriptRoot\AutoFailoverState.json",
# Number of COMException errors within the time window that triggers a failover (per the help text).
[Parameter(Mandatory = $false)]
[int]$ErrorThreshold = 1,
# Size, in minutes, of the window in which recorded errors are kept and counted.
[Parameter(Mandatory = $false)]
[int]$TimeWindowMinutes = 5,
# Minutes after a failover during which Test-CooldownPeriod reports "in cooldown".
[Parameter(Mandatory = $false)]
[int]$CooldownPeriodMinutes = 45,
# Run as a continuous monitoring service (per the help text).
[Parameter(Mandatory = $false)]
[switch]$RunAsService,
# Force a failover regardless of error count or cooldown (per the help text).
[Parameter(Mandatory = $false)]
[switch]$ForceFailover,
# Initialize the monitoring environment, e.g. create the state file (per the help text).
[Parameter(Mandatory = $false)]
[switch]$Initialize,
# Test mode: no real failover is triggered (per the help text); also enables the placeholder
# active-server fallback in Get-CurrentActiveServer.
[Parameter(Mandatory = $false)]
[switch]$TestMode,
# Makes Get-COMExceptionEvents return simulated events instead of querying the event log.
[Parameter(Mandatory = $false)]
[switch]$SimulateError,
# Print the help text and exit.
[Parameter(Mandatory = $false)]
[switch]$Help,
# Email reporting parameters
# SMTP server; required by Test-EmailSettings when email reporting is enabled.
[Parameter(Mandatory = $false)]
[string]$SmtpServer,
# SMTP port.
[Parameter(Mandatory = $false)]
[int]$SmtpPort = 25,
# Sender address; required by Test-EmailSettings when email reporting is enabled.
[Parameter(Mandatory = $false)]
[string]$EmailFrom,
# Recipient addresses; at least one is required by Test-EmailSettings when email reporting is enabled.
[Parameter(Mandatory = $false)]
[string[]]$EmailTo,
# Carbon-copy addresses.
[Parameter(Mandatory = $false)]
[string[]]$EmailCc,
# Enable SSL for the SMTP connection (per the help text).
[Parameter(Mandatory = $false)]
[switch]$EnableSSL,
# SMTP user name; when set, Test-EmailSettings also requires -SmtpPassword.
[Parameter(Mandatory = $false)]
[string]$SmtpUsername,
# SMTP password. NOTE: this is a SecureString, not a plain string.
[Parameter(Mandatory = $false)]
[System.Security.SecureString]$SmtpPassword,
# Enable email reports for failover events.
[Parameter(Mandatory = $false)]
[switch]$SendEmailReport,
# Only generate a report from existing logs, without monitoring (per the help text).
[Parameter(Mandatory = $false)]
[switch]$GenerateReportOnly,
# Minutes between log rotation checks when running as a service (per the help text).
[Parameter(Mandatory = $false)]
[int]$LogRotationCheckIntervalMinutes = 60,
# DNS TTL, in minutes, used during normal operations (per the help text).
[Parameter(Mandatory = $false)]
[int]$DefaultTTLMinutes = 3,
# DNS TTL, in minutes, used when errors are detected (per the help text).
[Parameter(Mandatory = $false)]
[int]$ReducedTTLMinutes = 1
)
# Add reference to System.Web for HTML encoding
Add-Type -AssemblyName System.Web
# Prints the usage/help text to the host and terminates the script with exit code 0.
# The text is kept in sync with the param() block: -SmtpPassword is a SecureString and
# the default file paths are resolved relative to the script's own directory ($PSScriptRoot).
function Show-Help {
    # NOTE: here-string content and its terminator must stay at column 0.
    $helpText = @"
Automatic Failover Monitor Help
===============================
Description:
This script monitors the active server for COMException errors in the Event Log
and automatically triggers a complete failover cycle when threshold is met.
Enhanced with email reporting capabilities.
Syntax:
.\autoFailoverMonitor.ps1 [-Env <String>] [-dnsServer <String>] [-lookupZone <String>]
[-CompleteFailoverScriptPath <String>] [-logFilePath <String>]
[-stateFilePath <String>] [-ErrorThreshold <Int>]
[-TimeWindowMinutes <Int>] [-CooldownPeriodMinutes <Int>]
[-DefaultTTLMinutes <Int>] [-ReducedTTLMinutes <Int>]
[-LogRotationCheckIntervalMinutes <Int>]
[-RunAsService] [-ForceFailover] [-Initialize] [-TestMode]
[-SimulateError] [-Help]
[-SmtpServer <String>] [-SmtpPort <Int>] [-EmailFrom <String>]
[-EmailTo <String[]>] [-EmailCc <String[]>] [-EnableSSL]
[-SmtpUsername <String>] [-SmtpPassword <SecureString>]
[-SendEmailReport] [-GenerateReportOnly]
Parameters:
-Env <String>
Specifies the environment to operate in.
Valid values: Dev, Prod
Default: Dev
-dnsServer <String>
Specifies the DNS server to use for operations.
-lookupZone <String>
Specifies the DNS lookup zone.
-CompleteFailoverScriptPath <String>
Path to the completeFailoverCycle.ps1 script.
Default: <script directory>\completeFailoverCycle.ps1
-logFilePath <String>
Path to the log file.
Default: <script directory>\AutoFailoverMonitor.log
-stateFilePath <String>
Path to the state file that tracks error occurrences and cooldown period.
Default: <script directory>\AutoFailoverState.json
-ErrorThreshold <Int>
Number of COMException errors that must occur within the time window to trigger failover.
Default: 1
-TimeWindowMinutes <Int>
Time window in minutes within which errors are counted.
Default: 5
-CooldownPeriodMinutes <Int>
Cooldown period in minutes after a failover during which no new failover will be triggered.
Default: 45
-DefaultTTLMinutes <Int>
Standard DNS TTL value in minutes to use during normal operations.
Default: 3
-ReducedTTLMinutes <Int>
Reduced DNS TTL value in minutes to use when errors are detected.
Default: 1
-LogRotationCheckIntervalMinutes <Int>
Interval in minutes between log rotation checks when running as a service.
Default: 60
-RunAsService [Switch]
If specified, the script will run as a continuous monitoring service.
-ForceFailover [Switch]
If specified, forces a failover regardless of error count or cooldown period.
-Initialize [Switch]
If specified, initializes the monitoring environment (creates state file, etc.)
-TestMode [Switch]
If specified, runs in test mode without actually triggering failover.
-SimulateError [Switch]
If specified, simulates COMException errors for testing purposes.
-Help [Switch]
Shows this help message.
Email Reporting Parameters:
-SmtpServer <String>
SMTP server address for sending email reports.
-SmtpPort <Int>
SMTP server port.
Default: 25
-EmailFrom <String>
Email address to send reports from.
-EmailTo <String[]>
Email addresses to send reports to (comma-separated for multiple).
-EmailCc <String[]>
Email addresses to carbon copy on reports (comma-separated for multiple).
-EnableSSL [Switch]
If specified, enables SSL for SMTP connection.
-SmtpUsername <String>
Username for SMTP authentication.
-SmtpPassword <SecureString>
Password for SMTP authentication, as a SecureString (e.g. from Read-Host -AsSecureString).
-SendEmailReport [Switch]
If specified, enables automatic email reports for failover events.
-GenerateReportOnly [Switch]
If specified, only generates a report from existing logs without monitoring.
Examples:
# Show help
.\autoFailoverMonitor.ps1 -Help
# Initialize the monitoring environment
.\autoFailoverMonitor.ps1 -Initialize
# Run the monitor once to check for errors and trigger failover if needed
.\autoFailoverMonitor.ps1 -Env Prod -dnsServer "dns1.company.com" -lookupZone "company.local"
# Run the monitor as a continuous service with email reporting
.\autoFailoverMonitor.ps1 -Env Prod -RunAsService -SendEmailReport -SmtpServer "smtp.company.com" -EmailFrom "[email protected]" -EmailTo "[email protected]"
# Run as service with custom log rotation interval
.\autoFailoverMonitor.ps1 -Env Prod -RunAsService -LogRotationCheckIntervalMinutes 120
# Force a failover and send email report
.\autoFailoverMonitor.ps1 -ForceFailover -SendEmailReport
# Generate a report from existing logs without monitoring
.\autoFailoverMonitor.ps1 -GenerateReportOnly -SmtpServer "smtp.company.com" -EmailFrom "[email protected]" -EmailTo "[email protected]"
Notes:
- This script requires the completeFailoverCycle.ps1 script to be accessible
- Appropriate permissions are required for reading Event Logs and executing the failover script
- All operations are logged to the specified log file
- The script maintains state to track error occurrences and enforce cooldown periods
- Email reporting provides notifications at the start and completion of failover operations
- Email reporting requires valid SMTP server settings
"@
    Write-Host $helpText
    # Help is terminal: nothing else in the script runs after it is shown.
    exit 0
}
# -Help short-circuits everything else (Show-Help exits the script).
if ($Help) { Show-Help }

# From here on: strict variable/property checking, and every error is terminating.
Set-StrictMode -Version Latest
$ErrorActionPreference = [System.Management.Automation.ActionPreference]::Stop

# Pin outbound secure connections to TLS 1.2.
[System.Net.ServicePointManager]::SecurityProtocol = [System.Net.SecurityProtocolType]::Tls12

#region Supporting Functions
# Per-execution log capture: an in-memory list of lines plus a temp file mirror.
$script:tempLogContent = @()
$script:tempLogPath = [System.IO.Path]::GetTempFileName()
# Writes a timestamped, level-tagged message to the console (color-coded), to the
# in-memory per-execution log ($script:tempLogContent), to the main log file
# ($logFilePath) and to the per-execution temp log file ($script:tempLogPath).
#
# Parameters:
#   -Message  Text to log (mandatory, non-empty).
#   -Level    One of INFO, WARNING, ERROR, SUCCESS. Default: INFO.
#
# File write failures never throw; they are reported with Write-Warning so that
# logging problems cannot abort the monitor.
function Write-Log {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$Message,
        [Parameter(Mandatory = $false)]
        [ValidateSet("INFO", "WARNING", "ERROR", "SUCCESS")]
        [string]$Level = "INFO"
    )

    $timestamp = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
    $logMessage = "[$timestamp] [$Level] $Message"

    # Console output, colored by severity.
    $levelColors = @{
        INFO    = 'Cyan'
        WARNING = 'Yellow'
        ERROR   = 'Red'
        SUCCESS = 'Green'
    }
    Write-Host $logMessage -ForegroundColor $levelColors[$Level]

    # Keep an in-memory copy for this execution's email report.
    $script:tempLogContent += $logMessage

    # Main log file.
    try {
        $logDir = Split-Path -Path $logFilePath -Parent
        # A bare file name has no parent ('' would make Test-Path throw); in that case the
        # file lives in the current location and there is no directory to create.
        if ($logDir -and -not (Test-Path -Path $logDir -PathType Container)) {
            New-Item -Path $logDir -ItemType Directory -Force | Out-Null
        }
        Add-Content -Path $logFilePath -Value $logMessage -ErrorAction Stop
    }
    catch {
        Write-Warning "Failed to write to log file: $_"
    }

    # Temp log file: written independently so a main-log failure does not also lose this copy.
    try {
        Add-Content -Path $script:tempLogPath -Value $logMessage -ErrorAction Stop
    }
    catch {
        Write-Warning "Failed to write to temporary log file: $_"
    }
}
# Rotates the log file once it exceeds a size limit and prunes old rotated copies.
#
# Parameters:
#   -LogPath      Path of the active log file (mandatory).
#   -MaxSizeMB    Size limit in MB; the file is rotated only when strictly larger. Default: 10.
#   -FilesToKeep  Number of most recent rotated files to keep. Default: 5.
#
# The rotated copy is named "<base>_<yyyyMMdd-HHmmss><ext>" next to the original.
# Failures during rotation are reported with Write-Warning and do not throw.
function Rotate-LogFile {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$LogPath,
        [Parameter(Mandatory = $false)]
        [int]$MaxSizeMB = 10,
        [Parameter(Mandatory = $false)]
        [int]$FilesToKeep = 5
    )

    if (-not (Test-Path $LogPath)) {
        return
    }

    $logFile = Get-Item $LogPath
    if ($logFile.Length -le ($MaxSizeMB * 1MB)) {
        return
    }

    Write-Log "Log file size limit reached. Rotating logs..." -Level "INFO"

    # Use the resolved file metadata: DirectoryName is never empty (unlike Split-Path on a
    # bare file name) and BaseName keeps dots inside the name ("my.app.log" -> "my.app").
    $directory = $logFile.DirectoryName
    $baseName = $logFile.BaseName
    $extension = if ($logFile.Extension) { $logFile.Extension } else { ".log" }
    $timestamp = Get-Date -Format "yyyyMMdd-HHmmss"
    $newName = Join-Path $directory "$($baseName)_$($timestamp)$extension"

    try {
        # Copy then delete, so the rotated copy exists before the active log is removed.
        Copy-Item -Path $LogPath -Destination $newName -Force
        Remove-Item -Path $LogPath -Force
        Write-Host "Log file rotated to: $newName"

        # Only files following the rotation naming scheme ("<base>_*<ext>") are candidates
        # for deletion, so unrelated files sharing the prefix are left alone.
        $oldLogs = Get-ChildItem -Path $directory -Filter "${baseName}_*$extension" |
            Where-Object { $_.Name -ne $logFile.Name } |
            Sort-Object LastWriteTime -Descending |
            Select-Object -Skip $FilesToKeep
        foreach ($old in $oldLogs) {
            Remove-Item $old.FullName -Force
            Write-Host "Removed old log file: $($old.Name)"
        }
    }
    catch {
        Write-Warning "Failed to rotate log file: $_"
    }
}
# Checks that the script referenced by $CompleteFailoverScriptPath exists.
# Returns $true when the path exists; otherwise logs two ERROR entries and returns $false.
function Test-CompleteFailoverScript {
    $scriptExists = Test-Path -Path $CompleteFailoverScriptPath
    if ($scriptExists) {
        return $true
    }

    Write-Log "CompleteFailoverCycle script not found at: $CompleteFailoverScriptPath" -Level "ERROR"
    Write-Log "Please provide the correct path using -CompleteFailoverScriptPath parameter" -Level "ERROR"
    return $false
}
# Returns the host part of a fully qualified domain name, i.e. everything before the
# first dot. A name without a dot is returned unchanged.
function Get-HostnameFromFQDN {
    param (
        [Parameter(Mandatory = $true)]
        [string]$FQDN
    )

    # Strip from the first '.' to the end of the string.
    $domainSuffixPattern = '\..*$'
    return [regex]::Replace($FQDN, $domainSuffixPattern, '')
}
# Private helper: returns (and logs) a placeholder server name used only when running
# with -TestMode or -SimulateError and the real active host cannot be determined.
# In Dev one of SERVER1/SERVER2 is picked at random; otherwise SERVER1 is used.
function Get-TestFallbackServer {
    if ($Env -eq "Dev") {
        # -Maximum is exclusive, so this yields 1 or 2.
        $serverNumber = Get-Random -Minimum 1 -Maximum 3
        $fallbackHost = "SERVER$serverNumber.$Env.example.com"
    }
    else {
        $fallbackHost = "SERVER1.$Env.example.com"
    }
    Write-Log "Using default test server: $fallbackHost" -Level "WARNING"
    return $fallbackHost
}

# Determines the currently active server by running dnsFailover_v2.ps1 (expected next to
# the complete-failover script) with -Ops check and parsing its output.
#
# Returns the active host name, or $null when the DNS failover script is missing or the
# host cannot be determined. With -TestMode or -SimulateError a placeholder server name
# is returned instead of $null when parsing or execution fails.
function Get-CurrentActiveServer {
    $simulationAllowed = [bool]($TestMode -or $SimulateError)

    try {
        # Determine the path to the dnsFailover script
        $dnsFailoverScriptPath = Join-Path (Split-Path $CompleteFailoverScriptPath -Parent) "dnsFailover_v2.ps1"
        if (-not (Test-Path $dnsFailoverScriptPath)) {
            Write-Log "DNS Failover script not found at: $dnsFailoverScriptPath" -Level "ERROR"
            return $null
        }

        Write-Log "Attempting to determine current active host..." -Level "INFO"

        # Pass arguments by splatting rather than building a command string for
        # Invoke-Expression: values containing quotes or other special characters are
        # bound verbatim and can never be interpreted as code.
        $scriptParams = @{
            Env = $Env
            Ops = 'check'
        }
        $displayArgs = "-Env `"$Env`" -Ops `"check`""
        if ($dnsServer) {
            $scriptParams['dnsServer'] = $dnsServer
            $displayArgs += " -dnsServer `"$dnsServer`""
        }
        if ($lookupZone) {
            $scriptParams['lookupZone'] = $lookupZone
            $displayArgs += " -lookupZone `"$lookupZone`""
        }
        Write-Log "Running command: & '$dnsFailoverScriptPath' $displayArgs" -Level "INFO"

        # Merge every output stream (including Write-Host/information) into one string.
        $output = & $dnsFailoverScriptPath @scriptParams *>&1 | Out-String

        # Log the output for debugging
        Write-Log "Script output: $output" -Level "INFO"

        # Known message formats, tried in priority order; the first match wins.
        $hostPatterns = @(
            'Active host check completed: ([^\s]+)',
            'Active host is reachable: ([^\s]+)',
            'Current active host(?: before failover)?: ([^\s]+)',
            'host(?:name)?\s+obtained: ([^\s]+)',
            'Pinging\s+([^\s]+)\s+',
            'Active host is: ([^\s]+)'
        )
        $activeHost = $null
        foreach ($pattern in $hostPatterns) {
            if ($output -match $pattern) {
                $activeHost = $Matches[1]
                break
            }
        }

        # Placeholder fallback, only for simulation/test purposes.
        if (-not $activeHost -and $simulationAllowed) {
            $activeHost = Get-TestFallbackServer
        }

        if ($activeHost) {
            Write-Log "Current active host: $activeHost" -Level "INFO"
            return $activeHost
        }

        Write-Log "Could not determine active host from script output" -Level "ERROR"
        return $null
    }
    catch {
        Write-Log "Error getting current active host: $_" -Level "ERROR"
        # Placeholder fallback, only for simulation/test purposes.
        if ($simulationAllowed) {
            return Get-TestFallbackServer
        }
        return $null
    }
}
# Sets the TTL of a CNAME record, retrying on failure.
#
# Parameters:
#   -DnsServer                   DNS server to operate on (mandatory).
#   -LookupZone                  Zone containing the record (mandatory).
#   -DnsName                     Name of the CNAME record (mandatory).
#   -TTLMinutes                  Desired TTL in minutes (mandatory).
#   -RetryAttempts               Maximum number of attempts. Default: 2.
#   -RetryWaitSeconds            Pause between attempts. Default: 5.
#   -DnsVerificationWaitSeconds  Pause before re-reading the record to verify. Default: 5.
#
# Returns $true when the record already has, or was verified to have, the requested TTL;
# $false otherwise. Never throws.
#
# The record is modified in place with Set-DnsServerResourceRecord instead of being
# removed and re-added: a failed re-add would otherwise leave the name without any
# record, and the retry could not recover the alias because the record was already gone.
function Update-DnsTTL {
    [CmdletBinding()]
    param (
        [Parameter(Mandatory = $true)]
        [string]$DnsServer,
        [Parameter(Mandatory = $true)]
        [string]$LookupZone,
        [Parameter(Mandatory = $true)]
        [string]$DnsName,
        [Parameter(Mandatory = $true)]
        [int]$TTLMinutes,
        [Parameter(Mandatory = $false)]
        [int]$RetryAttempts = 2,
        [Parameter(Mandatory = $false)]
        [int]$RetryWaitSeconds = 5,
        [Parameter(Mandatory = $false)]
        [int]$DnsVerificationWaitSeconds = 5
    )

    try {
        Write-Log "Attempting to update TTL for $DnsName to $TTLMinutes minutes" -Level "INFO"
        $newTTL = [System.TimeSpan]::FromMinutes($TTLMinutes)

        for ($attempt = 1; $attempt -le $RetryAttempts; $attempt++) {
            try {
                # Get current CNAME record
                $dnsRecord = Get-DnsServerResourceRecord -Name $DnsName -RRType CName -ZoneName $LookupZone -ComputerName $DnsServer -ErrorAction Stop
                if ($null -eq $dnsRecord) {
                    Write-Log "DNS record not found: $DnsName" -Level "ERROR"
                    return $false
                }

                $currentAlias = $dnsRecord.RecordData.HostNameAlias
                $currentTTL = $dnsRecord.TimeToLive.TotalMinutes
                Write-Log "Attempt $attempt : Current record - Alias: $currentAlias, TTL: $currentTTL minutes" -Level "INFO"

                # Only update if the TTL actually needs to change
                if ([Math]::Abs($currentTTL - $TTLMinutes) -lt 0.1) {
                    Write-Log "TTL already set to $TTLMinutes minutes, no update needed" -Level "INFO"
                    return $true
                }

                # Clone the record, change only the TTL, and swap old for new in one call.
                $updatedRecord = $dnsRecord.Clone()
                $updatedRecord.TimeToLive = $newTTL
                Write-Log "Updating DNS record TTL in place" -Level "INFO"
                Set-DnsServerResourceRecord -OldInputObject $dnsRecord -NewInputObject $updatedRecord -ZoneName $LookupZone -ComputerName $DnsServer -ErrorAction Stop

                # Allow time for the change to take effect before verifying
                Start-Sleep -Seconds $DnsVerificationWaitSeconds

                # Verify the update
                $verifyRecord = Get-DnsServerResourceRecord -Name $DnsName -RRType CName -ZoneName $LookupZone -ComputerName $DnsServer -ErrorAction Stop
                $actualTTL = $verifyRecord.TimeToLive.TotalMinutes
                if ([Math]::Abs($actualTTL - $TTLMinutes) -lt 0.1) {
                    Write-Log "Successfully updated TTL from $currentTTL to $actualTTL minutes" -Level "SUCCESS"
                    return $true
                }

                Write-Log "TTL verification failed. Current: $actualTTL, Expected: $TTLMinutes" -Level "WARNING"
                if ($attempt -lt $RetryAttempts) {
                    Write-Log "Will retry TTL update (attempt $attempt of $RetryAttempts)" -Level "WARNING"
                    Start-Sleep -Seconds $RetryWaitSeconds
                }
                else {
                    Write-Log "Maximum retry attempts reached" -Level "ERROR"
                    return $false
                }
            }
            catch {
                Write-Log "Error updating TTL (attempt $attempt of $RetryAttempts): $_" -Level "ERROR"
                if ($attempt -lt $RetryAttempts) {
                    Write-Log "Retrying in $RetryWaitSeconds seconds..." -Level "WARNING"
                    Start-Sleep -Seconds $RetryWaitSeconds
                }
                else {
                    Write-Log "Failed to update TTL after all retry attempts" -Level "ERROR"
                    return $false
                }
            }
        }

        # Reached only when no attempt was made (RetryAttempts < 1): report failure
        # explicitly instead of returning nothing.
        return $false
    }
    catch {
        Write-Log "Failed to update DNS TTL: $_" -Level "ERROR"
        return $false
    }
}
# Loads the monitor state from $stateFilePath, creating the file with default values
# when it does not exist.
#
# Returns a PSCustomObject with the properties Errors, LastFailoverTime, FailoverCount,
# LastReportTime, ReportsSent, TTLStatus, LastTTLChange and CurrentFailoverID. The same
# object type is returned on every path (fresh defaults as well as state read from disk),
# so callers can treat the result uniformly.
#
# Properties missing from an existing file are added with their default values and the
# file is rewritten. If the file cannot be read or parsed, a "<file>.bak" copy is
# attempted and the default state is returned.
function Get-State {
    $defaultState = [PSCustomObject]@{
        Errors            = @()
        LastFailoverTime  = $null
        FailoverCount     = 0
        LastReportTime    = $null
        ReportsSent       = 0
        TTLStatus         = "Standard"
        LastTTLChange     = $null
        CurrentFailoverID = $null
    }

    # Check if state file exists
    if (-not (Test-Path $stateFilePath)) {
        # Create parent directory if it doesn't exist. A bare file name has no parent
        # (empty string), in which case there is nothing to create.
        $stateDir = Split-Path -Path $stateFilePath -Parent
        if ($stateDir -and -not (Test-Path -Path $stateDir -PathType Container)) {
            try {
                New-Item -Path $stateDir -ItemType Directory -Force | Out-Null
            }
            catch {
                Write-Log "Failed to create state directory: $_" -Level "ERROR"
                return $defaultState
            }
        }

        # Create new state file
        try {
            $defaultState | ConvertTo-Json -Depth 5 | Set-Content -Path $stateFilePath
            Write-Log "Initialized new state file at $stateFilePath" -Level "INFO"
        }
        catch {
            Write-Log "Failed to create state file: $_" -Level "WARNING"
        }
        return $defaultState
    }

    # Try to read existing state file
    try {
        $stateContent = Get-Content -Path $stateFilePath -Raw -ErrorAction Stop
        $state = $stateContent | ConvertFrom-Json -ErrorAction Stop

        # Back-fill any property the file does not have yet (e.g. written by an older version).
        $missingProperties = @()
        foreach ($prop in $defaultState.PSObject.Properties.Name) {
            if (-not (Get-Member -InputObject $state -Name $prop -MemberType Properties)) {
                $missingProperties += $prop
                Add-Member -InputObject $state -MemberType NoteProperty -Name $prop -Value $defaultState.$prop
            }
        }
        if ($missingProperties.Count -gt 0) {
            Write-Log "Added missing properties to state file: $($missingProperties -join ', ')" -Level "WARNING"
            $state | ConvertTo-Json -Depth 5 | Set-Content -Path $stateFilePath
        }
        return $state
    }
    catch {
        Write-Log "Error reading state file: $_" -Level "ERROR"
        # Try to create backup of corrupted state file
        try {
            $backupPath = "$stateFilePath.bak"
            Copy-Item -Path $stateFilePath -Destination $backupPath -Force
            Write-Log "Created backup of corrupted state file at $backupPath" -Level "WARNING"
        }
        catch {
            Write-Log "Failed to backup corrupted state file: $_" -Level "ERROR"
        }
        # Return default state
        return $defaultState
    }
}
# Serializes the given state object to JSON (depth 5) and writes it to $stateFilePath.
# Success is logged at INFO level; any failure is logged at ERROR level and swallowed.
function Save-State {
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$State
    )

    try {
        $serializedState = $State | ConvertTo-Json -Depth 5
        Set-Content -Path $stateFilePath -Value $serializedState
        Write-Log "State saved successfully" -Level "INFO"
    }
    catch {
        Write-Log "Error saving state: $_" -Level "ERROR"
    }
}
# Reports whether the monitor is still inside the cooldown window that follows the
# last failover.
#
# Parameters:
#   -State  State object whose LastFailoverTime is inspected (mandatory).
#
# Returns $true when now is earlier than LastFailoverTime + $CooldownPeriodMinutes (a
# WARNING with the remaining minutes is logged); $false when no failover has been
# recorded, when the cooldown has elapsed, or when the timestamp cannot be interpreted.
function Test-CooldownPeriod {
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$State
    )

    $recordedFailover = $State.LastFailoverTime
    if ($null -eq $recordedFailover) {
        return $false
    }
    # A blank timestamp means "no failover recorded", not a parse error.
    if ($recordedFailover -is [string] -and [string]::IsNullOrWhiteSpace($recordedFailover)) {
        return $false
    }

    try {
        if ($recordedFailover -is [DateTime]) {
            # Some PowerShell versions' ConvertFrom-Json already yields DateTime for ISO
            # timestamps. Re-parsing its string form is culture-dependent, so use the value
            # directly and normalize UTC to local time for comparison with Get-Date.
            $lastFailover = if ($recordedFailover.Kind -eq [System.DateTimeKind]::Utc) {
                $recordedFailover.ToLocalTime()
            }
            else {
                $recordedFailover
            }
        }
        else {
            $lastFailover = [DateTime]::Parse([string]$recordedFailover)
        }

        $cooldownEndTime = $lastFailover.AddMinutes($CooldownPeriodMinutes)
        $now = Get-Date
        if ($now -lt $cooldownEndTime) {
            $minutesRemaining = [math]::Ceiling(($cooldownEndTime - $now).TotalMinutes)
            Write-Log "Currently in cooldown period. $minutesRemaining minutes remaining before next possible failover." -Level "WARNING"
            return $true
        }
    }
    catch {
        Write-Log "Error calculating cooldown period: $_" -Level "ERROR"
        # If there's an error parsing the date, assume we're not in cooldown
        return $false
    }
    return $false
}
#endregion
#region Event Monitoring
# Builds a list of fake COMException event entries for testing.
#
# Parameters:
#   -Count  Number of events to generate. Default: 3.
#
# Each entry is a hashtable with TimeCreated, EventID, Level, Message and LogName.
# Entry i is dated i*2 minutes before now, has EventID 1000+i, and carries a randomly
# chosen COMException message followed by a fixed stack trace. Every entry is logged.
function New-SimulatedCOMExceptionEvents {
    param (
        [Parameter(Mandatory = $false)]
        [int]$Count = 3
    )

    Write-Log "Simulating $Count COMException events for testing" -Level "INFO"

    $simulatedEvents = @()
    $referenceTime = Get-Date

    # Messages modelled on real COMException texts
    $comExceptionMessages = @(
        "System.Runtime.InteropServices.COMException: Exception from HRESULT: 0x80010105 (RPC_E_SERVERFAULT)",
        "System.Runtime.InteropServices.COMException: The RPC server is unavailable. (Exception from HRESULT: 0x800706BA)",
        "System.Runtime.InteropServices.COMException: The object invoked has disconnected from its clients. (Exception from HRESULT: 0x80010108)",
        "System.Runtime.InteropServices.COMException: Class not registered (Exception from HRESULT: 0x80040154)",
        "System.Runtime.InteropServices.COMException: The interface is unknown. (Exception from HRESULT: 0x80004002)"
    )

    # The stack trace is identical for every event (here-string must stay at column 0).
    $stackTrace = @"
at MyApp.ServiceProxy.ExecuteRequest()
at MyApp.Controller.ProcessCommand()
at MyApp.Program.Main()
"@

    $index = 0
    while ($index -lt $Count) {
        $pick = Get-Random -Minimum 0 -Maximum $comExceptionMessages.Count
        $entry = @{
            TimeCreated = $referenceTime.AddMinutes( - ($index * 2))
            EventID     = 1000 + $index
            Level       = "Error"
            Message     = $comExceptionMessages[$pick] + "`r`n" + $stackTrace
            LogName     = "Application"
        }
        $simulatedEvents += $entry
        Write-Log "Simulated Event ID: $($entry.EventID), Time: $($entry.TimeCreated), Error: $($comExceptionMessages[$pick])" -Level "INFO"
        $index++
    }

    return $simulatedEvents
}
# Private helper: converts an event log record into the plain hashtable shape used by
# the rest of the script (TimeCreated, EventID, Level, Message, LogName).
function ConvertTo-COMExceptionEventInfo {
    param (
        [Parameter(Mandatory = $true)]
        $EventRecord,
        [Parameter(Mandatory = $true)]
        [string]$LogName
    )
    return @{
        TimeCreated = $EventRecord.TimeCreated
        EventID     = $EventRecord.Id
        Level       = $EventRecord.LevelDisplayName
        Message     = $EventRecord.Message
        LogName     = $LogName
    }
}

# Collects events whose message mentions "COMException" (case-insensitive) from the
# Application and System logs of a remote computer.
#
# Parameters:
#   -ComputerName  Computer whose event logs are searched (mandatory).
#   -MinutesAgo    How far back to search, in minutes (mandatory).
#
# Returns the matching events as hashtables (TimeCreated, EventID, Level, Message,
# LogName); nothing when there are none, the computer is unreachable, or an unexpected
# error occurs. With -SimulateError, simulated events are returned instead.
#
# Each log is queried with Get-WinEvent -ComputerName first; if that fails for a reason
# other than "no events", the query is retried inside a PowerShell remoting session.
function Get-COMExceptionEvents {
    param (
        [Parameter(Mandatory = $true)]
        [string]$ComputerName,
        [Parameter(Mandatory = $true)]
        [int]$MinutesAgo
    )

    # If in simulation mode, return simulated events
    if ($SimulateError) {
        return New-SimulatedCOMExceptionEvents
    }

    $noEventsMessage = "No events were found that match the specified selection criteria"

    try {
        $startTime = (Get-Date).AddMinutes(-$MinutesAgo)
        $events = @()
        Write-Log "Checking for COMException events on $ComputerName in the last $MinutesAgo minutes" -Level "INFO"

        # Test if the computer is reachable
        if (-not (Test-Connection -ComputerName $ComputerName -Count 1 -Quiet -ErrorAction SilentlyContinue)) {
            Write-Log "Server $ComputerName is not reachable, cannot query event log" -Level "ERROR"
            return @()
        }

        # Application is the primary log, but System is checked as well
        foreach ($logName in @('Application', 'System')) {
            $filter = @{
                LogName   = $logName
                StartTime = $startTime
                EndTime   = Get-Date
            }

            try {
                # First attempt - Get-WinEvent with remote computer
                $logEvents = Get-WinEvent -FilterHashtable $filter -ComputerName $ComputerName -ErrorAction Stop
                $matchingRecords = $logEvents | Where-Object { $_.Message -match "(?i)COMException" }
                foreach ($record in $matchingRecords) {
                    $events += ConvertTo-COMExceptionEventInfo -EventRecord $record -LogName $logName
                    Write-Log "Found COMException in $logName log: Event ID: $($record.Id), Time: $($record.TimeCreated)" -Level "INFO"
                }
            }
            catch {
                # "No events" is a normal outcome. Check the error id too, because the
                # message text is localized on non-English systems.
                $noEvents = ($_.FullyQualifiedErrorId -like 'NoMatchingEventsFound*') -or
                    ($_.Exception.Message -match $noEventsMessage)
                if ($noEvents) {
                    Write-Log "No events found in $logName log matching criteria" -Level "INFO"
                    continue
                }
                Write-Log "Error querying $logName event log with Get-WinEvent: $_" -Level "WARNING"

                # Second attempt - run the query inside a PowerShell session on the target
                $remoteQuery = {
                    param($filterStart, $filterEnd, $logName)
                    $remoteFilter = @{
                        LogName   = $logName
                        StartTime = $filterStart
                        EndTime   = $filterEnd
                    }
                    try {
                        $remoteEvents = Get-WinEvent -FilterHashtable $remoteFilter -ErrorAction Stop
                        return $remoteEvents | Where-Object { $_.Message -match "(?i)COMException" }
                    }
                    catch {
                        if (($_.FullyQualifiedErrorId -like 'NoMatchingEventsFound*') -or
                            ($_.Exception.Message -match "No events were found that match the specified selection criteria")) {
                            return @()
                        }
                        throw $_
                    }
                }

                $session = $null
                try {
                    $session = New-PSSession -ComputerName $ComputerName -ErrorAction Stop
                    $sessionEvents = Invoke-Command -Session $session -ScriptBlock $remoteQuery -ArgumentList $startTime, (Get-Date), $logName
                    foreach ($record in $sessionEvents) {
                        $events += ConvertTo-COMExceptionEventInfo -EventRecord $record -LogName $logName
                        Write-Log "Found COMException in $logName log via PS Session: Event ID: $($record.Id), Time: $($record.TimeCreated)" -Level "INFO"
                    }
                }
                catch {
                    if ($_.Exception.Message -match $noEventsMessage) {
                        Write-Log "No events found in $logName log matching criteria (via PS Session)" -Level "INFO"
                    }
                    else {
                        Write-Log "Error querying $logName event log with PowerShell session: $_" -Level "WARNING"
                    }
                }
                finally {
                    # Always release the session, including when Invoke-Command throws.
                    if ($null -ne $session) {
                        Remove-PSSession $session -ErrorAction SilentlyContinue
                    }
                }
            }
        }

        # Log results
        if ($events.Count -gt 0) {
            Write-Log "Found $($events.Count) COMException events across all logs" -Level "WARNING"
        }
        else {
            Write-Log "No COMException events found in the specified time period" -Level "INFO"
        }
        return $events
    }
    catch {
        Write-Log "Error in event monitoring: $_" -Level "ERROR"
        # Ensure we return an empty array rather than null
        return @()
    }
}
# Merges newly observed errors into the persisted error list and drops entries that
# have fallen out of the time window.
#
# Parameters:
#   -State      State object whose Errors list is updated in place (mandatory).
#   -NewErrors  New error entries (objects/hashtables with TimeCreated [DateTime],
#               EventID, Level, LogName, Message). Default: empty.
#
# Stored entries older than $TimeWindowMinutes, or with an unreadable TimeCreated, are
# removed. New entries are stored with an ISO 8601 ("o") timestamp and a message
# truncated to 500 characters plus "...". The state is saved via Save-State.
#
# Returns the number of errors now inside the time window.
function Update-ErrorState {
    param (
        [Parameter(Mandatory = $true)]
        [PSCustomObject]$State,
        [Parameter(ValueFromPipeline = $true)]
        [object[]]$NewErrors = @()
    )

    # Defensive check - make sure NewErrors is not null and is an array
    if ($null -eq $NewErrors) {
        $NewErrors = @()
        Write-Log "No new errors to update (null converted to empty array)" -Level "WARNING"
    }
    # If NewErrors is an empty array or empty collection, log warning but continue
    if ($NewErrors.Count -eq 0) {
        Write-Log "No events found or events returned empty collection - using empty array" -Level "WARNING"
    }

    $now = Get-Date
    $cutoffTime = $now.AddMinutes(-$TimeWindowMinutes)

    # Keep only stored errors that are still inside the time window.
    # (Loop variables deliberately avoid the name $error, which is PowerShell's
    # automatic error collection.)
    $updatedErrors = @()
    $invalidEntries = 0
    $oldEntries = 0
    foreach ($storedError in $State.Errors) {
        try {
            $rawTime = $storedError.TimeCreated
            if ($rawTime -is [DateTime]) {
                # Already a DateTime (some ConvertFrom-Json versions convert ISO strings);
                # normalize UTC to local time so it compares correctly with Get-Date.
                $errorTime = if ($rawTime.Kind -eq [System.DateTimeKind]::Utc) { $rawTime.ToLocalTime() } else { $rawTime }
            }
            else {
                $errorTime = [DateTime]::Parse($rawTime)
            }

            if ($errorTime -gt $cutoffTime) {
                $updatedErrors += $storedError
            }
            else {
                $oldEntries++
            }
        }
        catch {
            $invalidEntries++
        }
    }
    if ($invalidEntries -gt 0) {
        Write-Log "Found $invalidEntries invalid date entries in error state" -Level "WARNING"
    }
    if ($oldEntries -gt 0) {
        Write-Log "Removed $oldEntries old entries from error state" -Level "INFO"
    }

    # Add new errors (loop only processes if NewErrors has elements)
    foreach ($newError in $NewErrors) {
        # A record without message text becomes an empty string instead of failing on .Length.
        $messageText = [string]$newError.Message
        if ($messageText.Length -gt 500) {
            $messageText = $messageText.Substring(0, 500) + "..."
        }
        $updatedErrors += @{
            TimeCreated = $newError.TimeCreated.ToString('o')
            EventID     = $newError.EventID
            Level       = $newError.Level
            LogName     = $newError.LogName
            Message     = $messageText
        }
    }

    # Update and persist the state
    $State.Errors = $updatedErrors
    Save-State -State $State

    # Return count of errors in the current time window
    return $updatedErrors.Count
}
#endregion
#region Email Reporting
# Validates the email-related script parameters.
#
# Returns $false (silently) when neither -SendEmailReport nor -GenerateReportOnly is
# set. Otherwise returns $true only if -SmtpServer, -EmailFrom and at least one
# -EmailTo address are present and, when -SmtpUsername is given, a non-empty
# -SmtpPassword is supplied; each failed check is logged at ERROR level.
function Test-EmailSettings {
    if (-not $SendEmailReport -and -not $GenerateReportOnly) {
        # Email reporting is not enabled
        return $false
    }

    # Check required parameters
    if ([string]::IsNullOrWhiteSpace($SmtpServer)) {
        Write-Log "SMTP server is required for email reporting. Use -SmtpServer parameter." -Level "ERROR"
        return $false
    }
    if ([string]::IsNullOrWhiteSpace($EmailFrom)) {
        Write-Log "Sender email address is required. Use -EmailFrom parameter." -Level "ERROR"
        return $false
    }
    if ($null -eq $EmailTo -or @($EmailTo).Count -eq 0) {
        Write-Log "Recipient email address(es) required. Use -EmailTo parameter." -Level "ERROR"
        return $false
    }

    # If auth is specified, check credentials
    if (-not [string]::IsNullOrWhiteSpace($SmtpUsername)) {
        # $SmtpPassword is a SecureString: a string-based emptiness test would see its type
        # name rather than its content, so check for null and zero length explicitly.
        if ($null -eq $SmtpPassword -or $SmtpPassword.Length -eq 0) {
            Write-Log "SMTP password is required when username is provided. Use -SmtpPassword parameter." -Level "ERROR"
            return $false
        }
    }
    return $true
}
# Builds the plain-text body of a failover report from the current execution's log.
#
# Parameters:
#   -EventType              Event identifier (e.g. FailoverCompleted, FailoverFailed, Cooldown);
#                           drives the status line (mandatory).
#   -Environment            Environment name shown in the header and summary (mandatory).
#   -FailoverID             Optional failover identifier; shown when non-empty.
#   -ErrorCount             Error count; shown with -ErrorThreshold when > 0 and this is
#                           not an initial notification.
#   -ErrorThreshold         Threshold shown next to -ErrorCount.
#   -IsInitialNotification  Marks the report as the "failover started" notification.
#   -Reason                 Optional free-text reason; shown when non-empty.
#
# Returns the report as a single string. The report is assembled line by line and joined
# with newlines, so every field sits on its own line.
function New-EmailReport {
    param (
        [Parameter(Mandatory = $true)]
        [string]$EventType,
        [Parameter(Mandatory = $true)]
        [string]$Environment,
        [Parameter(Mandatory = $false)]
        [string]$FailoverID = $null,
        [Parameter(Mandatory = $false)]
        [int]$ErrorCount = 0,
        [Parameter(Mandatory = $false)]
        [int]$ErrorThreshold = 0,
        [Parameter(Mandatory = $false)]
        [switch]$IsInitialNotification,
        [Parameter(Mandatory = $false)]
        [string]$Reason = ""
    )

    $reportTime = Get-Date -Format "yyyy-MM-dd HH:mm:ss"
    $doubleRule = "========================================================"
    $singleRule = "--------------------------------------------------------"

    # Determine overall status based on event type
    $overallStatus = "Monitoring"
    if ($IsInitialNotification) {
        $overallStatus = "Failover Operation Started"
    }
    else {
        switch ($EventType) {
            "FailoverInProgress" { $overallStatus = "Failover In Progress" }
            "FailoverCompleted" { $overallStatus = "Failover Completed Successfully" }
            "FailoverFailed" { $overallStatus = "Failover Failed" }
            "ForcedFailover" { $overallStatus = "Manual Failover Executed" }
            "Cooldown" { $overallStatus = "In Cooldown Period" }
            "ErrorThresholdTriggered" { $overallStatus = "Error Threshold Triggered" }
            "ReportingError" { $overallStatus = "Reporting Error Occurred" }
            default { $overallStatus = "System Status Report" }
        }
    }

    # Log lines captured during this execution
    $logEntries = @($script:tempLogContent)

    # The script start time is the timestamp of the startup banner line, if it was logged.
    $scriptStartTime = "Unknown"
    $startLogEntry = $logEntries | Where-Object { $_ -match "Auto Failover Monitor v1\.2" } | Select-Object -First 1
    if ($startLogEntry) {
        $scriptStartTime = $startLogEntry -replace '^\[([^\]]+)\].*', '$1'
    }

    $lines = New-Object 'System.Collections.Generic.List[string]'

    # Header
    $lines.Add($doubleRule)
    $lines.Add("FAILOVER MONITOR REPORT - $Environment")
    $lines.Add($doubleRule)
    $lines.Add("Status: $overallStatus")
    $lines.Add("Report Time: $reportTime")
    if (-not [string]::IsNullOrEmpty($FailoverID)) {
        $lines.Add("Failover ID: $FailoverID")
    }
    $lines.Add("")

    # Summary section
    $lines.Add("EVENT SUMMARY")
    $lines.Add($singleRule)
    $lines.Add("Environment: $Environment")
    $lines.Add("Event Type: $EventType")
    if (-not [string]::IsNullOrEmpty($Reason)) {
        $lines.Add("Reason: $Reason")
    }
    if (-not $IsInitialNotification -and $ErrorCount -gt 0) {
        $lines.Add("Error Count/Threshold: $ErrorCount/$ErrorThreshold")
    }
    $lines.Add("Execution Start Time: $scriptStartTime")
    $lines.Add("Report Generation Time: $reportTime")
    if ($IsInitialNotification) {
        $lines.Add("")
        $lines.Add("Notification Type: Initial Failover Notification")
        $lines.Add("Note: A failover operation has been initiated. You will receive another notification when the operation completes.")
    }
    $lines.Add("")

    # Execution log
    $lines.Add("EXECUTION LOG")
    $lines.Add($singleRule)
    if ($logEntries.Count -gt 0) {
        foreach ($entry in $logEntries) {
            $lines.Add([string]$entry)
        }
    }
    else {
        $lines.Add("No log entries found for this execution.")
    }
    $lines.Add("")

    # Footer
    $lines.Add($doubleRule)
    $lines.Add("This is an automated report from the Auto Failover Monitoring system.")
    $lines.Add("For issues, contact IT Support.")
    $lines.Add("Generated: $reportTime")
    $lines.Add($doubleRule)

    return ($lines -join "`n")
}
# Function to send email report
#
# Builds the subject, plain-text body and priority for a failover event and
# sends it with Send-MailMessage, using the script-level SMTP settings
# ($SmtpServer, $SmtpPort, $EmailFrom, $EmailTo, $EmailCc, $SmtpUsername,
# $SmtpPassword, $EnableSSL).
#
# Parameters:
#   EventType             - Event identifier; selects the subject line and priority.
#   Environment           - Environment label used as the subject prefix.
#   FailoverID            - Optional failover operation ID appended to the subject.
#   ErrorCount            - Error count forwarded to New-EmailReport.
#   ErrorThreshold        - Error threshold forwarded to New-EmailReport.
#   IsInitialNotification - Marks the "failover started" mail (always High priority).
#   Reason                - Optional reason text forwarded to New-EmailReport.
#
# Returns $true when the mail was sent, or was intentionally skipped by the
# rate limiter; $false on validation, preparation or delivery failure.
function Send-FailoverReport {
    param (
        [Parameter(Mandatory = $true)]
        [string]$EventType,
        [Parameter(Mandatory = $true)]
        [string]$Environment,
        [Parameter(Mandatory = $false)]
        [string]$FailoverID = $null,
        [Parameter(Mandatory = $false)]
        [int]$ErrorCount = 0,
        [Parameter(Mandatory = $false)]
        [int]$ErrorThreshold = 0,
        [Parameter(Mandatory = $false)]
        [switch]$IsInitialNotification,
        [Parameter(Mandatory = $false)]
        [string]$Reason = ""
    )

    # Check email settings
    if (-not (Test-EmailSettings)) {
        Write-Log "Email settings validation failed. Report will not be sent." -Level "ERROR"
        return $false
    }

    # Prevent sending too frequent reports.
    # The initial notification and the terminal outcome of a failover are never
    # throttled: the outcome mail normally follows the initial mail within the
    # minimum interval, and the initial mail tells recipients to expect it.
    $terminalEvents = @("FailoverCompleted", "FailoverFailed")
    $bypassThrottle = $IsInitialNotification -or ($terminalEvents -contains $EventType)
    $state = Get-State
    if ($state.LastReportTime -and -not $bypassThrottle) {
        try {
            $lastReportTime = [DateTime]::Parse($state.LastReportTime)
            $minimumInterval = [TimeSpan]::FromMinutes(5) # Minimum 5 minutes between emails
            if (((Get-Date) - $lastReportTime) -lt $minimumInterval) {
                Write-Log "Skipping report as one was sent recently (within $($minimumInterval.TotalMinutes) minutes)" -Level "INFO"
                return $true # Return as if it was successful
            }
        }
        catch {
            # An unparsable timestamp must not block reporting; fall through and send.
            Write-Log "Error parsing last report time: $_" -Level "WARNING"
        }
    }

    try {
        # Generate email subject based on event type and notification type
        $subjectPrefix = "[$Environment]"
        $subject = if ($IsInitialNotification) {
            "$subjectPrefix ALERT: Failover Operation Started"
        }
        else {
            switch ($EventType) {
                "ForcedFailover" { "$subjectPrefix ALERT: Manual Failover Executed" }
                "FailoverInProgress" { "$subjectPrefix ALERT: Failover Operation In Progress" }
                "ErrorThresholdTriggered" { "$subjectPrefix ALERT: Error Threshold Triggered Failover" }
                "FailoverCompleted" { "$subjectPrefix INFO: Failover Completed Successfully" }
                "FailoverFailed" { "$subjectPrefix ALERT: Failover Failed" }
                "Cooldown" { "$subjectPrefix WARNING: In Cooldown Period - Issues Detected" }
                default { "$subjectPrefix INFO: Failover Monitor Report" }
            }
        }

        # Add failover ID to subject if available
        if (-not [string]::IsNullOrEmpty($FailoverID)) {
            $subject += " [ID: $FailoverID]"
        }

        # Generate text body
        $textBody = New-EmailReport -EventType $EventType -Environment $Environment -FailoverID $FailoverID `
            -ErrorCount $ErrorCount -ErrorThreshold $ErrorThreshold `
            -IsInitialNotification:$IsInitialNotification -Reason $Reason

        # Set email priority
        $priority = if ($IsInitialNotification) {
            "High"
        }
        else {
            switch ($EventType) {
                { $_ -match "ErrorThresholdTriggered|ForcedFailover|FailoverFailed" } { "High" }
                "FailoverInProgress" { "High" }
                "FailoverCompleted" { "Normal" }
                "Cooldown" { "Normal" }
                default { "Normal" }
            }
        }

        # Create mail message
        $mailParams = @{
            SmtpServer  = $SmtpServer
            Port        = $SmtpPort
            From        = $EmailFrom
            Subject     = $subject
            Body        = $textBody
            BodyAsHtml  = $false # Set to false for plain text email
            Priority    = $priority
            ErrorAction = "Stop" # Make delivery failures catchable below
        }

        # Handle To recipients
        if ($EmailTo) {
            # Force array type regardless of item count
            $toRecipients = @($EmailTo)
            $mailParams.Add("To", $toRecipients)
        }
        else {
            Write-Log "No recipients specified in EmailTo parameter" -Level "ERROR"
            return $false
        }

        # Add Cc recipients if specified
        if ($EmailCc) {
            # Force array type regardless of item count
            $ccRecipients = @($EmailCc)
            $mailParams.Add("Cc", $ccRecipients)
        }

        # Add credentials if specified
        if (-not [string]::IsNullOrWhiteSpace($SmtpUsername) -and $null -ne $SmtpPassword) {
            try {
                $credentials = New-Object System.Management.Automation.PSCredential ($SmtpUsername, $SmtpPassword)
                $mailParams.Add("Credential", $credentials)
            }
            catch {
                Write-Log "Error creating credentials for SMTP: $_" -Level "ERROR"
                return $false
            }
        }

        # Add SSL if enabled
        if ($EnableSSL) {
            $mailParams.Add("UseSsl", $true)
            # TLS config for PowerShell v5
            [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
        }

        # Send email with enhanced error handling
        try {
            Send-MailMessage @mailParams

            # Update last report time in state (re-read so concurrent state changes are kept)
            $state = Get-State
            $state.LastReportTime = (Get-Date).ToString('o')
            $state.ReportsSent++
            Save-State -State $state

            Write-Log "Email report sent successfully to $($toRecipients -join ', ')" -Level "SUCCESS"
            return $true
        }
        catch [System.Net.Mail.SmtpException] {
            Write-Log "SMTP server error: $_" -Level "ERROR"
            Write-Log "Mail parameters: Server=$SmtpServer, Port=$SmtpPort, SSL=$EnableSSL" -Level "ERROR"
            return $false
        }
        catch [System.Net.WebException] {
            Write-Log "Network connectivity issue with mail server: $_" -Level "ERROR"
            return $false
        }
        catch {
            Write-Log "Error sending email report: $_" -Level "ERROR"
            return $false
        }
    }
    catch {
        Write-Log "Error preparing email report: $_" -Level "ERROR"
        return $false
    }
}
# Function to generate and send initial failover notification
#
# Sends the "failover started" email for the given failover operation.
#   FailoverID - ID of the failover operation being announced.
#   Reason     - Human-readable reason the failover was triggered.
# Returns $false when email reporting is disabled; otherwise whatever
# Send-FailoverReport returns.
function Send-InitialFailoverNotification {
    param (
        [Parameter(Mandatory = $true)]
        [string]$FailoverID,
        [Parameter(Mandatory = $true)]
        [string]$Reason
    )

    if ($SendEmailReport) {
        Write-Log "Sending initial failover notification" -Level "INFO"

        # Splat the arguments for the initial (unthrottled, high-priority) notification
        $reportArguments = @{
            EventType             = "FailoverInProgress"
            Environment           = $Env
            FailoverID            = $FailoverID
            Reason                = $Reason
            IsInitialNotification = $true
        }
        return Send-FailoverReport @reportArguments
    }

    # Email reporting is switched off: nothing to send
    return $false
}
# Function to generate and send failover completion notification
#
# Sends the outcome email for a finished failover operation.
#   FailoverID - ID of the failover operation that finished.
#   Success    - $true selects the "FailoverCompleted" event, $false "FailoverFailed".
# Returns $false when email reporting is disabled; otherwise whatever
# Send-FailoverReport returns.
function Send-CompletionFailoverNotification {
    param (
        [Parameter(Mandatory = $true)]
        [string]$FailoverID,
        [Parameter(Mandatory = $true)]
        [bool]$Success
    )

    if ($SendEmailReport) {
        Write-Log "Sending failover completion notification" -Level "INFO"

        # Map the outcome flag onto the event name understood by Send-FailoverReport
        $outcomeEvent = "FailoverFailed"
        if ($Success) {
            $outcomeEvent = "FailoverCompleted"
        }

        return Send-FailoverReport -EventType $outcomeEvent -Environment $Env -FailoverID $FailoverID
    }

    # Email reporting is switched off: nothing to send
    return $false
}
# Function to generate and send report from existing logs
#
# Sends a report email for the given event through Send-FailoverReport, using
# the script-level $Env as the environment label.
#   EventType      - Event identifier passed to Send-FailoverReport (default "Monitoring").
#   ErrorCount     - Number of errors observed.
#   ThresholdCount - Error threshold, passed on as -ErrorThreshold.
# Returns $true when the report was sent, $false when reporting is disabled
# or sending failed.
function Invoke-FailoverReporting {
    param (
        [Parameter(Mandatory = $false)]
        [string]$EventType = "Monitoring",
        [Parameter(Mandatory = $false)]
        [int]$ErrorCount = 0,
        [Parameter(Mandatory = $false)]
        [int]$ThresholdCount = 0
    )

    Write-Log "Starting failover reporting process" -Level "INFO"

    # Reporting runs only if emails are enabled or a report was explicitly requested
    if (-not ($SendEmailReport -or $GenerateReportOnly)) {
        Write-Log "Email reporting not enabled, skipping report generation" -Level "INFO"
        return $false
    }

    # Send the report
    $reportArguments = @{
        EventType      = $EventType
        Environment    = $Env
        ErrorCount     = $ErrorCount
        ErrorThreshold = $ThresholdCount
    }
    $wasSent = Send-FailoverReport @reportArguments

    if (-not $wasSent) {
        Write-Log "Failed to send email report" -Level "ERROR"
        return $false
    }

    Write-Log "Email report sent successfully" -Level "SUCCESS"
    return $true
}
#endregion
#region Failover Management
# Function to trigger failover
#
# Runs the complete failover cycle script ($CompleteFailoverScriptPath) for the
# current environment, records the outcome in the state file and, when email
# reporting is enabled, sends an initial and a completion notification.
# In test mode ($TestMode) no script is executed; a successful failover is
# simulated instead.
#
#   Reason - Text describing why the failover was triggered (logged and emailed).
#
# Returns a single boolean: $true when the failover succeeded (or was
# simulated), $false otherwise.
function Invoke-CompleteFailoverCycle {
    param (
        [Parameter(Mandatory = $false)]
        [string]$Reason = "Automatic failover triggered by COMException threshold"
    )

    Write-Log "Triggering complete failover cycle: $Reason" -Level "WARNING"
    if (-not (Test-CompleteFailoverScript)) {
        Write-Log "Cannot proceed with failover - CompleteFailoverCycle script not found" -Level "ERROR"
        return $false
    }

    # Generate a unique ID for this failover operation
    $failoverID = [guid]::NewGuid().ToString()
    Write-Log "Starting failover operation with ID: $failoverID" -Level "WARNING"

    # Store the failover ID in state
    $state = Get-State
    $state.CurrentFailoverID = $failoverID
    Save-State -State $state

    # NOTE: every notification call below is assigned to $null. The notification
    # helpers return a boolean; left uncaptured it would be emitted ahead of this
    # function's own return value, turning the result into an array that callers
    # always evaluate as $true.

    # If in test mode, simulate a successful failover
    if ($TestMode) {
        Write-Log "TEST MODE: Simulating failover operation (no actual failover performed)" -Level "WARNING"

        # Send initial notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending initial failover notification" -Level "INFO"
            $null = Send-InitialFailoverNotification -FailoverID $failoverID -Reason "TEST MODE: $Reason"
        }

        Start-Sleep -Seconds 5 # Simulate some processing time

        # Update state with last failover time
        $state = Get-State
        $state.LastFailoverTime = (Get-Date).ToString('o')
        $state.FailoverCount++
        $state.Errors = @() # Clear errors after successful failover
        $state.CurrentFailoverID = $null # Clear current failover ID
        Save-State -State $state

        Write-Log "TEST MODE: Simulated failover completed successfully" -Level "SUCCESS"
        Write-Log "Completed failover operation with ID: $failoverID" -Level "SUCCESS"

        # Send completion notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending failover completion notification" -Level "INFO"
            $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $true
        }
        return $true
    }

    $tempFile = $null
    try {
        # Send initial notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending initial failover notification" -Level "INFO"
            $null = Send-InitialFailoverNotification -FailoverID $failoverID -Reason $Reason
        }

        # Create temp file for output
        $tempFile = [System.IO.Path]::GetTempFileName()

        # Wrap a value in single quotes, doubling embedded single quotes so that
        # paths such as C:\Users\O'Brien\... survive Invoke-Expression intact.
        $quoteArgument = { param([string]$Value) "'" + $Value.Replace("'", "''") + "'" }

        # Build powershell command with proper argument escaping
        $scriptCmd = "& " + (& $quoteArgument $CompleteFailoverScriptPath)
        $scriptCmd += " -Env " + (& $quoteArgument $Env)

        # Add optional parameters with proper escaping
        if ($dnsServer) {
            $scriptCmd += " -dnsServer " + (& $quoteArgument $dnsServer)
        }
        if ($lookupZone) {
            $scriptCmd += " -lookupZone " + (& $quoteArgument $lookupZone)
        }

        # Add TTL parameters
        $scriptCmd += " -DefaultTTLMinutes $DefaultTTLMinutes -ReducedTTLMinutes $ReducedTTLMinutes"

        # Redirect all output streams to a file
        $scriptCmd += " *>&1 | Tee-Object -FilePath " + (& $quoteArgument $tempFile)

        # Log the command
        Write-Log "Executing failover: $scriptCmd" -Level "INFO"

        # Log execution start time
        $scriptStartTime = Get-Date
        Write-Log "Script execution started at: $scriptStartTime" -Level "DEBUG"

        # Reset LASTEXITCODE before execution so a value left behind by an earlier
        # command in this session cannot be mistaken for this run's exit code.
        $global:LASTEXITCODE = $null

        # Execute the command (console output is discarded; the tee'd file is read below)
        $null = Invoke-Expression $scriptCmd
        $exitCode = $LASTEXITCODE

        # Log completion time
        $scriptEndTime = Get-Date
        $executionDuration = ($scriptEndTime - $scriptStartTime).TotalSeconds
        Write-Log "Script execution completed at: $scriptEndTime (took $executionDuration seconds)" -Level "DEBUG"

        # Read the captured output
        $output = Get-Content -Path $tempFile -Raw -ErrorAction SilentlyContinue

        # Log output for debugging
        Write-Log "Failover script output: $output" -Level "INFO"

        # Check if failover was successful
        $success = $false
        if ($exitCode -eq 0) {
            $success = $true
        }
        else {
            # Check for success indicators in the output even if exit code is non-zero
            # (this is also the path taken when the script set no exit code at all)
            if ($output -match "Complete failover cycle finished" -or
                $output -match "Script completed successfully" -or
                $output -match "First failover successful:" -or
                $output -match "Second failover successful:") {
                $success = $true
                Write-Log "Failover script reported non-zero exit code but appears successful" -Level "WARNING"
            }
        }

        if ($success) {
            Write-Log "Complete failover cycle executed successfully" -Level "SUCCESS"
            Write-Log "Completed failover operation with ID: $failoverID" -Level "SUCCESS"

            # Update state with last failover time
            $state = Get-State
            $state.LastFailoverTime = (Get-Date).ToString('o')
            $state.FailoverCount++
            $state.Errors = @() # Clear errors after successful failover
            $state.CurrentFailoverID = $null # Clear current failover ID

            ## Reset TTL back to standard value after successful failover if needed
            #if ($state.TTLStatus -eq "Reduced") {
            #    $dnsName = if ($Env -eq "Dev") { "LendingWebServerDev" } else { "LendingWebServer" }
            #    Write-Log "Resetting TTL back to standard value ($DefaultTTLMinutes minutes) for $dnsName" -Level "INFO"
            #    if ($dnsServer -and $lookupZone) {
            #        if (Update-DnsTTL -DnsServer $dnsServer -LookupZone $lookupZone -DnsName $dnsName -TTLMinutes $DefaultTTLMinutes) {
            #            $state.TTLStatus = "Standard"
            #            $state.LastTTLChange = (Get-Date).ToString('o')
            #            Write-Log "TTL reset to standard value successfully" -Level "SUCCESS"
            #        }
            #    }
            #    else {
            #        Write-Log "DNS server or lookup zone not specified, skipping TTL reset" -Level "WARNING"
            #    }
            #}

            Save-State -State $state

            # Send completion notification if email reporting is enabled
            if ($SendEmailReport) {
                Write-Log "Sending failover completion notification" -Level "INFO"
                $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $true
            }
            return $true
        }
        else {
            Write-Log "Complete failover cycle failed with exit code: $exitCode" -Level "ERROR"
            Write-Log "Failed failover operation with ID: $failoverID" -Level "ERROR"

            # Update current failover ID state
            $state = Get-State
            $state.CurrentFailoverID = $null # Clear current failover ID
            Save-State -State $state

            # Send completion notification if email reporting is enabled
            if ($SendEmailReport) {
                Write-Log "Sending failover completion notification" -Level "INFO"
                $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $false
            }
            return $false
        }
    }
    catch {
        Write-Log "Error executing complete failover cycle: $_" -Level "ERROR"
        Write-Log "Failed failover operation with ID: $failoverID" -Level "ERROR"

        # Update current failover ID state
        $state = Get-State
        $state.CurrentFailoverID = $null # Clear current failover ID
        Save-State -State $state

        # Send completion notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending failover completion notification" -Level "INFO"
            $null = Send-CompletionFailoverNotification -FailoverID $failoverID -Success $false
        }
        return $false
    }
    finally {
        # Always remove the capture file, including when execution threw
        if ($tempFile -and (Test-Path -LiteralPath $tempFile)) {
            Remove-Item -LiteralPath $tempFile -Force -ErrorAction SilentlyContinue
        }
    }
}
#endregion
#region Setup and Monitoring
# Function to initialize the monitoring environment
#
# Verifies that the failover script is present, touches the state file via
# Get-State and, unless running as a service ($RunAsService), registers a
# scheduled task named "AutoFailoverMonitor_<Env>" that re-runs this script
# every 5 minutes. A failure to create the task is logged as a warning and
# does not fail initialization.
#
# Returns a single boolean: $false when the failover script is missing,
# $true otherwise.
function Initialize-Monitoring {
    Write-Log "Initializing Auto Failover Monitoring environment" -Level "INFO"

    # Validate the CompleteFailoverCycle script exists
    if (-not (Test-CompleteFailoverScript)) {
        Write-Log "Initialization failed - CompleteFailoverCycle script not found" -Level "ERROR"
        return $false
    }

    # Initialize state file. The returned state is not needed here; the call is
    # made for its side effect (presumably Get-State creates the file when it is
    # missing - confirm against Get-State).
    $null = Get-State
    Write-Log "State file initialized at: $stateFilePath" -Level "INFO"

    # Create scheduled task for periodic monitoring if not running as service
    if (-not $RunAsService) {
        try {
            # Check if scheduled task already exists
            $taskName = "AutoFailoverMonitor_$Env"
            $task = Get-ScheduledTask -TaskName $taskName -ErrorAction SilentlyContinue
            if ($task) {
                Write-Log "Scheduled task '$taskName' already exists" -Level "INFO"
            }
            else {
                # Create a scheduled task to run every 5 minutes.
                # NOTE(review): only -Env is forwarded; other parameters of the
                # current invocation (DNS, email, thresholds) are not - confirm intended.
                $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -ExecutionPolicy Bypass -File `"$PSCommandPath`" -Env `"$Env`""
                $trigger = New-ScheduledTaskTrigger -Once -At (Get-Date) -RepetitionInterval (New-TimeSpan -Minutes 5)
                $settings = New-ScheduledTaskSettingsSet -ExecutionTimeLimit (New-TimeSpan -Minutes 10) -RestartCount 3

                # -ErrorAction Stop: registration failures must reach the catch block
                # instead of falling through to the success message.
                # $null = ...: keep the registered-task object out of this function's
                # return value.
                $null = Register-ScheduledTask -TaskName $taskName -Action $action -Trigger $trigger -Settings $settings -Description "Automatic Failover Monitor for $Env environment" -ErrorAction Stop
                Write-Log "Scheduled task '$taskName' created to run every 5 minutes" -Level "SUCCESS"
            }
        }
        catch {
            Write-Log "Error creating scheduled task: $_" -Level "WARNING"
            Write-Log "You may need to manually create a scheduled task to run this script periodically" -Level "WARNING"
        }
    }

    Write-Log "Initialization completed successfully" -Level "SUCCESS"
    return $true
}
# Function to run the monitor once
#
# Performs one monitoring pass: resolves the active server, honours the
# cooldown period, collects COMException events for the last
# $TimeWindowMinutes minutes and triggers a complete failover cycle when the
# count reaches $ErrorThreshold or $ForceFailover is set.
#
# Returns a single boolean: $false when the active server cannot be determined
# or the triggered failover failed; $true otherwise (including "in cooldown"
# and "threshold not reached").
function Invoke-MonitoringCheck {
    Write-Log "Starting monitoring check for COMException errors" -Level "INFO"

    # Get DNS name based on environment
    $dnsName = if ($Env -eq "Dev") { "LendingWebServerDev" } else { "LendingWebServer" }
    Write-Log "Using DNS name for $Env environment: $dnsName" -Level "INFO"

    # Get current active server
    $activeServer = Get-CurrentActiveServer
    if (-not $activeServer) {
        Write-Log "Cannot proceed without determining active server" -Level "ERROR"
        return $false
    }

    # Extract server name without domain
    $serverName = Get-HostnameFromFQDN -FQDN $activeServer
    Write-Log "Active server short name: $serverName" -Level "INFO"

    # Get current state
    $state = Get-State

    # Check if we're in a cooldown period (unless force failover is specified)
    if (-not $ForceFailover -and (Test-CooldownPeriod -State $state)) {
        Write-Log "Skipping error check due to cooldown period" -Level "INFO"

        # Send cooldown notification if email reporting is enabled
        if ($SendEmailReport) {
            Write-Log "Sending cooldown notification" -Level "INFO"
            # Discard the reporting result so it is not emitted as part of this
            # function's return value.
            $null = Invoke-FailoverReporting -EventType "Cooldown"
        }
        return $true
    }

    # Check for COMException events on the active server
    $events = Get-COMExceptionEvents -ComputerName $serverName -MinutesAgo $TimeWindowMinutes

    # Update error state with new events
    $errorCount = Update-ErrorState -State $state -NewErrors $events
    Write-Log "Current COMException error count: $errorCount/$ErrorThreshold in last $TimeWindowMinutes minutes" -Level "INFO"

    ## Add TTL Management - When first error is detected (or force failover), reduce TTL to 1 minute
    #if (($errorCount -ge 1 -and $state.TTLStatus -eq "Standard") -or $ForceFailover) {
    #    Write-Log "First COM error detected (or force failover), proactively reducing TTL" -Level "WARNING"
    #    if ($dnsServer -and $lookupZone) {
    #        if (Update-DnsTTL -DnsServer $dnsServer -LookupZone $lookupZone -DnsName $dnsName -TTLMinutes $ReducedTTLMinutes) {
    #            $state.TTLStatus = "Reduced"
    #            $state.LastTTLChange = (Get-Date).ToString('o')
    #            Save-State -State $state
    #            Write-Log "TTL reduced to $ReducedTTLMinutes minute(s) successfully" -Level "SUCCESS"
    #        }
    #    }
    #    else {
    #        Write-Log "DNS server or lookup zone not specified, skipping TTL reduction" -Level "WARNING"
    #    }
    #}

    # Check if threshold is exceeded or force failover is specified
    if ($errorCount -ge $ErrorThreshold -or $ForceFailover) {
        $reason = if ($ForceFailover) {
            "Forced failover requested"
        }
        else {
            "COMException threshold reached ($errorCount events in $TimeWindowMinutes minutes)"
        }

        # Trigger failover.
        # Only the last emitted object is the callee's return value: it may emit
        # intermediate pipeline output first, and a multi-element array would
        # always test as $true, reporting a failed failover as successful.
        $failoverOutput = @(Invoke-CompleteFailoverCycle -Reason $reason)
        $result = [bool]$failoverOutput[-1]

        if ($result) {
            Write-Log "Automatic failover successfully completed" -Level "SUCCESS"
        }
        else {
            Write-Log "Automatic failover failed" -Level "ERROR"
        }
        return $result
    }
    else {
        Write-Log "Error threshold not reached, no action needed" -Level "INFO"

        # Send status report if email reporting is enabled and at least one error was found
        if ($SendEmailReport -and $errorCount -gt 0) {
            Write-Log "Sending error detection report" -Level "INFO"
            $null = Invoke-FailoverReporting -EventType "ErrorDetected" -ErrorCount $errorCount -ThresholdCount $ErrorThreshold
        }
        return $true
    }
}
# Function to run continuous monitoring as a service
#
# Runs Invoke-MonitoringCheck in an endless loop with a 5-minute pause between
# passes, rotating the log file periodically and capping the number of
# successful failovers counted per 24-hour window at 10.
#
#   LogRotationCheckIntervalMinutes - Minimum minutes between log-rotation checks.
#
# Under normal operation the loop never exits. Returns $false when the service
# cannot start (missing failover script, unknown active server) or when a
# critical error escapes the loop.
function Start-MonitoringService {
    param (
        [Parameter(Mandatory = $false)]
        [int]$LogRotationCheckIntervalMinutes = 60
    )

    Write-Log "Starting Auto Failover Monitoring service" -Level "INFO"

    # Validate script dependencies before entering loop
    if (-not (Test-CompleteFailoverScript)) {
        Write-Log "Cannot start monitoring service - CompleteFailoverCycle script not found" -Level "ERROR"
        return $false
    }

    # Validate email settings if email reporting is enabled
    if ($SendEmailReport) {
        if (-not (Test-EmailSettings)) {
            Write-Log "Email settings validation failed. Email reporting will be disabled." -Level "WARNING"
            # This creates a function-local variable; because PowerShell scopes are
            # dynamic, functions called from this one also see the $false value.
            $SendEmailReport = $false
        }
    }

    try {
        # Initial check to get and validate active server
        $activeServer = Get-CurrentActiveServer
        if (-not $activeServer) {
            Write-Log "Cannot start monitoring service - unable to determine active server" -Level "ERROR"
            return $false
        }

        # Initialize failover counter
        $failoverAttempts = 0
        $maxFailoverAttempts = 10 # Maximum number of failover attempts in a 24-hour period
        $failoverCountResetTime = (Get-Date).AddHours(24)

        # Track last log rotation time
        $lastLogRotationCheck = [DateTime]::MinValue

        Write-Log "Monitoring service started successfully" -Level "SUCCESS"

        # If email reporting is enabled, send a startup notification
        if ($SendEmailReport) {
            Write-Log "Sending service startup notification email" -Level "INFO"
            try {
                $null = Invoke-FailoverReporting -EventType "ServiceStart"
            }
            catch {
                Write-Log "Failed to send startup notification: $_" -Level "WARNING"
            }
        }

        # Main monitoring loop
        while ($true) {
            try {
                $now = Get-Date

                # Rotate log file if needed (only check periodically to reduce filesystem operations)
                if (($now - $lastLogRotationCheck).TotalMinutes -ge $LogRotationCheckIntervalMinutes) {
                    Rotate-LogFile -LogPath $logFilePath
                    $lastLogRotationCheck = $now
                }

                # Check if we need to reset the failover counter
                if ($now -gt $failoverCountResetTime) {
                    $failoverAttempts = 0
                    $failoverCountResetTime = $now.AddHours(24)
                    Write-Log "Failover attempt counter reset" -Level "INFO"
                }

                # Run monitoring check if we haven't exceeded the maximum failover attempts
                if ($failoverAttempts -lt $maxFailoverAttempts) {
                    $null = Invoke-MonitoringCheck

                    # If a failover was triggered and successful, increment the counter.
                    # Only a failover recorded during THIS pass (timestamp at or after
                    # the pass start) is counted. A fixed look-back window longer than
                    # the 5-minute loop interval would count the same failover on two
                    # consecutive passes, and would also count failovers that happened
                    # before the service started.
                    $state = Get-State
                    if ($state.LastFailoverTime) {
                        $lastFailoverTime = [DateTime]::Parse($state.LastFailoverTime)
                        if ($lastFailoverTime -ge $now) {
                            $failoverAttempts++
                            Write-Log "Failover attempt count: $failoverAttempts/$maxFailoverAttempts in current 24-hour period" -Level "WARNING"
                        }
                    }

                    # Clear the temp log for next run and optimize memory usage
                    $script:tempLogContent = @()
                    [System.GC]::Collect()
                }
                else {
                    $maxAttemptsMessage = "Maximum failover attempts ($maxFailoverAttempts) reached for 24-hour period. Monitoring continues but no failovers will be triggered."
                    Write-Log $maxAttemptsMessage -Level "WARNING"

                    # Send email notification about max attempts reached if enabled
                    if ($SendEmailReport) {
                        try {
                            $null = Invoke-FailoverReporting -EventType "MaxAttemptsReached"
                        }
                        catch {
                            Write-Log "Failed to send max attempts notification: $_" -Level "WARNING"
                        }
                    }
                }

                # Sleep for 5 minutes between checks
                Write-Log "Sleeping for 5 minutes before next check..." -Level "INFO"
                Start-Sleep -Seconds 300
            }
            catch {
                Write-Log "Error in monitoring cycle: $_" -Level "ERROR"

                # Send email notification about monitoring error if enabled
                if ($SendEmailReport) {
                    try {
                        $null = Invoke-FailoverReporting -EventType "MonitoringError"
                    }
                    catch {
                        Write-Log "Failed to send error notification: $_" -Level "WARNING"
                    }
                }

                Write-Log "Continuing with next check in 5 minutes..." -Level "WARNING"
                Start-Sleep -Seconds 300
            }
        }
    }
    catch {
        $criticalError = "Critical error in monitoring service: $_"
        Write-Log $criticalError -Level "ERROR"

        # Send email notification about critical error if enabled
        if ($SendEmailReport) {
            try {
                $null = Invoke-FailoverReporting -EventType "CriticalError"
            }
            catch {
                Write-Log "Failed to send critical error notification: $_" -Level "WARNING"
            }
        }

        Write-Log "Monitoring service stopped" -Level "ERROR"
        return $false
    }
}
#endregion
# Main execution block
#
# Logs the startup banner, dispatches to the execution mode selected by the
# script switches, removes the temporary log file and exits with 0. Any
# unhandled exception is logged (and emailed when reporting is enabled) and
# the script exits with 1.
try {
    Write-Log "=======================================================" -Level "INFO"
    Write-Log "Auto Failover Monitor v1.2 with Simplified Email Reporting" -Level "INFO"
    Write-Log "Environment: $Env, Error Threshold: $ErrorThreshold/$TimeWindowMinutes min, Cooldown: $CooldownPeriodMinutes min" -Level "INFO"
    Write-Log "TTL Settings: Default: $DefaultTTLMinutes min, Reduced: $ReducedTTLMinutes min" -Level "INFO"

    # Log email settings if email reporting is enabled
    if ($SendEmailReport) {
        Write-Log "Email reporting enabled - Sending to: $($EmailTo -join ', ')" -Level "INFO"
    }
    Write-Log "=======================================================" -Level "INFO"

    # Handle different execution modes
    if ($Initialize) {
        Initialize-Monitoring
    }
    elseif ($GenerateReportOnly) {
        Write-Log "Generate report only mode - creating report from existing logs" -Level "INFO"
        $activeServer = Get-CurrentActiveServer
        Invoke-FailoverReporting -EventType "GenerateReportOnly"
    }
    elseif ($RunAsService) {
        Start-MonitoringService
    }
    elseif ($ForceFailover) {
        Write-Log "Force failover mode enabled - will trigger failover regardless of error state" -Level "WARNING"
        Invoke-MonitoringCheck
    }
    elseif ($TestMode) {
        Write-Log "Test mode enabled - no actual failover will be performed" -Level "WARNING"
        Invoke-MonitoringCheck
    }
    else {
        # Run a single monitoring check
        Invoke-MonitoringCheck
    }

    # Clean up temporary log file (guard against an unset path: Test-Path rejects $null)
    if ($script:tempLogPath -and (Test-Path -Path $script:tempLogPath)) {
        Remove-Item -Path $script:tempLogPath -Force -ErrorAction SilentlyContinue
    }
    exit 0
}
catch {
    Write-Log "Unhandled exception in Auto Failover Monitor: $_" -Level "ERROR"
    Write-Log $_.ScriptStackTrace -Level "ERROR"

    # Send email report for critical error if email reporting is enabled
    if ($SendEmailReport) {
        Write-Log "Attempting to send error report via email" -Level "INFO"
        try {
            # Invoke-FailoverReporting declares no -Environment parameter (it reads
            # $Env itself); passing one raised a parameter-binding error, so the
            # critical-error email was never sent.
            Invoke-FailoverReporting -EventType "CriticalError"
        }
        catch {
            Write-Log "Failed to send error report: $_" -Level "ERROR"
        }
    }

    # Clean up temporary log file (guard against an unset path: Test-Path rejects $null)
    if ($script:tempLogPath -and (Test-Path -Path $script:tempLogPath)) {
        Remove-Item -Path $script:tempLogPath -Force -ErrorAction SilentlyContinue
    }
    exit 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment