Created
December 18, 2015 14:01
-
-
Save ezeeetm/1f2e72b8a68062ce9ba9 to your computer and use it in GitHub Desktop.
blueGreenDeploy.ps1
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<# | |
.SYNOPSIS | |
Command line utility for initiating blue-green deployments, to be called by a build step, or by another script, in a Jenkins job. | |
.PARAMETER region | |
Optional/has default value. Specifies the AWS region, e.g. 'us-east-1'. See script Param section for allowed values. | |
.PARAMETER environment | |
Required. Specifies a friendly name for the environment, e.g. 'dev'. See script Param section for allowed values. | |
.PARAMETER product | |
Required. Specifies a friendly name for a product, e.g. 'fms'. See script Param section for allowed values. | |
.PARAMETER uid | |
Required. Specifies the uid for the product stack, e.g. 'uaa'. | |
.PARAMETER zip_name | |
Required. Specifies the value to apply to the 'deploymentZip' tag, to bind the new ASG to a single app version, e.g. 'dev-lms-49-2015-10-28.zip'. | |
.PARAMETER accessKey | |
Optional. Specifies an AWS access key to use for running locally/testing. | |
.PARAMETER secretKey | |
Optional. Specifies an AWS secret key to use for running locally/testing. | |
.EXAMPLE | |
.\deploy.ps1 -environment dev -product fms -uid uaa -zip_name dev-lms-49-2015-10-28.zip | |
.NOTES | |
DEPENDS: AWS Tools for Windows PowerShell: 3.1.23.0 or greater | |
.TODO | |
- add granular $rollBack levels and modify RollBack function to handle them | |
- make detachment from live elb conditional in RollBack | |
- make check for new scaling events occur during each CheckInService iteration, so new values can be applied as early as possible | |
#> | |
# Script parameters. All ValidateSet checks are case-sensitive (IgnoreCase = $false).
Param
(
    # AWS region to operate in; optional, defaults to us-east-1.
    [Parameter(Mandatory = $False)]
    [ValidateSet(
        "us-east-1", "us-west-1", "us-west-2",
        "ap-northeast-1", "ap-southeast-1", "ap-southeast-2",
        "eu-central-1", "eu-west-1", "sa-east-1",
        IgnoreCase = $false
    )]
    [string]$region = "us-east-1",

    # Friendly environment name; used as the first segment of ASG/ELB/stack names.
    [Parameter(Mandatory = $True)]
    [ValidateSet("dev", "test", "testfull", "preview", "prod", IgnoreCase = $false)]
    [string]$environment,

    # Friendly product name; used as the second segment of ASG/ELB/stack names.
    [Parameter(Mandatory = $True)]
    [ValidateSet("fms", "lms", "cms", "none", IgnoreCase = $false)]
    [string]$product,

    # uid of the product stack; appears in the live ELB name and the template bucket name.
    [Parameter(Mandatory = $True)]
    [string]$uid,

    # Zip file name written (prefixed with "$product/") to the 'deploymentZip' ASG tag.
    [Parameter(Mandatory = $True)]
    [string]$zip_name,

    # Optional AWS access key; only used when $secretKey is supplied as well.
    [Parameter(Mandatory = $False)]
    [string]$accessKey,

    # Optional AWS secret key; only used when $accessKey is supplied as well.
    [Parameter(Mandatory = $False)]
    [string]$secretKey
)
# Pretty logging for Jenkins console output.
# Writes one line to the host: "<HH:mm:ss> <logLevel><tabs><logMessage>".
#   $indentLevel - number of tab characters inserted between the level and the message
#   $logLevel    - free-form level label, e.g. INFO / WARN / FATAL
#   $logMessage  - text to print
function Log ($indentLevel, $logLevel, $logMessage)
{
    try
    {
        # "HH" is the 24-hour clock. The previous "hh" is a 12-hour clock and, with no
        # AM/PM designator in the format, made morning and evening timestamps identical.
        $now = Get-Date -Format "HH:mm:ss"
        $indent = "`t" * $indentLevel
        Write-Host $now $logLevel$indent$logMessage
    }
    Catch [Exception]
    {
        # Deliberately NOT routed through HandleException: that function calls Log,
        # so a persistent failure here would recurse without bound. Throw the same
        # message text HandleException would have produced instead.
        $exMsg = $_.Exception.Message
        throw "Exception in Log: $exMsg, exiting."
    }
}
# Logs the HTTP status and request id of the most recent AWS call recorded in
# $AWSHistory, so that cmdlets which return nothing still leave a trace in the log.
#   $indentLevel - indentation passed straight through to Log
# The line is logged as INFO when the status equals "OK" and as WARN otherwise.
function awsLog ($indentLevel)
{
    try
    {
        $lastResponse = $AWSHistory.LastServiceResponse
        $requestId = $lastResponse.ResponseMetadata.RequestId
        $requestStatusCode = $lastResponse.HttpStatusCode

        if ($requestStatusCode -eq "OK")
        {
            $severity = "INFO"
        }
        else
        {
            $severity = "WARN"
        }

        Log $indentLevel $severity "AWS response: $requestStatusCode / Request ID: $requestId"
    }
    Catch [Exception]
    {
        HandleException "awsLog" $_.Exception.Message
    }
}
# Central fatal-error handler: logs a FATAL line and then throws.
#   $function         - name of the function reporting the failure (used in the message)
#   $exceptionMessage - text describing what went wrong
# Throws a string of the form "Exception in <function>: <message>, exiting.".
# Because this throws, a caller that invokes it from inside its own try block will
# have its catch block run as well, and the message gets wrapped once per level.
function HandleException($function,$exceptionMessage)
{
    $msg = "Exception in $($function): $exceptionMessage, exiting."
    Log 1 FATAL $msg
    # The throw is what terminates the script when nothing upstream catches it.
    # The 'exit 1' that used to follow this line could never execute and was removed.
    # NOTE(review): confirm the Jenkins step treats an unhandled terminating error as a failed build.
    throw $msg
}
# Resets the AWS PowerShell session and selects the region to work in.
#   $region - AWS region name handed to Set-DefaultAWSRegion
# Any failure is reported through HandleException.
function Setup ($region)
{
    try
    {
        # Drop stored defaults and credentials first, so that either the Jenkins role
        # or explicitly passed keys are what end up being used.
        Clear-AWSDefaults
        Clear-AWSCredentials

        Set-DefaultAWSRegion -Region $region

        Log 2 INFO "script environment ready"
    }
    Catch [Exception]
    {
        HandleException "Setup" $_.Exception.Message
    }
}
# Pre-flight check: verifies the stack for $environment/$product is in a state where a
# blue-green deployment can start. Reads the script-level $environment, $product and $uid.
# Checks performed:
#   - at most one ASG has a nonzero DesiredCapacity
#   - at most one ASG carries the tag status = 'active'
#   - no leftover temp-elb CFN stack (one stuck in a DELETE_* state is cleaned up first)
#   - no leftover temp ELB
#   - every instance behind the live ELB is InService
# Calls HandleException (which throws) when any check fails.
function CheckStackState
{
    $isBad = $false
    try
    {
        # @(...) keeps the result an array so .Count is reliable for 0 or 1 matches
        $asgs = @(Get-ASAutoScalingGroup | where {$_.AutoScalingGroupName.StartsWith("$environment-$product")})

        $liveAsgCount = @($asgs | where {$_.DesiredCapacity -ne 0}).Count
        if ($liveAsgCount -gt 1)
        {
            Log 2 FATAL "there are $liveAsgCount ASGs with nonzero desired capacity for product $product in environment $environment"
            $isBad = $true
        }

        # Look at the 'status' tag specifically, the same way GetAsg does. The earlier
        # Tags.value.Contains("active") test matched any tag, did a substring match
        # (so "inactive" counted) when an ASG had a single tag, and threw on an ASG with no tags.
        $activeAsgCount = @($asgs | where {
            @($_.Tags | where {($_.Key -eq "status") -and ($_.Value -eq "active")}).Count -gt 0
        }).Count
        if ($activeAsgCount -gt 1)
        {
            Log 2 FATAL "there are $activeAsgCount ASGs with state = 'active' for product $product in environment $environment"
            $isBad = $true
        }

        # Handle a leftover temp-elb stack BEFORE looking for the temp ELB itself, so a
        # successful cleanup clears both without having to overwrite $isBad
        # (overwriting it used to discard failures recorded by the checks above).
        $tempElbStack = Get-CFNStack | where {$_.StackName -eq "$environment-$product-temp-elb"}
        if ($tempElbStack)
        {
            # very rarely, DeleteTempElb at the end of the script results in DELETE_FAILED
            # in those cases, we'll just delete it again.
            # including "DELETE_IN_PROGRESS" here so back-to-back deployments function correctly as well
            $tempStackStatus = $tempElbStack.StackStatus.Value
            if (($tempStackStatus -eq "DELETE_FAILED") -or ($tempStackStatus -eq "DELETE_IN_PROGRESS"))
            {
                Log 2 WARN "CFN stack $environment-$product-temp-elb is in state $tempStackStatus from previous deployment, cleaning this up"
                # DeleteTempElb "recheck" returns $true when the stack could not be removed
                if (DeleteTempElb "recheck")
                {
                    Log 2 FATAL "CFN stack $environment-$product-temp-elb could not be deleted"
                    $isBad = $true
                }
            }
            else
            {
                Log 2 FATAL "CFN stack $environment-$product-temp-elb is in an unexpected state"
                $isBad = $true
            }
        }

        $tempElb = Get-ELBLoadBalancer | where {$_.LoadBalancerName -eq "$environment-$product-elb-temp"}
        if ($tempElb)
        {
            Log 2 FATAL "at least one ELB with name $environment-$product-elb-temp already exists"
            $isBad = $true
        }

        # check that current live ASG instances are all InService
        $instanceStates = Get-ELBInstanceHealth -LoadBalancerName "$environment-$product-elb-$uid" | select State
        $notInService = @($instanceStates | where {$_.State -ne "InService"})
        if ($notInService.Count -gt 0)
        {
            Log 2 FATAL "elb for active ASG $environment-$product-elb-$uid has one or more instances not InService."
            $isBad = $true
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "CheckStackState" $exMsg
    }

    # Reported outside the try block: HandleException throws, and raising it inside
    # the try made the catch above wrap and log the same failure a second time.
    if ($isBad)
    {
        Log 2 FATAL "stack is not in acceptable state for deployment"
        Log 2 FATAL "is a previous deployment still in progress / teardown failed?"
        HandleException "CheckStackState" "FAILED"
    }
    Log 2 INFO "stack ready for deployment"
}
# Returns the first ASG whose name starts with "$environment-$product" and whose
# 'status' tag equals $status; returns nothing when no ASG matches.
#   $status - expected value of the 'status' tag (the script passes active / inactive)
# Reads the script-level $environment and $product.
function GetAsg ($status)
{
    try
    {
        # buffer to allow previous updates to asg values to propagate in AWS before making back-to-back calls
        sleep -Seconds 3

        # the list is fetched on every call on purpose, so tag changes are picked up
        $namePrefix = "$environment-$product"
        $candidates = Get-ASAutoScalingGroup | where {$_.AutoScalingGroupName.StartsWith($namePrefix)}

        foreach ($candidate in $candidates)
        {
            $statusTagValue = ($candidate.Tags | where {$_.Key -eq "status"}).Value
            if (-not ($statusTagValue -eq $status))
            {
                continue
            }
            return $candidate
        }
    }
    Catch [Exception]
    {
        HandleException "GetAsg" $_.Exception.Message
    }
}
# Brings the inactive ASG's MaxSize, MinSize and DesiredCapacity in line with the
# active ASG, issuing one Update-ASAutoScalingGroup call per differing value
# (MaxSize first, then MinSize, then DesiredCapacity).
#   $activeAsg   - ASG object whose sizes are the reference
#   $inactiveAsg - ASG object that gets updated
# Returns $true when at least one value was changed, otherwise $false.
function UpdateInactiveASG ($activeAsg,$inactiveAsg)
{
    try
    {
        $changed = $false
        $targetName = $inactiveAsg.AutoScalingGroupName

        if (($inactiveAsg.MaxSize) -ne ($activeAsg.MaxSize))
        {
            Log 2 INFO "updating inactive ASG MaxSize: $($inactiveAsg.MaxSize) to match active ASG MaxSize: $($activeAsg.MaxSize)"
            $maxArgs = @{ AutoScalingGroupName = $targetName; MaxSize = $activeAsg.MaxSize }
            Update-ASAutoScalingGroup @maxArgs
            awsLog 3
            $changed = $true
        }

        if (($inactiveAsg.MinSize) -ne ($activeAsg.MinSize))
        {
            Log 2 INFO "updating inactive ASG MinSize: $($inactiveAsg.MinSize) to match active ASG MinSize: $($activeAsg.MinSize)"
            $minArgs = @{ AutoScalingGroupName = $targetName; MinSize = $activeAsg.MinSize }
            Update-ASAutoScalingGroup @minArgs
            awsLog 3
            $changed = $true
        }

        if (($inactiveAsg.DesiredCapacity) -ne ($activeAsg.DesiredCapacity))
        {
            Log 2 INFO "updating inactive ASG DesiredCapacity: $($inactiveAsg.DesiredCapacity) to match active ASG DesiredCapacity: $($activeAsg.DesiredCapacity)"
            $desiredArgs = @{ AutoScalingGroupName = $targetName; DesiredCapacity = $activeAsg.DesiredCapacity }
            Update-ASAutoScalingGroup @desiredArgs
            awsLog 3
            $changed = $true
        }

        return $changed
    }
    Catch [Exception]
    {
        HandleException "UpdateInactiveASG" $_.Exception.Message
    }
}
# Sets one tag on an ASG, but only when its current value differs from the wanted one.
#   $asg      - ASG object; its Tags and AutoScalingGroupName are read
#   $tagKey   - key of the tag to set
#   $tagValue - value the tag should have
# The tag is written with PropagateAtLaunch = $true.
function UpdateAsgTag ($asg,$tagKey,$tagValue)
{
    try
    {
        $asgTags = $asg.Tags
        $asgName = $asg.AutoScalingGroupName
        $currentTagValue = ($asgTags | where {$_.Key -eq $tagKey}).Value
        if ($currentTagValue -ne $tagValue)
        {
            # ASG names and zips are long values, broken up to avoid line wrapping
            Log 2 INFO "updating tag for ASG: $asgName"
            Log 3 INFO "tag: $tagKey, currentValue: $currentTagValue, newValue: $tagValue"
            Set-ASTag -Tag @( @{ResourceType="auto-scaling-group"; ResourceId=$asgName; Key=$tagKey; Value=$tagValue; PropagateAtLaunch=$true} )
            # Record the API response like every other mutating call in this script does.
            awsLog 3
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "UpdateAsgTag" $exMsg
    }
}
# Creates the temporary ELB by launching a CFN stack named "$environment-$product-temp-elb".
# The parameters are copied from the existing "$environment-$product-$uid-elb*" stack with
# 'uidParameter' changed to "temp"; the template is read from the S3 URL built below.
# Reads the script-level $environment, $product and $uid.
# Polls every 15 seconds while the stack is CREATE_IN_PROGRESS and fails (through
# HandleException) when it ends in any state other than CREATE_COMPLETE.
function CreateTempElb
{
    try
    {
        $tempElbStackName = "$environment-$product-temp-elb"
        $cfnStatusCheckInterval = 15 #seconds

        # Fail with a clear message instead of an obscure null/array error further down.
        $elbStacks = @(Get-CFNStack | where {$_.StackName.StartsWith("$environment-$product-$uid-elb")})
        if ($elbStacks.Count -ne 1)
        {
            throw "expected exactly one CFN stack with name starting $environment-$product-$uid-elb, found $($elbStacks.Count)"
        }

        $stackParams = $elbStacks[0].Parameters
        $uidParam = $stackParams | where {$_.ParameterKey -eq "uidParameter"}
        if (-not $uidParam)
        {
            throw "CFN stack $($elbStacks[0].StackName) has no 'uidParameter' parameter"
        }
        # mutates the parameter object inside $stackParams, which is what gets submitted below
        $uidParam.ParameterValue = "temp"

        $templateUrl = "https://s3.amazonaws.com/$environment-cfntemplates-$uid/$product/elb.json"
        $resp = New-CFNStack -StackName $tempElbStackName -Parameter $stackParams -TemplateURL $templateUrl -Capabilities "CAPABILITY_IAM"
        Log 2 INFO "CFN response: $resp"

        # Wait only while creation is still running. Waiting "until CREATE_COMPLETE"
        # never returned when creation failed or was rolled back.
        do
        {
            sleep -Seconds $cfnStatusCheckInterval
            $stack = Get-CFNStack -StackName $tempElbStackName
            $stackStatus = "$($stack.StackStatus)"
            Log 2 INFO "...$stackStatus"
        }
        while ($stackStatus -eq "CREATE_IN_PROGRESS")

        if ($stackStatus -ne "CREATE_COMPLETE")
        {
            throw "CFN stack $tempElbStackName ended in state '$stackStatus' instead of CREATE_COMPLETE"
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "CreateTempElb" $exMsg
    }
}
# Attaches an ELB to, or detaches it from, an ASG and logs what was done.
#   $asg     - ASG object; only AutoScalingGroupName is used
#   $elbName - name of the load balancer
#   $action  - "mount" attaches, "dismount" detaches; any other value makes no API
#              call, but the log lines are still written
function UpdateElb ($asg,$elbName,$action)
{
    try
    {
        $groupName = $asg.AutoScalingGroupName

        switch ($action)
        {
            "mount"
            {
                Add-ASLoadBalancer -LoadBalancerName $elbName -AutoScalingGroupName $groupName
            }
            "dismount"
            {
                Dismount-ASLoadBalancer -LoadBalancerName $elbName -AutoScalingGroupName $groupName
            }
        }

        Log 2 INFO "action: $action"
        Log 2 INFO "ELB: $elbName"
        Log 2 INFO "ASG: $($asg.AutoScalingGroupName)"
        awsLog 3
    }
    Catch [Exception]
    {
        HandleException "UpdateElb" $_.Exception.Message
    }
}
# Polls an ELB every 15 seconds until the number of InService instances equals
# $expectedInstances, for at most 100 attempts. Calls RollBack when the expected
# count was never reached.
#   $elbName           - load balancer to poll
#   $expectedInstances - number of instances that must report InService
function CheckInService ($elbName,$expectedInstances)
{
    try
    {
        $retries = 1
        $elbInstanceHealthCheckInterval = 15 #seconds
        $elbInstanceHealthRetries = 100
        $allInService = $false
        do
        {
            sleep -Seconds $elbInstanceHealthCheckInterval
            $instances = Get-ELBInstanceHealth -LoadBalancerName $elbName
            # @(...) keeps .Count reliable when zero or exactly one instance matches
            $inServiceCount = @($instances | where {$_.State -eq "InService"}).Count
            $retryString = $retries.ToString("000")
            Log 2 INFO "retry $($retryString)/$($elbInstanceHealthRetries):`t$inServiceCount/$expectedInstances instances in service"
            $allInService = ($inServiceCount -eq $expectedInstances)
            $retries += 1
        }
        until
        (
            $allInService -or ($retries -gt $elbInstanceHealthRetries)
        )
        # Decide on the observed result, not on the retry counter: previously a check
        # that succeeded on the final attempt still triggered a rollback.
        if (-not $allInService)
        {
            RollBack
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "CheckInService" $exMsg
    }
}
# Suspends or resumes scaling processes on an ASG.
#   $asg    - ASG object; only AutoScalingGroupName is used
#   $action - "suspend"            : Suspend-ASProcess with no process filter
#             "resume"             : Resume-ASProcess with no process filter
#             "noCloudWatchAlarms" : suspend only the AlarmNotification process
#             any other value makes no API call
# The action and ASG name are logged before the call, the AWS response after it.
function UpdateAutoScaling ($asg,$action)
{
    try
    {
        Log 2 INFO "action: $action"
        Log 2 INFO "ASG: $($asg.AutoScalingGroupName)"

        $groupName = $asg.AutoScalingGroupName
        switch ($action)
        {
            "suspend"
            {
                Suspend-ASProcess -AutoScalingGroupName $groupName
            }
            "resume"
            {
                Resume-ASProcess -AutoScalingGroupName $groupName
            }
            "noCloudWatchAlarms"
            {
                Suspend-ASProcess -AutoScalingGroupName $groupName -ScalingProcess AlarmNotification
            }
        }

        awsLog 3
    }
    Catch [Exception]
    {
        HandleException "UpdateAutoScaling" $_.Exception.Message
    }
}
# Scales an ASG down to nothing by setting MinSize, DesiredCapacity and MaxSize to 0,
# in that order, with one Update-ASAutoScalingGroup call per value.
#   $asg - ASG object; its name and current sizes (for logging) are read
function PurgeAsg ($asg)
{
    try
    {
        $groupName = $asg.AutoScalingGroupName

        Log 2 INFO "updating inactive ASG MinSize: $($asg.MinSize) to 0"
        $minArgs = @{ AutoScalingGroupName = $groupName; MinSize = 0 }
        Update-ASAutoScalingGroup @minArgs
        awsLog 3

        Log 2 INFO "updating inactive ASG DesiredCapacity: $($asg.DesiredCapacity) to 0"
        $desiredArgs = @{ AutoScalingGroupName = $groupName; DesiredCapacity = 0 }
        Update-ASAutoScalingGroup @desiredArgs
        awsLog 3

        Log 2 INFO "updating inactive ASG MaxSize: $($asg.MaxSize) to 0"
        $maxArgs = @{ AutoScalingGroupName = $groupName; MaxSize = 0 }
        Update-ASAutoScalingGroup @maxArgs
        awsLog 3
    }
    Catch [Exception]
    {
        HandleException "PurgeAsg" $_.Exception.Message
    }
}
# Deletes the CFN stack "$environment-$product-temp-elb" (reads the script-level
# $environment and $product).
#   $recheck - "recheck": block and poll every 30 seconds (up to 20 checks) until the
#              stack is gone, re-issuing the delete whenever it reports DELETE_FAILED.
#              Any other value: fire the delete and return immediately with no value.
# Returns (recheck mode only) $false when the stack is gone, $true when it still
# exists after the last check. CheckStackState uses this as its "is bad" indicator.
function DeleteTempElb ($recheck)
{
    try
    {
        $tempElbStackName = "$environment-$product-temp-elb"
        Log 2 INFO "deleting CFN stack $tempElbStackName"
        Remove-CFNStack -StackName $tempElbStackName -Force
        awsLog 3
        if ($recheck -eq "recheck")
        {
            $retryThreshold = 20 #10m @ 30s. Usually takes 2m - 5m
            # Initialised explicitly: an unset $retries would otherwise be looked up in
            # the caller's scope and could start from someone else's counter.
            $retries = 0
            do
            {
                # exact name match, same as CheckStackState, so similarly named stacks are ignored
                $tempElbStack = Get-CFNStack | where {$_.StackName -eq $tempElbStackName}
                if ($tempElbStack)
                {
                    Log 3 INFO "...$($tempElbStack.StackStatus.Value)"
                    if ($($tempElbStack.StackStatus.Value) -eq "DELETE_FAILED")
                    {
                        Log 3 INFO "...(we'll see about that)"
                        Remove-CFNStack -StackName $tempElbStackName -Force
                    }
                    # only wait when there is still something to wait for
                    sleep -Seconds 30
                }
                $retries += 1
            }
            until
            (
                (-not $tempElbStack) -or ($retries -ge $retryThreshold)
            )
            if (-not $tempElbStack)
            {
                Log 3 INFO "...DELETED"
                return $false #sets $isBad = $false in CheckStackState because tempElbStack is now gone.
            }
            Log 3 WARN "CFN stack $tempElbStackName still exists after $retries checks"
            return $true #sets $isBad = $true in CheckStackState because it refused to die. This should never happen.
        }
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "DeleteTempElb" $exMsg
    }
}
# Undoes an in-flight deployment after instances failed to come InService, then ends
# the script with exit code 1. Reads the script-level $environment, $product and $uid.
# Steps, in order:
#   1. resume scaling processes on both ASGs
#   2. scale the inactive (new) ASG down to 0
#   3. detach the temp ELB from the inactive ASG
#   4. detach the live ELB from the inactive ASG, if it is attached
#   5. request deletion of the temp ELB stack without waiting for it
function RollBack
{
    try
    {
        Log 1 WARN "One or more instances unhealthy after retry threshold was reached. Rolling back."
        $inactiveAsg = GetAsg inactive
        $activeAsg = GetAsg active
        UpdateAutoScaling $inactiveAsg resume
        UpdateAutoScaling $activeAsg resume
        PurgeAsg ($inactiveAsg)
        $tempElbName = "$environment-$product-elb-temp"
        UpdateElb $inactiveAsg $tempElbName dismount
        # A rollback during the live-ELB health check happens after the inactive ASG
        # was attached to the live ELB. Detach it again so the next deployment does not
        # start with the inactive ASG already behind the live ELB. The dismount throws
        # when the ELB is not attached, hence the check against the ASG's current
        # LoadBalancerNames (fetched by GetAsg above).
        $activeElbName = "$environment-$product-elb-$uid"
        if ($inactiveAsg.LoadBalancerNames -contains $activeElbName)
        {
            UpdateElb $inactiveAsg $activeElbName dismount
        }
        DeleteTempElb "noRecheck"
        Log 1 INFO "Rollback complete, script exiting"
        # "ROLLBACK" | Out-File -Append -FilePath .\deployTestLog.csv # FOR TESTING
        exit 1
    }
    Catch [Exception]
    {
        $exMsg = $_.Exception.Message
        HandleException "RollBack" $exMsg
    }
}
# --------------------------------- script entry point ---------------------------------
# Overall flow: verify stack state -> clone the live ELB into a temp ELB -> bring the
# inactive ASG up behind the temp ELB -> attach it to the live ELB -> swap the status
# tags -> detach and scale down the old ASG -> delete the temp ELB.

# setup (clears stored AWS defaults/credentials and sets the default region)
Log 1 INFO "Setup:"
Setup $region

# if keys are passed, use them. Both are needed; a single key is ignored.
if (($accessKey) -and ($secretKey))
{
    Initialize-AWSDefaults -AccessKey $accessKey -SecretKey $secretKey -Region $region
}
elseif (($accessKey) -or ($secretKey))
{
    Log 2 WARN "only one of accessKey/secretKey was supplied, ignoring it"
}

# ensure stack is in good state for a deployment
Log 1 INFO "CheckStackState:"
CheckStackState

# get asgs
Log 1 INFO "GetAsgs:"
$activeAsg = GetAsg active
Log 2 INFO "active ASG: $($activeAsg.AutoScalingGroupName)"
$inactiveAsg = GetAsg inactive
Log 2 INFO "inactive ASG: $($inactiveAsg.AutoScalingGroupName)"

# GetAsg returns nothing when no ASG carries the requested status tag. Stop here,
# before any AWS resource is created or modified, rather than failing half-way through.
if (-not $activeAsg)
{
    HandleException "GetAsgs" "no ASG with tag status = 'active' found for $environment-$product"
}
if (-not $inactiveAsg)
{
    HandleException "GetAsgs" "no ASG with tag status = 'inactive' found for $environment-$product"
}

# clone active elb into a temp elb and attach to inactive asg to satisfy health check
# this is done like this because each ELB is ~$20/month
Log 1 INFO "CreateTempElb:"
CreateTempElb

# mount temp elb on inactive asg
$tempElbName = "$environment-$product-elb-temp"
Log 1 INFO "UpdateElb:"
UpdateElb $inactiveAsg $tempElbName mount

# disable autoscaling triggered by low CPU/mem/network CW alarms on inactive asg
# to prevent unwanted scale-down events while asg is waiting for instances to come in service
Log 1 INFO "UpdateAutoScaling:"
UpdateAutoScaling $inactiveAsg noCloudWatchAlarms

# update inactive asg with new deployment.zip tag and same des/min/max as active asg
Log 1 INFO "UpdateInactiveASG:"
UpdateAsgTag $inactiveAsg deploymentZip "$product/$zip_name"
$wasUpdated = UpdateInactiveASG $activeAsg $inactiveAsg

# ensure all instances behind temp elb are inService
# if activeAsg scales during this check, adjust and recheck before continuing
Log 1 INFO "CheckInService on ELB $($tempElbName):"
do
{
    $inactiveAsg = GetAsg inactive # to refresh DesiredCapacity
    $expectedInstances = $inactiveAsg.DesiredCapacity
    CheckInService $tempElbName $expectedInstances
    $activeAsg = GetAsg active # to refresh desired/min/max values in case there was a scaling event during previous CheckInService
    Log 1 INFO "Checking for min/max/desired changes caused by new scaling events in active ASG"
    $wasUpdated = UpdateInactiveASG $activeAsg $inactiveAsg
    if (!$wasUpdated)
    {
        Log 1 INFO "No new scaling events found, continuing with deployment"
    }
}
until
(
    $wasUpdated -eq $False
)

# suspend scaling on active asg, not done earlier because time window required by CheckInService is too long
# to disable autoscaling. Done here to ensure scaling events from this point forward do not interfere w deployment.
# since elb health checks are identical, this window is always very brief ( ~30 seconds in testing)
Log 1 INFO "UpdateAutoScaling:"
UpdateAutoScaling $activeAsg suspend

# mount active elb on inactiveAsg
$activeElbName = "$environment-$product-elb-$uid"
Log 1 INFO "UpdateElb:"
UpdateElb $inactiveAsg $activeElbName mount

# ensure all instances behind active elb are inService (old and new ASG together)
Log 1 INFO "CheckInService on ELB $($activeElbName):"
$expectedInstances = $inactiveAsg.DesiredCapacity + $activeAsg.DesiredCapacity
CheckInService $activeElbName $expectedInstances

# update asg status tags, and refresh asg objects
# from here on $activeAsg is the newly deployed ASG and $inactiveAsg the previous one
Log 1 INFO "UpdateAsgTag:"
UpdateAsgTag $inactiveAsg status active
UpdateAsgTag $activeAsg status inactive
$activeAsg = GetAsg active
$inactiveAsg = GetAsg inactive

# turn autoscaling back on for activeAsg
# turn autoscaling back on for inactiveAsg so it's in a consistent state for next deployment
Log 1 INFO "UpdateAutoScaling:"
UpdateAutoScaling $activeAsg resume
UpdateAutoScaling $inactiveAsg resume

# dismount active elb from the inactiveAsg, and temp elb from the activeAsg
Log 1 INFO "UpdateElb:"
UpdateElb $inactiveAsg $activeElbName dismount
UpdateElb $activeAsg $tempElbName dismount

# set inactiveAsg des/min/max to 0
Log 1 INFO "PurgeAsg:"
PurgeAsg $inactiveAsg

# delete temp elb stack
# noRecheck keeps it from blocking in Jenkins
Log 1 INFO "DeleteTempElb:"
DeleteTempElb "noRecheck"

# have a nice day!
Log 1 INFO "Deployment complete, script exiting"
#"SUCCESS" | Out-File -Append -FilePath .\deployTestLog.csv # FOR TESTING
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment