Last active
October 10, 2016 09:34
-
-
Save djenriquez/68ed05b648cbf7d5897e to your computer and use it in GitHub Desktop.
AWS ECS Container Autoscale Lambda function: Create a CloudWatch CPU metric for high and low alarms. Create SNS topics to trigger from each alarm. Subscribe the lambda function to the SNS topics. Let magic happen.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//Emitted once per container cold start, when the module is first loaded
console.log('Loading event');
//Import AWS-SDK
var AWS = require('aws-sdk');
//Fetch the assigned cpu requirements for a task definition
//taskDefinitionData is the response object of ecs.describeTaskDefinition
//Returns the sum of the cpu values of every container in the task definition,
//since placing one task requires room for all of its containers.
//Containers without a cpu value contribute 0.
var fetchRequiredCPU = function(taskDefinitionData)
{
    var containers = taskDefinitionData.taskDefinition.containerDefinitions || [];
    var totalCpu = 0;
    for(var i = 0; i < containers.length; i++)
    {
        totalCpu += containers[i].cpu || 0;
    }
    return totalCpu;
};
//Fetch the assigned memory requirements for a task definition
//taskDefinitionData is the response object of ecs.describeTaskDefinition
//Returns the sum of the memory values of every container in the task definition,
//since placing one task requires room for all of its containers.
//Containers without a memory value contribute 0.
//NOTE(review): containers that only declare memoryReservation are counted as 0 here - confirm whether that is acceptable
var fetchRequiredMemory = function(taskDefinitionData)
{
    var containers = taskDefinitionData.taskDefinition.containerDefinitions || [];
    var totalMemory = 0;
    for(var i = 0; i < containers.length; i++)
    {
        totalMemory += containers[i].memory || 0;
    }
    return totalMemory;
};
//Parse the Alarm reason and retrieve the alarm CPU util using regex
//newStateReason is the NewStateReason text of the CloudWatch alarm, e.g.
//"Threshold Crossed: 1 datapoint (85.5) was greater than or equal to the threshold (80.0)."
//Returns the first parenthesised value truncated to an integer (85 for the example above)
//Throws an Error when the reason text does not have the expected shape
var fetchUtilFromReason = function(newStateReason)
{
    var valueRegex = /Threshold Crossed: \d+ data\S+ \(([0-9\.E-]+)\)/;
    var match = valueRegex.exec(newStateReason);
    console.log("Alarm CPUUtil:", match);
    if(match === null)
    {
        //Fail with a descriptive error instead of dereferencing null
        throw new Error("Unable to find the alarm CPU utilization in alarm reason: " + newStateReason);
    }
    //"| 0" truncates the parsed float to an integer
    return parseFloat(match[1]) | 0;
};
//Parse the Alarm reason and retrieve the CPU util goal using regex
//newStateReason is the NewStateReason text of the CloudWatch alarm, e.g.
//"Threshold Crossed: 1 datapoint (85.5) was greater than or equal to the threshold (80.0)."
//Returns the last parenthesised value (the threshold) truncated to an integer (80 for the example above)
//Throws an Error when the reason text does not have the expected shape
var fetchDesiredUtilFromReason = function(newStateReason)
{
    var valueRegex = /Threshold Crossed: \d+ data\S+ \(([0-9\.E-]+)\).+\(([0-9\.]+)\)/;
    var match = valueRegex.exec(newStateReason);
    console.log("Desired CPUUtil:", match);
    if(match === null)
    {
        //Fail with a descriptive error instead of dereferencing null
        throw new Error("Unable to find the CPU utilization threshold in alarm reason: " + newStateReason);
    }
    //Group 2 is the threshold; "| 0" truncates the parsed float to an integer
    return parseFloat(match[2]) | 0;
};
//Use logic to figure out desired count of containers
//desiredCPUUtil is the CPU utilization the service should settle at (the alarm threshold)
//serviceData contains the service definition (only runningCount is read)
//ecsServiceName is the name of the ECS service (currently unused, kept for callers)
//alarmCPUUtil is the CPU utilization that triggered the alarm
//Returns the container count, never lower than 1
var calculateDesiredCount = function(desiredCPUUtil, serviceData, ecsServiceName, alarmCPUUtil)
{
    //Container Count = (#running containers * cpuUtil) / desiredCPUUtil
    var runningCount = serviceData.runningCount;
    var exactCount = (runningCount * alarmCPUUtil) / desiredCPUUtil;
    //Missing counts or a zero threshold give NaN/Infinity; fall back to the minimum of 1
    if(!isFinite(exactCount))
    {
        return 1;
    }
    //Round away from the current load: up when utilization is above the goal,
    //so a high alarm always adds at least one container (1 * 95 / 80 -> 2, not 1),
    //and down when it is below the goal.
    var desiredCount;
    if(alarmCPUUtil > desiredCPUUtil)
    {
        desiredCount = Math.ceil(exactCount);
    }
    else
    {
        desiredCount = Math.floor(exactCount);
    }
    if(desiredCount <= 0)
    {
        return 1;
    }
    return desiredCount;
};
//Callback for ecs.updateService
//err is the error reported for the call (falsy on success), data is the response
//The Lambda context is not in scope at module level, so a failure is logged and
//rethrown instead of calling context.fail(); this surfaces the original error.
var updateServiceDefinitionResp = function(err, data)
{
    if (err)
    {
        console.log(err);
        throw err;
    }
    console.log("Successfully updated service definition:", data);
};
//Callback for cloudwatch.setAlarmState
//err is the error reported for the call (falsy on success), data is the response
//The Lambda context is not in scope at module level, so a failure is logged and
//rethrown instead of calling context.fail(); this surfaces the original error.
var changeAlarmStateResp = function(err, data)
{
    if (err)
    {
        console.log(err);
        throw err;
    }
    console.log("Successfully changed alarm state:", data);
};
exports.handler = function(event, context) { | |
var ecsService = ""; | |
var ecsCluster = ""; | |
var message = JSON.parse(event.Records[0].Sns.Message); | |
console.log("Received message:", message); | |
var msgDimensions = message.Trigger.Dimensions; | |
var ecsRegion = message.Region.toLowerCase(); | |
var alarmCPUUtil = fetchUtilFromReason(message.NewStateReason); | |
var desiredCPUUtil = fetchDesiredUtilFromReason(message.NewStateReason); | |
var ecs = new AWS.ECS({region: ecsRegion}); | |
var cws = new AWS.CloudWatch({region: ecsRegion}); | |
//Fetch ECS Service name and ECS Cluster name | |
for(var i = 0; i < msgDimensions.length; i++) | |
{ | |
switch(msgDimensions[i].name) | |
{ | |
case "ServiceName": | |
ecsService = msgDimensions[i].value; | |
console.log("Parsing service:", ecsService); | |
break; | |
case "ClusterName": | |
ecsCluster = msgDimensions[i].value; | |
console.log("Parsing cluster:", ecsCluster); | |
break; | |
default: | |
break; | |
} | |
} | |
//If unable to get ECS service, log failure | |
if(ecsService == "" || ecsCluster == "") | |
{ | |
console.log("Unable to retrieve service name and/or cluster name from SNS event: ", message); | |
context.fail("Unable to retrieve service name and/or cluster name from SNS event: ", message); | |
} | |
//Grab service metadata | |
ecs.describeServices({services:[ecsService], cluster: ecsCluster}, function(err, data) | |
{ | |
if (err) | |
{ | |
console.log("Unable to retrieve service definition for:", ecsService); | |
context.fail(err, err.stack); | |
} | |
else | |
{ | |
var serviceDefinition = data.services[0]; | |
console.log("Retrieved service definition:", serviceDefinition); | |
var serviceCpuReq = 0; | |
var serviceMemReq = 0; | |
//Grab task definition required CPU and Memory | |
ecs.describeTaskDefinition({taskDefinition: serviceDefinition.taskDefinition}, function(err2, taskDefinition) | |
{ | |
if (err2) | |
{ | |
console.log("Unable to retrieve task definition for:", serviceDefinition.taskDefinition, err2, err2.stack); | |
context.fail(err2, err2.stack); | |
} | |
else | |
{ | |
console.log("Retrieved task definition:", taskDefinition); | |
serviceCpuReq = fetchRequiredCPU(taskDefinition); | |
serviceMemReq = fetchRequiredMemory(taskDefinition); | |
} | |
}); | |
//Find desired count required to equalize load | |
var desiredCount = calculateDesiredCount(desiredCPUUtil, serviceDefinition, ecsService, alarmCPUUtil); | |
console.log("Calculated desired count of", desiredCount); | |
//Calculate the total required cluster resources | |
var clusterReqCPU = serviceCpuReq * desiredCount; | |
var clusterReqMemory = serviceMemReq * desiredCount; | |
//Verify that any instance has enough resources | |
//Fetch all instances ARNs for this cluster | |
ecs.listContainerInstances({cluster: ecsCluster}, function(err3, instance_arns) | |
{ | |
if (err3) | |
{ | |
console.log("Unable to fetch container instances: ", err3); | |
} | |
else | |
{ | |
console.log("Retrieved container instance ARNs:", instance_arns); | |
//Fetch instance metadata | |
ecs.describeContainerInstances({containerInstances: instance_arns.containerInstanceArns, cluster: ecsCluster}, function(err4, instance_metadata) | |
{ | |
if(err4) | |
{ | |
console.log("Unable to describe container instances: ", err4, err); | |
} | |
else | |
{ | |
console.log("Retrieved container instance data: ", instance_metadata); | |
var clusterCPU = 0; | |
var clusterMemory = 0; | |
var instanceCPU = 0; | |
var instanceMemory = 0; | |
var isThereEnoughResources = false; | |
//loop through each instance and check if any have | |
//enough resources to run the service | |
for(var i = 0; i < instance_metadata.containerInstances.length; i++) | |
{ | |
resources = instance_metadata.containerInstances[i].remainingResources; | |
for(var j = 0; j < resources.length; j++) | |
{ | |
switch(resources[j].name) | |
{ | |
case "CPU": | |
instanceCPU = resources[j].integerValue; | |
console.log(instance_metadata.containerInstances[i].containerInstanceArn, "CPU:", instanceCPU); | |
break; | |
case "MEMORY": | |
instanceMemory = resources[j].integerValue; | |
console.log(instance_metadata.containerInstances[i].containerInstanceArn, "Memory:", instanceMemory); | |
break; | |
default: | |
break; | |
} | |
if(instanceCPU >= serviceCpuReq && instanceMemory >= serviceMemReq) | |
{ | |
clusterCPU = clusterCPU + instanceCPU; | |
clusterMemory = clusterMemory + instanceMemory; | |
} | |
} | |
//If there are not enough resources, maximize the cluster and let the ASG increment the instance count | |
if(clusterCPU < clusterReqCPU || clusterMemory < clusterReqMemory) | |
{ | |
//Maximize the cluster | |
var maxContainersByCPU = (clusterCPU / instanceCPU) | 0; | |
var maxContainersByMemory = (clusterMemory / instanceMemory) | 0; | |
if(maxContainersByCPU < maxContainersByMemory) | |
desiredCount = maxContainersByCPU; | |
else | |
desiredCount = maxContainersByMemory; | |
} | |
//Call API to update service with new desired count | |
ecs.updateService({service: ecsService, cluster: ecsCluster, desiredCount: desiredCount, taskDefinition: serviceDefinition.taskDefinition}, updateServiceDefinitionResp); | |
console.log("Incrementing service", ecsService, "on cluster", ecsCluster, "task count to", desiredCount, " Task definition ARN:", serviceDefinition.taskDefinition); | |
//Call cloudwatch alarm state service to temporarily disable the alarm | |
cws.setAlarmState({AlarmName: message.AlarmName, StateReason: 'Temporarily disabling for container autoscaling script', StateValue: 'OK'}, changeAlarmStateResp); | |
console.log("Success."); | |
context.succeed(); | |
} | |
} | |
}); | |
} | |
}); | |
} | |
}); | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment