Skip to content

Instantly share code, notes, and snippets.

@adamcousins
Created August 11, 2020 06:45
Show Gist options
  • Save adamcousins/115c4bfaa72b8978c6ca5ccbfb0b136d to your computer and use it in GitHub Desktop.
Save adamcousins/115c4bfaa72b8978c6ca5ccbfb0b136d to your computer and use it in GitHub Desktop.
ECS Cluster on EC2/ASG with CapacityProviders
# Template format version. Quoted so that generic YAML tooling keeps it as a
# string instead of typing it as a date.
AWSTemplateFormatVersion: '2010-09-09'
# Human-readable summary shown for the stack.
Description: 'Application: ECS Cluster on EC2 with ECS Capacity Providers into existing VPC.'
Parameters:
  # ---------------------------------------------------------------------------
  # Networking
  # ---------------------------------------------------------------------------
  # The three subnet parameters are SSM-typed: the value passed in is resolved
  # through SSM Parameter Store to a subnet ID. They feed the Auto Scaling
  # group's VPCZoneIdentifier.
  PrivateSubnet1Id:
    Description: Logical ID of Private Subnet 1
    Type: AWS::SSM::Parameter::Value<AWS::EC2::Subnet::Id>
  PrivateSubnet2Id:
    Description: Logical ID of Private Subnet 2
    Type: AWS::SSM::Parameter::Value<AWS::EC2::Subnet::Id>
  PrivateSubnet3Id:
    Description: Logical ID of Private Subnet 3
    Type: AWS::SSM::Parameter::Value<AWS::EC2::Subnet::Id>
  # Security group attached to every ECS host by the launch configuration.
  ECSHostSecurityGroup:
    Description: Logical ID of an EC2 Security Group
    Type: AWS::EC2::SecurityGroup::Id

  # ---------------------------------------------------------------------------
  # ECS Cluster
  # ---------------------------------------------------------------------------
  # Passed as a plain string (rather than referencing the cluster resource) so
  # the hosts' user data and tags do not depend on the cluster resource itself.
  ClusterName:
    Description: Name of the ECS Cluster. Required to break the circular dependency
    Type: String
    Default: common
  ContainerInsights:
    Description: Enable or Disable ECS Container Insights
    Type: String
    Default: enabled
    AllowedValues:
      - enabled
      - disabled
  InstanceType:
    Description: Which instance type should we use to build the ECS cluster?
    Type: String
    Default: t2.large
  # Min / Max / Desired map onto the Auto Scaling group's MinSize, MaxSize and
  # DesiredCapacity respectively.
  MinClusterSize:
    Description: How many ECS hosts do you want to initially deploy?
    Type: Number
    Default: 1
  MaxClusterSize:
    Description: How many ECS MAX hosts do you want to deploy?
    Type: Number
    Default: 4
  DesiredClusterSize:
    Description: How many ECS Desired hosts should be there in the group?
    Type: Number
    Default: 2
  # Resolved through the public SSM parameter to an AMI ID at deploy time.
  ECSAMI:
    Description: ECS-Optimized AMI ID
    Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
    Default: /aws/service/ecs/optimized-ami/amazon-linux-2/recommended/image_id

  # ---------------------------------------------------------------------------
  # Logging
  # ---------------------------------------------------------------------------
  # Used as RetentionInDays on every host log group.
  LogRetention:
    Description: Time for rotating the Logs
    Type: Number
    Default: 7

  # ---------------------------------------------------------------------------
  # Monitoring
  # ---------------------------------------------------------------------------
  # Both ARNs are required (no default). They are used as alarm actions and as
  # Auto Scaling notification targets.
  EventsTopicArn:
    Description: Events topic ARN
    Type: String
  AlertsTopicArn:
    Description: Alert topic ARN
    Type: String

  # ---------------------------------------------------------------------------
  # Scaling
  # ---------------------------------------------------------------------------
  # Feeds ManagedScaling.TargetCapacity on the capacity provider.
  CapacityProviderTargetCapacity:
    Description: Percentage of compute to target for utilisation by the Capacity Provider
    Type: String
    Default: '100'

  # ---------------------------------------------------------------------------
  # CloudWatch metric parameters
  # ---------------------------------------------------------------------------
  # The *EvaluationPeriod parameters are consumed as the alarm `Period`
  # (seconds); every alarm uses a fixed `EvaluationPeriods: 1`.
  # All values are quoted because the parameters are declared as Type: String.
  # NOTE(review): 10 and 30 are only meaningful for high-resolution metrics -
  # confirm they should be offered for the alarms that use these parameters.
  HighCpuThreshold:
    Description: High Cpu Threshold for ECS Service
    Type: String
    Default: '80'
  HighCpuEvaluationPeriod:
    Description: High Cpu Evaluation Period for ECS Service
    Type: String
    Default: '300'  # 5 minutes; 10, 30, or a multiple of 60 seconds
    AllowedValues:
      [
        '10', '30', '60', '120', '180', '240', '300', '360', '420',
        '480', '540', '600', '660', '720', '780', '840', '900', '960',
        '1020', '1080', '1140', '1200', '1260', '1320', '1380', '1440', '1500'
      ]
  LowCpuThreshold:
    Description: Low Cpu Threshold for ECS Service
    Type: String
    Default: '30'
  LowCpuEvaluationPeriod:
    Description: Low Cpu Evaluation Period for ECS Service
    Type: String
    Default: '300'  # 5 minutes; 10, 30, or a multiple of 60 seconds
    AllowedValues:
      [
        '10', '30', '60', '120', '180', '240', '300', '360', '420',
        '480', '540', '600', '660', '720', '780', '840', '900', '960',
        '1020', '1080', '1140', '1200', '1260', '1320', '1380', '1440', '1500'
      ]
  # The memory thresholds are shared by the MemoryAvailable alarms and the
  # MemoryReservation alarms.
  HighMemoryThreshold:
    Description: High Memory Threshold for ECS Service
    Type: String
    Default: '80'
  HighMemoryEvaluationPeriod:
    Description: High Memory Evaluation Period for ECS Service
    Type: String
    Default: '300'  # 5 minutes; 10, 30, or a multiple of 60 seconds
    AllowedValues:
      [
        '10', '30', '60', '120', '180', '240', '300', '360', '420',
        '480', '540', '600', '660', '720', '780', '840', '900', '960',
        '1020', '1080', '1140', '1200', '1260', '1320', '1380', '1440', '1500'
      ]
  LowMemoryThreshold:
    Description: Low Memory Threshold for ECS Service
    Type: String
    Default: '30'
  LowMemoryEvaluationPeriod:
    Description: Low Memory Evaluation Period for ECS Service
    Type: String
    Default: '300'  # 5 minutes; 10, 30, or a multiple of 60 seconds
    AllowedValues:
      [
        '10', '30', '60', '120', '180', '240', '300', '360', '420',
        '480', '540', '600', '660', '720', '780', '840', '900', '960',
        '1020', '1080', '1140', '1200', '1260', '1320', '1380', '1440', '1500'
      ]
Resources:
## ECS Cluster Resources
ECSCapacityProvider:
  # Capacity provider backed by ECSAutoScalingGroup, with ECS managed scaling
  # switched on.
  Type: AWS::ECS::CapacityProvider
  Properties:
    AutoScalingGroupProvider:
      # Points the provider at the Auto Scaling group declared in this template.
      AutoScalingGroupArn: !Ref ECSAutoScalingGroup
      # Disabled due to no support in ASG for ScaleIn Protection
      ManagedTerminationProtection: DISABLED
      ManagedScaling:
        Status: ENABLED
        # Target capacity comes from the CapacityProviderTargetCapacity
        # parameter (default '100').
        TargetCapacity: !Ref CapacityProviderTargetCapacity
        # A single scaling action may add between 1 and 10 instances.
        MinimumScalingStepSize: 1
        MaximumScalingStepSize: 10
# ECS Cluster
ECSCluster:
  # The ECS cluster itself. Its name comes from the ClusterName parameter, the
  # same value the hosts write into /etc/ecs/ecs.config in their user data.
  Type: AWS::ECS::Cluster
  Properties:
    ClusterName: !Ref ClusterName
    ClusterSettings:
      # 'enabled' or 'disabled', as constrained by the ContainerInsights
      # parameter.
      - Name: containerInsights
        Value: !Ref ContainerInsights
    # Associate the EC2-backed capacity provider and make it the default
    # strategy for the cluster.
    CapacityProviders:
      - !Ref ECSCapacityProvider
    DefaultCapacityProviderStrategy:
      - CapacityProvider: !Ref ECSCapacityProvider
        Base: 0
        Weight: 1
#ECS Cluster Auto Scaling Group
ECSAutoScalingGroup:
  # Auto Scaling group that runs the ECS container hosts across the three
  # private subnets.
  Type: AWS::AutoScaling::AutoScalingGroup
  # Creation waits up to 15 minutes for the cfn-signal sent at the end of the
  # launch configuration's user data.
  CreationPolicy:
    ResourceSignal:
      Timeout: PT15M
  # Updates roll one instance at a time, keep at least one instance in service
  # and wait for each replacement to signal.
  UpdatePolicy:
    AutoScalingRollingUpdate:
      MaxBatchSize: 1
      MinInstancesInService: 1
      PauseTime: PT15M
      WaitOnResourceSignals: true
      # Processes suspended for the duration of a rolling update.
      SuspendProcesses:
        - HealthCheck
        - ReplaceUnhealthy
        - AZRebalance
        - AlarmNotification
        - ScheduledActions
  Properties:
    LaunchConfigurationName: !Ref ECSLaunchConfiguration
    VPCZoneIdentifier:
      - !Ref PrivateSubnet1Id
      - !Ref PrivateSubnet2Id
      - !Ref PrivateSubnet3Id
    # Group sizing is driven entirely by stack parameters.
    MinSize: !Ref MinClusterSize
    MaxSize: !Ref MaxClusterSize
    DesiredCapacity: !Ref DesiredClusterSize
    # Publish group-level metrics at one-minute granularity.
    MetricsCollection:
      - Granularity: '1Minute'
        Metrics:
          - 'GroupMinSize'
          - 'GroupMaxSize'
          - 'GroupDesiredCapacity'
          - 'GroupInServiceInstances'
          - 'GroupPendingInstances'
          - 'GroupStandbyInstances'
          - 'GroupTerminatingInstances'
          - 'GroupTotalInstances'
    NotificationConfigurations:
      # Successful launches and terminations go to the events topic.
      - TopicARN: !Ref EventsTopicArn
        NotificationTypes:
          - 'autoscaling:EC2_INSTANCE_LAUNCH'
          - 'autoscaling:EC2_INSTANCE_TERMINATE'
      # Failed launches and terminations go to the alerts topic.
      - TopicARN: !Ref AlertsTopicArn
        NotificationTypes:
          - 'autoscaling:EC2_INSTANCE_LAUNCH_ERROR'
          - 'autoscaling:EC2_INSTANCE_TERMINATE_ERROR'
    # Both tags are propagated to the instances the group launches.
    Tags:
      - Key: Name
        Value: ASG-ECS-HOST
        PropagateAtLaunch: true
      - Key: ECSClusterName
        Value: !Ref ClusterName
        PropagateAtLaunch: true
#Monitoring
ECSClusterCPUScaleOutAlarm:
  # Raises an alert when the maximum EC2 CPUUtilization across the ECS hosts'
  # Auto Scaling group exceeds HighCpuThreshold within one period.
  Type: AWS::CloudWatch::Alarm
  Properties:
    # The description is built from the threshold parameter so it cannot drift
    # from the value that is actually evaluated.
    AlarmDescription: !Sub 'Alarm if maximum CPU utilization of the ECS hosts is greater than ${HighCpuThreshold}%'
    Namespace: AWS/EC2
    MetricName: CPUUtilization
    Dimensions:
      - Name: AutoScalingGroupName
        Value: !Ref ECSAutoScalingGroup
    Statistic: Maximum
    # Period in seconds, supplied by the HighCpuEvaluationPeriod parameter.
    Period: !Ref HighCpuEvaluationPeriod
    EvaluationPeriods: 1
    Threshold: !Ref HighCpuThreshold
    ComparisonOperator: GreaterThanThreshold
    # Breach notifies the alerts topic; recovery notifies the events topic.
    AlarmActions:
      - !Ref AlertsTopicArn
    OKActions:
      - !Ref EventsTopicArn
ECSClusterCPUScaleInAlarm:
  # Fires when the maximum EC2 CPUUtilization across the ECS hosts' Auto
  # Scaling group drops below LowCpuThreshold within one period.
  Type: AWS::CloudWatch::Alarm
  Properties:
    # The description is built from the threshold parameter so it matches the
    # LessThanThreshold comparison that is actually evaluated.
    AlarmDescription: !Sub 'Alarm if maximum CPU utilization of the ECS hosts is less than ${LowCpuThreshold}%'
    Namespace: AWS/EC2
    MetricName: CPUUtilization
    Dimensions:
      - Name: AutoScalingGroupName
        Value: !Ref ECSAutoScalingGroup
    Statistic: Maximum
    # Period in seconds, supplied by the LowCpuEvaluationPeriod parameter.
    Period: !Ref LowCpuEvaluationPeriod
    EvaluationPeriods: 1
    Threshold: !Ref LowCpuThreshold
    ComparisonOperator: LessThanThreshold
    # Informational only: both state changes notify the events topic.
    AlarmActions:
      - !Ref EventsTopicArn
    OKActions:
      - !Ref EventsTopicArn
# Memory Available Metric Alarm for the Cluster
MemoryAvailableTooHighAlarm:
  # Watches the custom System/Linux MemoryAvailable metric that the cwmon cron
  # job (mon-put-instance-data.pl --mem-avail --auto-scaling=only) publishes
  # for the Auto Scaling group, and alarms when it is ABOVE HighMemoryThreshold.
  # NOTE(review): alerting on *high available* memory looks inverted; the
  # original description spoke of memory utilization. Confirm whether
  # MemoryUtilization (which the cron job does not currently publish) was
  # intended.
  # NOTE(review): the metric is presumably reported in megabytes (the script's
  # default) - confirm the threshold unit.
  Type: AWS::CloudWatch::Alarm
  Properties:
    # Describes what is actually evaluated: available memory over the
    # configured period.
    AlarmDescription: !Sub 'Average available memory over the last ${HighMemoryEvaluationPeriod} seconds is above ${HighMemoryThreshold}.'
    Namespace: System/Linux
    MetricName: MemoryAvailable
    Dimensions:
      - Name: AutoScalingGroupName
        Value: !Ref ECSAutoScalingGroup
    Statistic: Average
    # Period in seconds, supplied by the HighMemoryEvaluationPeriod parameter.
    Period: !Ref HighMemoryEvaluationPeriod
    EvaluationPeriods: 1
    Threshold: !Ref HighMemoryThreshold
    ComparisonOperator: GreaterThanThreshold
    # Breach notifies the alerts topic; recovery notifies the events topic.
    AlarmActions:
      - !Ref AlertsTopicArn
    OKActions:
      - !Ref EventsTopicArn
MemoryAvailableLowAlarm:
  # Watches the custom System/Linux MemoryAvailable metric that the cwmon cron
  # job publishes for the Auto Scaling group, and alarms when it is BELOW
  # LowMemoryThreshold.
  # NOTE(review): the metric is presumably reported in megabytes (the script's
  # default) - confirm the threshold unit.
  Type: AWS::CloudWatch::Alarm
  Properties:
    # Describes what is actually evaluated: low available memory over the
    # configured period.
    AlarmDescription: !Sub 'Average available memory over the last ${LowMemoryEvaluationPeriod} seconds is below ${LowMemoryThreshold}, performance may suffer.'
    Namespace: System/Linux
    MetricName: MemoryAvailable
    Dimensions:
      - Name: AutoScalingGroupName
        Value: !Ref ECSAutoScalingGroup
    Statistic: Average
    # Period in seconds, supplied by the LowMemoryEvaluationPeriod parameter.
    Period: !Ref LowMemoryEvaluationPeriod
    EvaluationPeriods: 1
    Threshold: !Ref LowMemoryThreshold
    ComparisonOperator: LessThanThreshold
    # Informational only: both state changes notify the events topic.
    AlarmActions:
      - !Ref EventsTopicArn
    OKActions:
      - !Ref EventsTopicArn
# Memory Reservation Alarm for the Cluster
MemoryReservationHighAlarm:
  # Alarms when the cluster-level AWS/ECS MemoryReservation metric is above
  # HighMemoryThreshold for one period.
  Type: AWS::CloudWatch::Alarm
  Properties:
    # Names the metric and window that are actually evaluated.
    AlarmDescription: !Sub 'Average cluster memory reservation over the last ${HighMemoryEvaluationPeriod} seconds is above ${HighMemoryThreshold}%.'
    Namespace: AWS/ECS
    MetricName: MemoryReservation
    Dimensions:
      - Name: ClusterName
        Value: !Ref ECSCluster
    Statistic: Average
    # Period in seconds, supplied by the HighMemoryEvaluationPeriod parameter.
    Period: !Ref HighMemoryEvaluationPeriod
    EvaluationPeriods: 1
    Threshold: !Ref HighMemoryThreshold
    ComparisonOperator: GreaterThanThreshold
    # Breach notifies the alerts topic; recovery notifies the events topic.
    AlarmActions:
      - !Ref AlertsTopicArn
    OKActions:
      - !Ref EventsTopicArn
MemoryReservationLowAlarm:
  # Alarms when the cluster-level AWS/ECS MemoryReservation metric is below
  # LowMemoryThreshold for one period.
  Type: AWS::CloudWatch::Alarm
  Properties:
    # Names the metric, direction and window that are actually evaluated.
    AlarmDescription: !Sub 'Average cluster memory reservation over the last ${LowMemoryEvaluationPeriod} seconds is below ${LowMemoryThreshold}%.'
    Namespace: AWS/ECS
    MetricName: MemoryReservation
    Dimensions:
      - Name: ClusterName
        Value: !Ref ECSCluster
    Statistic: Average
    # Period in seconds, supplied by the LowMemoryEvaluationPeriod parameter.
    Period: !Ref LowMemoryEvaluationPeriod
    EvaluationPeriods: 1
    Threshold: !Ref LowMemoryThreshold
    ComparisonOperator: LessThanThreshold
    # Informational only: both state changes notify the events topic.
    AlarmActions:
      - !Ref EventsTopicArn
    OKActions:
      - !Ref EventsTopicArn
# Disk Space Utilization Metric Alarm for the Cluster
FreeableDiskspaceTooLowAlarm:
  # Alarm on the custom System/Linux DiskSpaceAvailable metric, averaged over
  # a fixed 10-minute period for the ECS hosts' Auto Scaling group.
  # NOTE(review): the resource name and description say "too low", yet the
  # comparison is GreaterThanThreshold, so this fires when available space is
  # ABOVE 80. Either the operator or the metric (DiskSpaceUtilization?) looks
  # wrong - confirm intent before relying on this alarm.
  # NOTE(review): the cwmon cron job publishes disk metrics with
  # --disk-path=/; verify that the published metric carries only the
  # AutoScalingGroupName dimension, otherwise this alarm will never see data.
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmDescription: 'Average disk space utilization over last 10 minutes too low, performance may suffer.'
    Namespace: System/Linux
    MetricName: DiskSpaceAvailable
    Dimensions:
      - Name: AutoScalingGroupName
        Value: !Ref ECSAutoScalingGroup
    Statistic: Average
    # Hard-coded window and threshold (not parameterised like the other alarms).
    Period: 600
    EvaluationPeriods: 1
    Threshold: 80
    ComparisonOperator: GreaterThanThreshold
    # Breach notifies the alerts topic; recovery notifies the events topic.
    AlarmActions:
      - !Ref AlertsTopicArn
    OKActions:
      - !Ref EventsTopicArn
# Network Utilization Metric Alarm for the Cluster
NetworkUtilizationTooHighAlarm:
  # Metric-math alarm on the combined network throughput of the ECS hosts'
  # Auto Scaling group. Two hidden series (NetworkIn / NetworkOut byte sums per
  # 300 s) feed one returned expression that converts them to Gbit/s.
  Type: AWS::CloudWatch::Alarm
  Properties:
    AlarmDescription: 'EC2 High Network Utilization'
    # Alarm after 6 consecutive breaching 300-second datapoints; gaps in the
    # data are treated as healthy.
    ComparisonOperator: GreaterThanThreshold
    EvaluationPeriods: 6
    Threshold: 0.048  # Gbit/s
    TreatMissingData: notBreaching
    Metrics:
      # Bytes received, summed per 5-minute period (input to `total` only).
      - Id: in
        Label: NetworkIn
        ReturnData: false
        MetricStat:
          Period: 300
          Stat: Sum
          Unit: Bytes
          Metric:
            Namespace: AWS/EC2
            MetricName: NetworkIn
            Dimensions:
              - Name: AutoScalingGroupName
                Value: !Ref ECSAutoScalingGroup
      # Bytes sent, summed per 5-minute period (input to `total` only).
      - Id: out
        Label: NetworkOut
        ReturnData: false
        MetricStat:
          Period: 300
          Stat: Sum
          Unit: Bytes
          Metric:
            Namespace: AWS/EC2
            MetricName: NetworkOut
            Dimensions:
              - Name: AutoScalingGroupName
                Value: !Ref ECSAutoScalingGroup
      # bytes per period -> bytes/s (/300) -> GB/s (/1000^3) -> Gbit/s (*8).
      # This is the only series the alarm evaluates.
      - Id: total
        Label: NetworkTotal
        Expression: '(in+out)/300/1000/1000/1000*8'  # Gbit/s
        ReturnData: true
    # Breach notifies the alerts topic; recovery notifies the events topic.
    AlarmActions:
      - !Ref AlertsTopicArn
    OKActions:
      - !Ref EventsTopicArn
#Launch Configuration
ECSLaunchConfiguration:
  # Launch configuration for the ECS hosts. The user data joins the instance
  # to the cluster, runs cfn-init against the Metadata below and then signals
  # ECSAutoScalingGroup with cfn-init's exit code.
  Type: AWS::AutoScaling::LaunchConfiguration
  Properties:
    ImageId: !Ref ECSAMI
    InstanceType: !Ref InstanceType
    SecurityGroups:
      - !Ref ECSHostSecurityGroup
    IamInstanceProfile: !Ref ECSInstanceProfile
    UserData:
      Fn::Base64: !Sub |
        #!/bin/bash
        #Register instances into cluster
        echo ECS_CLUSTER=${ClusterName} >> /etc/ecs/ecs.config
        echo ECS_ENABLE_AWSLOGS_EXECUTIONROLE_OVERRIDE=true >> /etc/ecs/ecs.config
        #Install bootstrapping tools and ssm agent
        yum install -y https://s3.amazonaws.com/ec2-downloads-windows/SSMAgent/latest/linux_amd64/amazon-ssm-agent.rpm
        yum install -y aws-cfn-bootstrap hibagent
        #Execute bootstrapping
        /opt/aws/bin/cfn-init -v --region ${AWS::Region} --stack ${AWS::StackName} --resource ECSLaunchConfiguration
        /opt/aws/bin/cfn-signal -e $? --region ${AWS::Region} --stack ${AWS::StackName} --resource ECSAutoScalingGroup
        /usr/bin/enable-ec2-spot-hibernation
  Metadata:
    AWS::CloudFormation::Init:
      # cfn-init is invoked without --configsets, so the `default` set is what
      # runs: monitoring scripts, then log shipping, then cfn-hup.
      configSets:
        default:
          - cwmon
          - awslogs
          - cfnhup

      # --- cwmon: CloudWatch monitoring scripts + cron job -------------------
      # Publishes the custom memory/swap/disk metrics that the System/Linux
      # alarms in this template read.
      cwmon:
        packages:
          yum:
            unzip: []
            perl-Switch: []
            perl-DateTime: []
            perl-Sys-Syslog: []
            perl-LWP-Protocol-https: []
            perl-Digest-SHA.x86_64: []
        files:
          /etc/cron.d/cwmon:
            content: |
              # m h dom mon dow command
              */5 * * * * root /usr/local/bin/aws-scripts-mon/mon-put-instance-data.pl --mem-avail --swap-used --disk-space-avail --disk-path=/ --from-cron --auto-scaling=only
            mode: '000644'
            owner: root
            group: root
        commands:
          downloadCloudWatchMonitoringScripts:
            command: "sudo curl -o /tmp/CloudWatchMonitoringScripts-1.2.2.zip https://aws-cloudwatch.s3.amazonaws.com/downloads/CloudWatchMonitoringScripts-1.2.2.zip"
          unzipCloudWatchMonitoringScripts:
            command: "sudo unzip /tmp/CloudWatchMonitoringScripts-1.2.2.zip -d /usr/local/bin"

      # --- awslogs: ship host logs to the log groups in this template --------
      awslogs:
        packages:
          yum:
            awslogs: []
        files:
          "/etc/awslogs/awscli.conf":
            content: !Sub |
              [plugins]
              cwlogs = cwlogs
              [default]
              region = ${AWS::Region}
            mode: '000644'
            owner: root
            group: root
          # Each ${...CWLogsGroup} below references a log group resource of
          # this template; {instance_id} is left for the agent to fill in.
          "/etc/awslogs/awslogs.conf":
            content: !Sub |
              [general]
              state_file = /var/lib/awslogs/agent-state
              [/var/log/dmesg]
              file = /var/log/dmesg
              log_group_name = ${dmesgCWLogsGroup}
              log_stream_name = {instance_id}
              [/var/log/messages]
              file = /var/log/messages
              log_group_name = ${messagesCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %b %d %H:%M:%S
              [/var/log/docker]
              file = /var/log/docker
              log_group_name = ${dockerCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %Y-%m-%dT%H:%M:%S.%f
              [/var/log/ecs/ecs-init.log]
              file = /var/log/ecs/ecs-init.log.*
              log_group_name = ${ecsinitCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %Y-%m-%dT%H:%M:%SZ
              [/var/log/ecs/ecs-agent.log]
              file = /var/log/ecs/ecs-agent.log.*
              log_group_name = ${ecsagentCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %Y-%m-%dT%H:%M:%SZ
              [/var/log/ecs/audit.log]
              file = /var/log/ecs/audit.log.*
              log_group_name = ${ecsauditCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %Y-%m-%dT%H:%M:%SZ
              [/var/log/secure]
              file = /var/log/secure
              log_group_name = ${secureCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %b %d %H:%M:%S
              [/var/log/cfn-init]
              file = /var/log/cfn-init.log
              log_group_name = ${cfninitCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %b %d %H:%M:%S
              [/var/log/cfn-init-cmd]
              file = /var/log/cfn-init-cmd.log
              log_group_name = ${cfninitcmdCWLogsGroup}
              log_stream_name = {instance_id}
              datetime_format = %b %d %H:%M:%S
            mode: '000644'
            owner: root
            group: root
        services:
          sysvinit:
            awslogsd:  # Use this block on Amazon Linux 2
              enabled: true
              ensureRunning: true
              # Changes to either file are listed as restart triggers.
              files:
                - /etc/awslogs/awslogs.conf
                - /etc/awslogs/awscli.conf
            # awslogs:  # Use this block instead on Amazon Linux
            #   enabled: true
            #   ensureRunning: true
            #   files:
            #     - /etc/awslogs/awslogs.conf
            #     - /etc/awslogs/awscli.conf

      # --- cfnhup: re-run cfn-init when this Metadata changes ---------------
      cfnhup:
        files:
          "/etc/cfn/cfn-hup.conf":
            # Quoted like every other `mode` in this template: an unquoted
            # 000400 can be typed as a number by the YAML parser, so the
            # octal string would not reach cfn-init intact.
            mode: '000400'
            owner: root
            group: root
            content: !Sub |
              [main]
              stack=${AWS::StackId}
              region=${AWS::Region}
              interval=1
          "/etc/cfn/hooks.d/cfn-auto-reloader.conf":
            content: !Sub |
              [cfn-auto-reloader-hook]
              triggers=post.update
              path=Resources.ECSLaunchConfiguration.Metadata.AWS::CloudFormation::Init
              action=/opt/aws/bin/cfn-init -v --region ${AWS::Region} --stack ${AWS::StackName} --resource ECSLaunchConfiguration
          "/etc/systemd/system/cfn-hup.service":
            content: |
              [Unit]
              Description=Cloud formation helper daemon
              [Service]
              ExecStart=/opt/aws/bin/cfn-hup
              Restart=always
              Type=simple
              [Install]
              WantedBy=multi-user.target
        services:
          sysvinit:
            cfn-hup:
              enabled: true
              ensureRunning: true
              files:
                - /etc/cfn/cfn-hup.conf
                - /etc/cfn/hooks.d/cfn-auto-reloader.conf
##CW Log Group
# One CloudWatch Logs group per host log file shipped by the awslogs agent.
# All groups live under /ecs/<ClusterName>/hosts/ and share the LogRetention
# parameter; the awslogs.conf in ECSLaunchConfiguration references each group
# by its logical ID.

# Kernel ring buffer.
dmesgCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/dmesg'

# General system log.
messagesCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/messages'

# Authentication / security log.
secureCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/secure'

# cfn-init run log.
cfninitCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/cfn-init.log'

# Output of the commands executed by cfn-init.
cfninitcmdCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/cfn-init-cmd.log'

# Docker daemon log.
dockerCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/docker'

# ecs-init log.
ecsinitCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/ecs/ecs-init.log'

# ECS agent log.
ecsagentCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/ecs/ecs-agent.log'

# ECS audit log.
ecsauditCWLogsGroup:
  Type: AWS::Logs::LogGroup
  Properties:
    RetentionInDays: !Ref LogRetention
    LogGroupName: !Sub '/ecs/${ClusterName}/hosts/var/log/ecs/audit.log'
## IAM Resources
ECSRole:
  # IAM role assumed by the ECS hosts (through ECSInstanceProfile).
  # Both policy documents are written as native YAML with an explicit policy
  # language version rather than as embedded JSON strings, so template
  # tooling can validate them.
  # NOTE(review): every action below is granted on Resource '*'; consider
  # scoping the S3 and logs statements to specific resources.
  Type: AWS::IAM::Role
  Properties:
    Path: /
    # Trust policy: only the EC2 service may assume this role.
    AssumeRolePolicyDocument:
      Version: '2012-10-17'
      Statement:
        - Effect: Allow
          Principal:
            Service: ec2.amazonaws.com
          Action: 'sts:AssumeRole'
    ManagedPolicyArns:
      - arn:aws:iam::aws:policy/service-role/AmazonEC2RoleforSSM
    Policies:
      - PolicyName: ecs-service
        PolicyDocument:
          Version: '2012-10-17'
          Statement:
            - Effect: Allow
              Resource: '*'
              Action:
                # ECS agent
                - 'ecs:CreateCluster'
                - 'ecs:DeregisterContainerInstance'
                - 'ecs:DiscoverPollEndpoint'
                - 'ecs:Poll'
                - 'ecs:RegisterContainerInstance'
                - 'ecs:StartTelemetrySession'
                - 'ecs:UpdateContainerInstancesState'
                - 'ecs:Submit*'
                - 'ecs:StartTask'
                # ECR image pulls
                - 'ecr:BatchCheckLayerAvailability'
                - 'ecr:BatchGetImage'
                - 'ecr:GetDownloadUrlForLayer'
                - 'ecr:GetAuthorizationToken'
                # Systems Manager
                - 'ssm:DescribeAssociation'
                - 'ssm:GetDeployablePatchSnapshotForInstance'
                - 'ssm:GetDocument'
                - 'ssm:GetManifest'
                - 'ssm:GetParameters'
                - 'ssm:ListAssociations'
                - 'ssm:ListInstanceAssociations'
                - 'ssm:PutInventory'
                - 'ssm:PutComplianceItems'
                - 'ssm:PutConfigurePackageResult'
                - 'ssm:UpdateAssociationStatus'
                - 'ssm:UpdateInstanceAssociationStatus'
                - 'ssm:UpdateInstanceInformation'
                - 'ec2messages:AcknowledgeMessage'
                - 'ec2messages:DeleteMessage'
                - 'ec2messages:FailMessage'
                - 'ec2messages:GetEndpoint'
                - 'ec2messages:GetMessages'
                - 'ec2messages:SendReply'
                # CloudWatch metrics (used by the monitoring cron job)
                - 'cloudwatch:PutMetricData'
                - 'cloudwatch:GetMetricStatistics'
                - 'cloudwatch:ListMetrics'
                - 'ec2:DescribeTags'
                - 'ec2:DescribeInstanceStatus'
                # Directory Service
                - 'ds:CreateComputer'
                - 'ds:DescribeDirectories'
                # CloudWatch Logs (each action listed once)
                - 'logs:CreateLogGroup'
                - 'logs:CreateLogStream'
                - 'logs:DescribeLogGroups'
                - 'logs:DescribeLogStreams'
                - 'logs:PutLogEvents'
                # S3
                - 's3:PutObject'
                - 's3:GetObject'
                - 's3:AbortMultipartUpload'
                - 's3:ListMultipartUploadParts'
                - 's3:ListBucket'
                - 's3:ListBucketMultipartUploads'
ECSInstanceProfile:
  # Instance profile that hands ECSRole to the EC2 hosts; referenced by the
  # launch configuration's IamInstanceProfile.
  Type: AWS::IAM::InstanceProfile
  Properties:
    Roles:
      - !Ref ECSRole
    Path: /
# Values exposed to the stack's consumers.
Outputs:
  # Taken from the ClusterName parameter, which is also the name given to the
  # ECSCluster resource.
  Cluster:
    Description: The name of the ECS cluster
    Value: !Ref ClusterName
  ClusterArn:
    Description: The ARN of the ECS cluster
    Value: !GetAtt ECSCluster.Arn
  ECSAutoScalingGroupName:
    Description: The name of the Auto Scaling group running the ECS hosts
    Value: !Ref ECSAutoScalingGroup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment