Last active
April 29, 2021 13:39
-
-
Save toricls/54d91245abde4fe373aab440cf19e5d3 to your computer and use it in GitHub Desktop.
Resilient Fargate task scheduling with Step Functions and EventBridge
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AWSTemplateFormatVersion: '2010-09-09'
Description: 'Sample Task Definition'
Resources:
  # Fargate task definition with a single container named "fargate-app".
  # The container either prints a greeting and exits, or runs Apache in the
  # foreground, depending on the IS_STANDALONE_EXECUTION environment variable.
  TaskDefinition:
    Type: AWS::ECS::TaskDefinition
    Properties:
      RequiresCompatibilities:
        - "FARGATE"
      # Task-level Cpu/Memory are String properties in CloudFormation; quoted
      # so the template does not rely on implicit number-to-string coercion.
      Cpu: '256'
      Memory: '512'
      NetworkMode: awsvpc
      ContainerDefinitions:
        - Image: "amazon/amazon-ecs-sample"
          Name: "fargate-app"
          PortMappings:
            - ContainerPort: 80
              HostPort: 80
              Protocol: tcp
          Essential: true
          # EntryPoint already invokes `sh -c`, so Command is passed as the
          # single script argument; no nested `/bin/sh -c "..."` is needed.
          EntryPoint:
            - 'sh'
            - '-c'
          Command:
            # The variable is quoted and defaulted to 0 so an unset value takes
            # the `else` branch cleanly instead of via a `[` syntax error, and
            # the POSIX `=` operator replaces the bash-only `==`.
            # `${...}` is literal here: no !Sub is applied, so the container's
            # shell expands it at run time.
            - >-
              if [ "${IS_STANDALONE_EXECUTION:-0}" = 1 ];
              then echo 'Hello from AWS Step Functions!';
              else /usr/sbin/apache2 -D FOREGROUND;
              fi
Outputs:
  # For AWS::ECS::TaskDefinition, Ref returns the task definition ARN
  # (including the revision).
  TaskDefinitionArn:
    Value: !Ref TaskDefinition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AWSTemplateFormatVersion: '2010-09-09'
# The description previously read 'Sample Task Definition' (copied from the
# task definition template); this template only creates an SNS topic.
Description: 'Sample SNS Topic'
Resources:
  # Topic with default settings; no subscriptions are created here.
  SNSTopic:
    Type: AWS::SNS::Topic
Outputs:
  # For AWS::SNS::Topic, Ref returns the topic ARN.
  SNSTopicArn:
    Value: !Ref SNSTopic
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
AWSTemplateFormatVersion: '2010-09-09'
Description: 'Creates once-per-15-min Fargate task with Step Functions and EventBridge'
Parameters:
  TaskDefinitionArn:
    Type: 'String'
    Description: 'Example: arn:aws:ecs:<region>:<account-id>:task-definition/<task-def-name>:<task-def-revision>'
  PublicSubnets:
    Type: 'List<AWS::EC2::Subnet::Id>'
    Description: "This CloudFormation template requires 'public' subnets to run Fargate task. See the comment for the 'StateMachine' resource's 'AssignPublicIp' property."
  SecurityGroups:
    Type: 'List<AWS::EC2::SecurityGroup::Id>'
    Description: 'Security groups applied to the Fargate task.'
  SNSTopicArn:
    Type: 'String'
    Description: 'An SNS topic ARN to notify ECS task success/failures, and also failures of the SFn state machine itself. Example: arn:aws:sns:<region>:<account-id>:<topic-name>'
Resources:
  # ECS cluster the scheduled task runs in, with Container Insights enabled.
  Cluster:
    Type: 'AWS::ECS::Cluster'
    Properties:
      ClusterSettings:
        - Name: containerInsights
          Value: enabled
  # EventBridge schedule that starts one state machine execution every 15 minutes.
  Rule:
    Type: 'AWS::Events::Rule'
    Properties:
      ScheduleExpression: 'rate(15 minutes)'
      State: ENABLED
      Targets:
        - Arn: !Ref StateMachine
          Id: statemachine
          RoleArn: !GetAtt 'RuleRole.Arn'
  # Role assumed by EventBridge; only allowed to start this state machine.
  RuleRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: 'events.amazonaws.com'
            Action: 'sts:AssumeRole'
      Policies:
        - PolicyName: EventRulePolicy
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action: 'states:StartExecution'
                Resource: !Ref StateMachine
  # State machine: run the Fargate task synchronously, retry on failure, then
  # publish a success or failure notification to the SNS topic.
  StateMachine:
    Type: 'AWS::StepFunctions::StateMachine'
    Properties:
      RoleArn: !GetAtt 'StateMachineRole.Arn'
      LoggingConfiguration:
        Destinations:
          - CloudWatchLogsLogGroup:
              LogGroupArn: !GetAtt StateMachineExecutionLogGroup.Arn
        IncludeExecutionData: true
        Level: 'FATAL'
      # Values substituted for the ${...} placeholders in DefinitionString.
      DefinitionSubstitutions:
        Cluster: !GetAtt Cluster.Arn
        TaskDefinition: !Ref TaskDefinitionArn
        # Lists are joined with '","' so that ["${Subnets}"] in the definition
        # expands to a valid JSON array of strings.
        Subnets: !Join
          - '","'
          - !Ref PublicSubnets
        AssignPublicIp: 'ENABLED' # Should be DISABLED if we use private subnets
        SecurityGroups: !Join
          - '","'
          - !Ref SecurityGroups
        Timeout: 900 # seconds
        SNSTopicArn: !Ref SNSTopicArn
      # The state machine definition below is a bit outdated, I'd add some changes if I run this today
      ## - Use 'CapacityProviderStrategy' instead of 'LaunchType'
      ## - Use specific 'PlatformVersion' and not LATEST
      ## - Enable long arn format to use tag propagation
      #
      # "Notify Failure" is entered through Catch, so its input is the error
      # output ({"Error": ..., "Cause": ...}) rather than the ECS task result.
      # It therefore publishes Error/Cause instead of reading $.Containers,
      # which does not exist in that payload.
      DefinitionString: |-
        {
          "Version": "1.0",
          "Comment": "Run AWS Fargate task",
          "TimeoutSeconds": ${Timeout},
          "StartAt": "Run Fargate Task",
          "States": {
            "Run Fargate Task": {
              "Type": "Task",
              "Resource": "arn:aws:states:::ecs:runTask.sync",
              "Parameters": {
                "LaunchType": "FARGATE",
                "Cluster": "${Cluster}",
                "TaskDefinition": "${TaskDefinition}",
                "Group.$": "$$.Execution.Name",
                "NetworkConfiguration": {
                  "AwsvpcConfiguration": {
                    "Subnets": ["${Subnets}"],
                    "AssignPublicIp": "${AssignPublicIp}",
                    "SecurityGroups": ["${SecurityGroups}"]
                  }
                },
                "Overrides": {
                  "ContainerOverrides": [
                    {
                      "Name": "fargate-app",
                      "Environment": [
                        {
                          "Name": "IS_STANDALONE_EXECUTION",
                          "Value": "1"
                        }
                      ]
                    }
                  ]
                }
              },
              "Retry": [
                {
                  "ErrorEquals": [
                    "States.TaskFailed"
                  ],
                  "IntervalSeconds": 3,
                  "MaxAttempts": 3,
                  "BackoffRate": 1.0
                },
                {
                  "ErrorEquals": [
                    "ECS.AmazonECSException"
                  ],
                  "IntervalSeconds": 10,
                  "MaxAttempts": 3,
                  "BackoffRate": 2.0
                }
              ],
              "Next": "Notify Success",
              "Catch": [
                {
                  "ErrorEquals": [
                    "States.ALL"
                  ],
                  "Next": "Notify Failure"
                }
              ]
            },
            "Notify Success": {
              "Type": "Task",
              "Resource": "arn:aws:states:::sns:publish",
              "Parameters": {
                "Message": "AWS Fargate Task started by Step Functions succeeded",
                "MessageAttributes": {
                  "ExitCode": {
                    "DataType": "String",
                    "StringValue.$": "$.Containers[?(@.Name=='fargate-app')].ExitCode"
                  },
                  "FullTaskResult": {
                    "DataType": "String",
                    "StringValue.$": "$"
                  }
                },
                "TopicArn": "${SNSTopicArn}"
              },
              "End": true
            },
            "Notify Failure": {
              "Type": "Task",
              "Resource": "arn:aws:states:::sns:publish",
              "Parameters": {
                "Message": "AWS Fargate Task started by Step Functions failed",
                "MessageAttributes": {
                  "Error": {
                    "DataType": "String",
                    "StringValue.$": "$.Error"
                  },
                  "Cause": {
                    "DataType": "String",
                    "StringValue.$": "$.Cause"
                  },
                  "FullTaskResult": {
                    "DataType": "String",
                    "StringValue.$": "$"
                  }
                },
                "TopicArn": "${SNSTopicArn}"
              },
              "End": true
            }
          }
        }
  # Destination for the state machine's execution logs.
  StateMachineExecutionLogGroup:
    Type: 'AWS::Logs::LogGroup'
    Properties:
      RetentionInDays: 7
  # NOTE(review): TaskExecutionRole and TaskRole are created here and covered by
  # the state machine's iam:PassRole statement, but nothing in this template
  # attaches them to the task definition (it is passed in by ARN) - confirm
  # the task definition references them if they are meant to be used.
  TaskExecutionRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: 'sts:AssumeRole'
      ManagedPolicyArns:
        - 'arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy'
      # If the task uses Secrets Manager integration
      #Policies:
      #- PolicyDocument:
      #    Statement:
      #    - Effect: Allow
      #      Action: 'secretsmanager:GetSecretValue'
      #      Resource:
      #      - arn:aws:secretsmanager:<region>:<aws_account_id>:secret:secret_name
      #      - arn:aws:kms:<region>:<aws_account_id>:key/key_id
  TaskRole:
    Type: AWS::IAM::Role
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: ecs-tasks.amazonaws.com
            Action: 'sts:AssumeRole'
      #Policies:
      #-
  # Role assumed by Step Functions to run/stop the task, publish to SNS,
  # deliver logs, and manage the EventBridge rule used by the .sync integration.
  StateMachineRole:
    Type: 'AWS::IAM::Role'
    Properties:
      AssumeRolePolicyDocument:
        Version: '2012-10-17'
        Statement:
          - Effect: Allow
            Principal:
              Service: 'states.amazonaws.com'
            Action: 'sts:AssumeRole'
      Policies:
        - PolicyName: StateMachine
          PolicyDocument:
            Version: '2012-10-17'
            Statement:
              - Effect: Allow
                Action: 'iam:PassRole'
                Resource:
                  - !GetAtt TaskExecutionRole.Arn
                  - !GetAtt TaskRole.Arn
              - Effect: Allow
                Action: 'ecs:RunTask'
                Resource: !Ref TaskDefinitionArn
                Condition:
                  ArnEquals:
                    'ecs:cluster': !GetAtt Cluster.Arn
              - Effect: Allow
                Action:
                  - 'ecs:StopTask'
                  - 'ecs:DescribeTasks'
                Resource: '*'
                Condition:
                  ArnEquals:
                    'ecs:cluster': !GetAtt Cluster.Arn
              - Effect: Allow
                Action:
                  - 'sns:Publish'
                Resource: !Ref SNSTopicArn
              - Effect: Allow
                Action:
                  - 'logs:CreateLogDelivery'
                  - 'logs:GetLogDelivery'
                  - 'logs:UpdateLogDelivery'
                  - 'logs:DeleteLogDelivery'
                  - 'logs:ListLogDeliveries'
                  - 'logs:PutResourcePolicy'
                  - 'logs:DescribeResourcePolicies'
                  - 'logs:DescribeLogGroups'
                Resource: '*' # CWL doesn't support resource-level permissions
              - Effect: Allow
                Action:
                  - 'events:PutTargets'
                  - 'events:PutRule'
                  - 'events:DescribeRule'
                Resource: !Sub 'arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/StepFunctionsGetEventsForECSTaskRule'
  # Alarm: any failed execution within a 5-minute period notifies the SNS topic.
  ExecutionsFailedAlarm:
    Type: 'AWS::CloudWatch::Alarm'
    Properties:
      AlarmDescription: 'Failure while executing scheduled task.'
      Namespace: 'AWS/States'
      MetricName: ExecutionsFailed
      Dimensions:
        - Name: StateMachineArn
          Value: !Ref StateMachine
      Statistic: Sum
      Period: 300
      DatapointsToAlarm: 1
      EvaluationPeriods: 1
      Threshold: 0
      TreatMissingData: notBreaching
      ComparisonOperator: GreaterThanThreshold
      AlarmActions:
        - !Ref SNSTopicArn
  # Alarm: any timed-out execution within a 5-minute period notifies the SNS topic.
  ExecutionsTimeoutAlarm:
    Type: 'AWS::CloudWatch::Alarm'
    Properties:
      AlarmDescription: 'Executing scheduled task timed out.'
      Namespace: 'AWS/States'
      MetricName: ExecutionsTimedOut
      Dimensions:
        - Name: StateMachineArn
          Value: !Ref StateMachine
      Statistic: Sum
      Period: 300
      DatapointsToAlarm: 1
      EvaluationPeriods: 1
      Threshold: 0
      TreatMissingData: notBreaching
      ComparisonOperator: GreaterThanThreshold
      AlarmActions:
        - !Ref SNSTopicArn
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment