Created January 14, 2017 00:54
Simple Opt-In Chaos Function
# chaosfcn (python 2.7)
# purpose: test resiliency of the EC2 environment (opt-in with tag)
# implementation:
# 0) leverage CW Events to trigger this function on a schedule
# 1) retrieve the list of participating instances
# 2) terminate a random sample of the participating instances
# error handling
# no safeguard (e.g. against deleting all instances related to the same application)
# if desired, modify the trigger for a less predictable run schedule
import boto3,random,json,logging
logger = logging.getLogger()
def lambda_handler(event, context):
    """Terminate a random sample of opted-in, running EC2 instances.

    The handler builds a list of EC2 filters (running state, optional opt-in
    tag key, optional VPC restriction), collects the ids of the matching
    instances and terminates ``NUM_TO_KILL`` of them chosen at random.

    Termination only happens when the number of participants is strictly
    greater than ``NUM_TO_KILL``, so at least one participant survives a run.

    Args:
        event: The triggering event; it is only logged (must be
            JSON-serializable for ``json.dumps``).
        context: The Lambda context object; unused.

    Returns:
        The result of the ``terminate()`` call on the chosen instances, or an
        empty dict when nothing was terminated.
    """
    logger.info('got event {}'.format(json.dumps(event)))
    ec2 = boto3.resource('ec2')

    # config
    NUM_TO_KILL = 1  # can be fixed, or calculated (% of total fleet)
    VPC_IDs = ['vpc-700cf415']  # list of VPC ids, empty list for all VPCs
    TAG_KEY = 'chaos-me'  # opt-in tag key; set to None to include all instances

    # default participating criteria (running)
    criteria = [{'Name': 'instance-state-name', 'Values': ['running']}]

    # additional criteria
    # opt-in with tag key
    if TAG_KEY:
        criteria.append({'Name': 'tag-key', 'Values': [TAG_KEY]})
    # limit to specific vpc
    if VPC_IDs:
        criteria.append({'Name': 'vpc-id', 'Values': VPC_IDs})
    # see filters: the EC2 DescribeInstances API reference lists valid names

    ### 1) retrieve the list of participating instances ###
    logger.info('Criteria: {}'.format(criteria))
    instances = ec2.instances.filter(Filters=criteria)
    participants = [ins.id for ins in instances]
    count = len(participants)

    ### 2) terminate a random sample of the participating instances ###
    # Guard NUM_TO_KILL < 1 explicitly: sampling zero ids would yield an empty
    # InstanceIds list, which may not restrict the terminate call at all.
    # NOTE(review): confirm how boto3 treats an empty InstanceIds filter.
    if NUM_TO_KILL < 1 or count <= NUM_TO_KILL:
        logger.info('Nothing to do!')
        return {}

    # Sample the ids directly; works on both Python 2 and 3 (no xrange).
    chosen = random.sample(participants, NUM_TO_KILL)
    # print ID of lucky instances
    logger.info("Terminating Instances " + ','.join(chosen))
    res = ec2.instances.filter(InstanceIds=chosen).terminate()
    return res
