Skip to content

Instantly share code, notes, and snippets.

@sylwit
Created April 5, 2017 14:21
Show Gist options
  • Save sylwit/f9b0482c95e40d279ceabfd737216340 to your computer and use it in GitHub Desktop.
Save sylwit/f9b0482c95e40d279ceabfd737216340 to your computer and use it in GitHub Desktop.
Set up CloudWatch alarms with a Lambda function
# Create CloudWatch alarms for the EC2 instances listed in the INSTANCE_ID environment variable (EBS volume alarms are still a TODO)
import os
import boto3
# Instances to monitor: INSTANCE_ID holds one or more instance ids joined by '#'.
instance_ids = os.environ.get('INSTANCE_ID').split('#')
# The EC2 and EBS alarms notify the same SNS topic, read from SNS_TOPIC_ARN,
# e.g. arn:aws:sns:<region>:<account-id>:<topic-name>
ec2_sns = ebs_sns = os.environ.get('SNS_TOPIC_ARN')
# Account id and region are only referenced by the automated recover/reboot
# alarm actions, which are not active yet; the lookups stay disabled until then.
# akid = os.environ.get('ACCOUNT_ID')
# region = os.environ.get('REGION_NAME')
# AWS service clients, created once when the module is loaded.
cw = boto3.client('cloudwatch')
ec2 = boto3.client('ec2')
def _flatten_instances(reservations):
    """Return every instance dict contained in a list of EC2 reservations."""
    return [inst for res in reservations for inst in res['Instances']]


def _name_tag(instance):
    """Return the value of the instance's ``Name`` tag, or ``''`` if it has none."""
    # 'Tags' may be missing (or None) for untagged instances, so never iterate
    # the raw lookup result directly.
    for tag in instance.get('Tags') or []:
        if tag['Key'] == 'Name':
            return tag['Value']
    return ''


def _instance_alarms(instance, topic_arn):
    """Build the ``put_metric_alarm`` keyword arguments for one EC2 instance.

    This function is pure: it only assembles dictionaries and performs no AWS
    calls.

    Args:
        instance: An instance dict as found in a ``describe_instances``
            response; ``InstanceId`` and ``InstanceType`` are required,
            ``Tags`` is optional.
        topic_arn: SNS topic ARN used as the single alarm action.

    Returns:
        A list of keyword-argument dicts, in creation order: CPU utilization,
        CPU credit balance (only when the instance type contains ``t2``),
        system status check, memory utilization.
    """
    instance_id = instance['InstanceId']
    # "<Name tag> <instance id>", or just the id when there is no Name tag, so
    # an untagged instance still gets unique, well-formed alarm names.
    prefix = ' '.join(
        part for part in (_name_tag(instance), instance_id) if part)

    def alarm(title, description, metric, threshold, comparison,
              namespace='AWS/EC2', period=300, evaluation_periods=1):
        return {
            'AlarmName': '%s %s' % (prefix, title),
            'AlarmDescription': description,
            'ActionsEnabled': True,
            'AlarmActions': [topic_arn],
            'MetricName': metric,
            'Namespace': namespace,
            'Statistic': 'Average',
            'Dimensions': [{'Name': 'InstanceId', 'Value': instance_id}],
            'Period': period,
            'EvaluationPeriods': evaluation_periods,
            'Threshold': threshold,
            'ComparisonOperator': comparison,
        }

    alarms = [
        alarm('High CPU Utilization Warning',
              'CPU Utilization Greater than 75% for 5+ Minutes',
              'CPUUtilization', 75.0, 'GreaterThanOrEqualToThreshold'),
    ]
    # CPU credit alarms only make sense on T2 instances.
    if "t2" in instance['InstanceType']:
        alarms.append(
            alarm('Credit Balance Warning',
                  'CPU Credit Balance <= 25 for 30 Minutes',
                  'CPUCreditBalance', 25.0, 'LessThanOrEqualToThreshold',
                  evaluation_periods=6))
    alarms.append(
        alarm('System Check Failed',
              'Status Check Failed (System) for 5 Minutes',
              'StatusCheckFailed_System', 1.0, 'GreaterThanOrEqualToThreshold',
              period=60, evaluation_periods=5))
    # Memory is read from the custom 'System/Linux' namespace rather than
    # 'AWS/EC2'.
    alarms.append(
        alarm('High MemoryUtilization Warning',
              'MemoryUtilization Greater than 85% for 5+ Minutes',
              'MemoryUtilization', 85.0, 'GreaterThanOrEqualToThreshold',
              namespace='System/Linux'))
    return alarms


def lambda_handler(event, context):
    """Create or update CloudWatch alarms for the configured EC2 instances.

    Looks up the instances named in the module-level ``instance_ids`` and, for
    each one, puts the alarms built by ``_instance_alarms`` using the
    module-level ``cw`` client, with ``ec2_sns`` as the alarm action.

    Failures are handled per instance on a best-effort basis: the error is
    printed and processing continues with the next instance.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda context object (unused).

    Returns:
        None.
    """
    reservations = ec2.describe_instances(
        InstanceIds=instance_ids).get('Reservations', [])

    for instance in _flatten_instances(reservations):
        try:
            print("Found instance %s with name %s"
                  % (instance['InstanceId'], _name_tag(instance)))
            for alarm_kwargs in _instance_alarms(instance, ec2_sns):
                cw.put_metric_alarm(**alarm_kwargs)
        except Exception as e:
            # Deliberately broad: one bad instance must not stop the rest.
            print("Error Encountered.")
            print(e)

    # TODO: an earlier draft placed more alarm code after an unconditional
    # `return`, so it never executed, and it has been removed. It sketched:
    #   * EBS volume alarms on VolumeIdleTime (warning/critical) and
    #     BurstBalance, one set per attached volume;
    #   * warning/critical tiers for CPU utilization and CPU credit balance;
    #   * StatusCheckFailed_System / StatusCheckFailed_Instance alarms with
    #     automated recover and reboot actions.
    # The recover/reboot actions need the region and account id
    # (REGION_NAME / ACCOUNT_ID), and the draft also used `time.sleep`, which
    # requires `import time`; none of these are set up in this module yet.
@frank3427
Copy link

Can you explain how one runs this?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment