Skip to content

Instantly share code, notes, and snippets.

@vvalorous
Created March 27, 2018 06:53
Show Gist options
  • Save vvalorous/730b03405b5f8380f88845d7382b88e6 to your computer and use it in GitHub Desktop.
Save vvalorous/730b03405b5f8380f88845d7382b88e6 to your computer and use it in GitHub Desktop.
Lambda Python Code to Monitor CloudWatch Metrics
#!/usr/bin/env python
import boto3
import json
import urllib
import urllib2
from datetime import datetime, timedelta
# #################################################
# AWS region whose CloudWatch metrics are inspected.
AWS_REGION = 'us-west-1'

# PagerDuty endpoint that receives the trigger events.
PAGERDUTY_URL = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'

# Service key sent with every PagerDuty event; this is the generic one that
# PagerDuty provides freely.
PAGERDUTY_SERVICE_KEY = 'w_8PcNuhHa-y3xYdmc1x'

# Length, in seconds, of the look-back window. Metrics are currently put every
# 5 minutes in baseline and the lambda also runs every 5 minutes, so checking
# the past 5 minutes is enough.
CLOUDWATCH_PERIOD = 5 * 60

# Every metric that is read to decide whether the monitor has a problem.
# 'namespace' and 'metricname' are required; 'dimensions' is optional and, when
# present, is passed to CloudWatch unchanged.
CLOUDWATCH_METRICS = [
    {
        'namespace': 'Test',
        'metricname': 'TestValueWithNoDimensions',
    },
    {
        'namespace': 'Test2',
        'metricname': 'MyMetric',
        'dimensions': [
            {'Name': 'Dimension1', 'Value': 'NameOfTheValue'},
        ],
    },
]
# #################################################
def pagerduty():
    """Send a 'trigger' event to PagerDuty and report the outcome on stdout.

    The event is POSTed as JSON to PAGERDUTY_URL using PAGERDUTY_SERVICE_KEY.
    A fixed incident_key ('MonitorOfMonitors') is always sent.
    NOTE(review): presumably this lets PagerDuty de-duplicate repeated
    triggers -- confirm against the Events API documentation.

    Returns:
        None. Network errors, HTTP errors and unparsable responses are
        printed and swallowed so that a failed alert never raises to the
        caller.
    """
    # print(...) with a single argument behaves identically on Python 2.
    print('INFO: Alerting on call support using PagerDuty')
    headers = {
        'content-type': 'application/json'
    }
    params = json.dumps({
        'incident_key': 'MonitorOfMonitors',
        'service_key': PAGERDUTY_SERVICE_KEY,
        'event_type': 'trigger',
        'description': 'FAILURE reading metrics from CloudWatch',
        'client': 'CloudWatcher'
    }).encode('utf8')
    req = urllib2.Request(PAGERDUTY_URL, params, headers)
    try:
        print('INFO: Creating alert')
        rsp = urllib2.urlopen(req)
        try:
            data = json.loads(rsp.read())
        finally:
            # Always release the connection, even if the body is not JSON.
            rsp.close()
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be caught first.
        print('ERROR: Not able to create alert')
        if hasattr(e, 'reason'):
            print('Reason: %s' % e.reason)
    except urllib2.URLError as e:
        print('ERROR: URL malformed')
        if hasattr(e, 'reason'):
            print('Reason: %s' % e.reason)
    except ValueError as e:
        # json.loads raises ValueError when the body is not valid JSON.
        print('ERROR: Response from PagerDuty was not valid JSON')
        print('Reason: %s' % e)
    else:
        # Tolerate a response without a 'status' key (or a non-object body)
        # instead of raising KeyError/TypeError.
        if isinstance(data, dict) and data.get('status') == 'success':
            print('INFO: Alert was sent correctly')
        else:
            print('ERROR: Non success response from PagerDuty')
        print('INFO: PagerDuty Response= %s' % data)
# #################################################
def lambda_handler(event, context):
    """AWS Lambda entry point: log the triggering event id and run main().

    Args:
        event: The invocation payload. Its 'id' entry, when present, is
            logged. NOTE(review): presumably a scheduled CloudWatch Events
            payload -- confirm against the trigger configuration.
        context: Lambda runtime context; not used.
    """
    print('INFO: Lambda handler activated')
    # The id is only used for logging, so a payload without it (for example
    # a manual test invocation) must not stop the metric check from running.
    event_id = event.get('id') if isinstance(event, dict) else None
    print('INFO: Event ID is %s' % event_id)
    main()
# #################################################
def main():
    """Check every configured CloudWatch metric and page if any has no data.

    For each entry in CLOUDWATCH_METRICS the statistics of the last
    CLOUDWATCH_PERIOD seconds are requested. A metric whose response contains
    no datapoints counts as an error; if at least one metric is in error,
    pagerduty() is called once after all metrics have been checked.

    Exceptions raised by the CloudWatch client are not caught here and
    propagate to the caller.
    """
    print('INFO: Starting task')
    c = boto3.client('cloudwatch', region_name=AWS_REGION)
    print('INFO: Getting metrics from CloudWatch')
    total_error = 0
    for metric in CLOUDWATCH_METRICS:
        print('INFO: Reading metric %s from namespace %s'
              % (metric['metricname'], metric['namespace']))
        # Sample the clock once so the window is exactly CLOUDWATCH_PERIOD
        # seconds long.
        end_time = datetime.utcnow()
        request = {
            'Namespace': metric['namespace'],
            'MetricName': metric['metricname'],
            'StartTime': end_time - timedelta(seconds=CLOUDWATCH_PERIOD),
            'EndTime': end_time,
            'Period': CLOUDWATCH_PERIOD,
            'Statistics': ['Average', 'Minimum', 'Maximum'],
            'Unit': 'Count',
        }
        # Dimensions are optional per metric; only send them when configured.
        if 'dimensions' in metric:
            request['Dimensions'] = metric['dimensions']
        response = c.get_metric_statistics(**request)
        dp = response['Datapoints']
        if not dp:
            print('WARN: Response from CloudWatch was empty for metric %s in namespace %s'
                  % (metric['metricname'], metric['namespace']))
            total_error += 1
        else:
            print('INFO: Metric %s in namespace %s is fine'
                  % (metric['metricname'], metric['namespace']))
    if total_error > 0:
        print('INFO: A total of %d metrics had errors' % total_error)
        pagerduty()
# #################################################
# Allow running the check directly from the command line, outside of Lambda.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment