Created
March 27, 2018 06:53
-
-
Save vvalorous/730b03405b5f8380f88845d7382b88e6 to your computer and use it in GitHub Desktop.
Lambda Python Code to Monitor CloudWatch Metrics
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import json
import os
import urllib
import urllib2
from datetime import datetime, timedelta

import boto3
# ################################################# | |
# Region where we read metrics from
AWS_REGION = 'us-west-1'

# PagerDuty API URL to trigger events
PAGERDUTY_URL = 'https://events.pagerduty.com/generic/2010-04-15/create_event.json'

# Service key sent with every PagerDuty event. The PAGERDUTY_SERVICE_KEY
# environment variable takes precedence so the key can be rotated without
# editing the source; when it is unset or empty we fall back to the key this
# script has always shipped with.
PAGERDUTY_SERVICE_KEY = (
    os.environ.get('PAGERDUTY_SERVICE_KEY') or 'w_8PcNuhHa-y3xYdmc1x'
)

# We currently put metrics every 5 minutes in baseline and also run the lambda
# every 5 minutes. So it is OK to check the past 5 minutes for metrics
CLOUDWATCH_PERIOD = 300

# These are all the metrics that we read in order to determine if there is an
# issue happening with our monitor. Each entry needs 'namespace' and
# 'metricname'; 'dimensions' is optional and, when present, is passed to
# CloudWatch unchanged as a list of {'Name': ..., 'Value': ...} dicts.
CLOUDWATCH_METRICS = [
    {
        'namespace': 'Test',
        'metricname': 'TestValueWithNoDimensions',
    },
    {
        'namespace': 'Test2',
        'metricname': 'MyMetric',
        'dimensions': [
            {
                'Name': 'Dimension1',
                'Value': 'NameOfTheValue',
            },
        ],
    },
]
# ################################################# | |
def pagerduty():
    """Trigger a PagerDuty incident reporting that CloudWatch metrics are missing.

    POSTs a 'trigger' event as JSON to PAGERDUTY_URL using
    PAGERDUTY_SERVICE_KEY. The fixed incident key 'MonitorOfMonitors' is sent
    with every event. Failures are reported on stdout and never raised, so a
    PagerDuty outage does not crash the caller. Returns None.
    """
    print('INFO: Alerting on call support using PagerDuty')
    headers = {
        'content-type': 'application/json'
    }
    params = json.dumps({
        'incident_key': 'MonitorOfMonitors',
        'service_key': PAGERDUTY_SERVICE_KEY,
        'event_type': 'trigger',
        'description': 'FAILURE reading metrics from CloudWatch',
        'client': 'CloudWatcher'
    }).encode('utf8')
    req = urllib2.Request(PAGERDUTY_URL, params, headers)
    try:
        print('INFO: Creating alert')
        rsp = urllib2.urlopen(req)
        try:
            data = json.loads(rsp.read())
        finally:
            # Always release the connection, even if the body is not JSON.
            rsp.close()
    except urllib2.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        print('ERROR: Not able to create alert')
        if hasattr(e, 'reason'):
            print('Reason: %s' % e.reason)
    except urllib2.URLError as e:
        print('ERROR: URL malformed')
        if hasattr(e, 'reason'):
            print('Reason: %s' % e.reason)
    except ValueError as e:
        # json.loads raises ValueError when the body is not valid JSON.
        print('ERROR: PagerDuty response was not valid JSON')
        print('Reason: %s' % e)
    else:
        # Guard against a JSON body that is not an object or lacks 'status'.
        if isinstance(data, dict) and data.get('status') == 'success':
            print('INFO: Alert was sent correctly')
        else:
            print('ERROR: Non success response from PagerDuty')
        print('INFO: PagerDuty Response= %s' % data)
# ################################################# | |
def lambda_handler(event, context):
    """AWS Lambda entry point: log the triggering event's id and run the check.

    event   -- the invocation payload; its 'id' field is logged when present.
    context -- the Lambda context object (unused).
    """
    print('INFO: Lambda handler activated')
    # Not every invocation carries an 'id' (e.g. an ad-hoc test payload); a
    # missing id must not stop the metric check from running.
    event_id = event.get('id', 'unknown') if isinstance(event, dict) else 'unknown'
    print('INFO: Event ID is %s' % event_id)
    main()
# ################################################# | |
def main():
    """Check each configured CloudWatch metric and page on-call if any is silent.

    For every entry in CLOUDWATCH_METRICS, request statistics for the last
    CLOUDWATCH_PERIOD seconds in AWS_REGION. A metric that returns no
    datapoints counts as an error. If at least one metric had an error,
    pagerduty() is called once after all metrics have been checked.
    Returns None.
    """
    print('INFO: Starting task')
    client = boto3.client('cloudwatch', region_name=AWS_REGION)
    print('INFO: Getting metrics from CloudWatch')
    total_error = 0
    for metric in CLOUDWATCH_METRICS:
        label = (metric['metricname'], metric['namespace'])
        print('INFO: Reading metric %s from namespace %s' % label)

        # Take "now" once so the window is exactly CLOUDWATCH_PERIOD long.
        end_time = datetime.utcnow()
        query = {
            'Namespace': metric['namespace'],
            'MetricName': metric['metricname'],
            'StartTime': end_time - timedelta(seconds=CLOUDWATCH_PERIOD),
            'EndTime': end_time,
            'Period': CLOUDWATCH_PERIOD,
            'Statistics': ['Average', 'Minimum', 'Maximum'],
            'Unit': 'Count',
        }
        # Dimensions are optional per metric; only send them when configured.
        if 'dimensions' in metric:
            query['Dimensions'] = metric['dimensions']
        response = client.get_metric_statistics(**query)

        if not response.get('Datapoints'):
            print('WARN: Response from CloudWatch was empty for metric %s in namespace %s' % label)
            total_error += 1
        else:
            print('INFO: Metric %s in namespace %s is fine' % label)

    # Alert once for the whole run rather than once per failing metric.
    if total_error > 0:
        print('INFO: A total of %d metrics had errors' % total_error)
        pagerduty()
# ################################################# | |
# Allow running the check directly from the command line, outside Lambda.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment