Skip to content

Instantly share code, notes, and snippets.

@iandanforth
Created October 1, 2013 19:13
Show Gist options
  • Select an option

  • Save iandanforth/6783556 to your computer and use it in GitHub Desktop.

Select an option

Save iandanforth/6783556 to your computer and use it in GitHub Desktop.
Updated, smugmug specific boto data collection script.
#-------------------------------------------------------------------------------
# Copyright (C) 2013 Numenta Inc. All rights reserved.
#
# The information and source code contained herein is the
# exclusive property of Numenta Inc. No part of this software
# may be used, reproduced, stored or distributed in any form,
# without explicit written authorization from Numenta Inc.
#-------------------------------------------------------------------------------
# Human-readable tool description; shown by optparse in the --help output.
desc = """
This tool will find instances with the tag specified in the accompanying
boto-config.yaml and then collect the last two weeks of cloudwatch data from
them."""
import optparse
import os
import boto
import sys
import csv
import yaml
from boto import ec2, rds, sqs
from boto.ec2 import cloudwatch
from datetime import timedelta, datetime
# Map of AWS region identifiers to their human-readable names.
# NOTE(review): not referenced anywhere in this file's visible code; kept
# in case an external caller or a later revision uses it.
AWS_REGIONS = {
    "ap-northeast-1": "Asia Pacific (Tokyo) Region",
    "ap-southeast-1": "Asia Pacific (Singapore) Region",
    "ap-southeast-2": "Asia Pacific (Sydney) Region",
    "eu-west-1": "EU (Ireland) Region",
    "sa-east-1": "South America (Sao Paulo) Region",
    "us-east-1": "US East (Northern Virginia) Region",
    "us-west-1": "US West (Northern California) Region",
    "us-west-2": "US West (Oregon) Region",
}
def main(options):
    """
    Collect CloudWatch metrics for ':numenta:'-tagged EC2 instances and
    write each instance/metric time series out as a CSV file named
    <datasetCode>_<region>_<role>_<instanceId>_<metricName>.csv.

    options - optparse Values with a .configFile attribute pointing at the
              boto-config.yaml configuration file.

    Raises KeyError if the AWS credential environment variables are unset.
    """
    # Load configuration. safe_load avoids instantiating arbitrary Python
    # objects that yaml.load() would execute from a tampered config file.
    with open(options.configFile, 'r') as fh:
        config = yaml.safe_load(fh)
    # Get credentials from the environment.
    AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
    # BUGFIX: the standard AWS environment variable is AWS_SECRET_ACCESS_KEY;
    # the original read AWS_SECRET_ACCESS_KEY_ID, which the usual AWS tooling
    # never sets, so this line raised KeyError in a normally-configured shell.
    AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
    # A code for the user whose data we're collecting
    datasetCode = config['DatasetCode']
    # The period of time we want data for (dicts of datetime kwargs)
    startTime = datetime(**config['StartTime'])
    endTime = datetime(**config['EndTime'])
    # Define the stats we want from CloudWatch
    statistics = ["Average", "Minimum", "Maximum"]
    # Which services we'll pull from. NOTE: Only EC2 at the moment
    services = config['Services']
    for region in config['Regions']:
        print("Now working on region: %s ..." % region)
        # Connect to cloudwatch
        cwConn = cloudwatch.connect_to_region(
            region_name=region,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
        for k, v in services.iteritems():
            print("Working on service: %s ..." % k)
            metricNames = v['Metrics']
            tag = v['Tag']
            if not tag:
                print("WARNING: No tag specified this might pull data from *many* servers.")
                # BUGFIX: do not shadow the builtin name `input`
                answer = raw_input("Continue? [y/n]: ")
                if answer not in ['y', 'yes', 'Y', 'Yes', 'YES']:
                    sys.exit(1)
            # Connect to service NOTE: EC2 specific for now
            ec2Conn = ec2.connect_to_region(
                region_name=region,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
            # REMOVE KEY AS THIS IS NOT HOW SMUGMUG USES THEM
            # NOTE(review): this deliberately makes the tag filter below dead
            # code, so every instance in the region is scanned.
            tag = None
            filters = {}
            if tag:
                filters["tag-key"] = tag
            # get_all_instances returns reservations, not instances
            reservations = ec2Conn.get_all_instances(filters=filters)
            instances = []
            for res in reservations:
                for instance in res.instances:
                    role = ''
                    labeled = False
                    for tagName, tagValue in instance.tags.iteritems():
                        # Collect the tier (role) the server is labeled as
                        if tagName == 'tier':
                            role = tagValue
                        # Use this instance only if it's labeled properly
                        if ':numenta:' in tagValue:
                            labeled = True
                    # BUGFIX: the original tested `instance not in instances`,
                    # but `instances` holds (instance, role) tuples, so the
                    # check never matched: a server carrying several
                    # ':numenta:' tags was appended multiple times, possibly
                    # with an empty role if the 'tier' tag iterated later.
                    # Decide once after scanning all tags.
                    if labeled:
                        instances.append((instance, role))
            # Loop over discovered instances
            for instance, role in instances:
                print("Retrieving data for Instance: %s ..." % instance.id)
                for metricName in metricNames:
                    print("Getting %s ..." % metricName)
                    metric = cwConn.list_metrics(
                        metric_name=metricName,
                        dimensions={"InstanceId": instance.id},
                        namespace="AWS/EC2")
                    if not metric:
                        continue
                    timeBlocks = genStartAndEndTimes(startTime, endTime)
                    result = []
                    # Break up 2 weeks into chunks below API limits
                    for fromDate, toDate in timeBlocks:
                        rawdata = metric[0].query(start_time=fromDate,
                                                  end_time=toDate,
                                                  statistics=statistics,
                                                  period=300)
                        if len(rawdata) == 0:
                            continue
                        # Sort by "Timestamp"
                        rawdata.sort(key=lambda row: row["Timestamp"])
                        # Append data to results
                        result.extend(rawdata)
                    # BUGFIX: guard against an empty result set; the original
                    # indexed result[0] unconditionally and raised IndexError
                    # whenever CloudWatch returned no datapoints at all.
                    if not result:
                        continue
                    # Write out results
                    outFileName = "%s_%s_%s_%s_%s.csv" % (datasetCode,
                                                          region,
                                                          role,
                                                          instance.id,
                                                          metricName)
                    with open(outFileName, 'w') as outFh:
                        writer = csv.writer(outFh)
                        # Headers come from the first row; every row from a
                        # single metric query shares the same keys.
                        writer.writerow(result[0].keys())
                        writer.writerows([row.values() for row in result])
def genStartAndEndTimes(start, end):
    '''
    Returns a list of tuples (start, end) covering [start, end] without gaps.

    start - datetime object - The start time of the full block
    end - datetime object - The end time of the full block

    Returns an empty list when start >= end; the final block is capped at
    `end` so the tuples never overshoot the requested interval.

    The full block of time will be broken up into smaller blocks to deal with
    the request limits of the CloudWatch CLI tool.

    That limit appears to be ~1,400 records / request but is not published. Since
    we are collecting records at 5 minute intervals this means we will produce
    blocks a maximum of 5 * 1400 minutes long.
    '''
    # NOTE(review): the original also defined an output-format string `fmt`
    # that was never used anywhere; it has been removed.
    # Max time delta (minutes) to stay under the ~1,400 record API limit
    dt = 5 * 1400
    partialEnd = start
    # The blocks of time we will return
    blocks = []
    while partialEnd < end:
        # Find our new endpoint, capped at the overall end time
        partialEnd = min(start + timedelta(minutes=dt), end)
        blocks.append((start, partialEnd))
        start = partialEnd
    return blocks
def verifyUserInput(options):
    '''
    Validate command line options; prints an error and exits with status 1
    when the required config file option is missing.
    '''
    # Guard clause: a config file path is the only required option.
    if options.configFile:
        return
    print("ERROR: -c is required to specify your config file.")
    sys.exit(1)
if __name__ == '__main__':
    # Build the command line parser for this tool.
    parser = optparse.OptionParser(description=desc)
    parser.add_option("-c", "--config",
                      dest="configFile",
                      help="The configuration file to use.")
    options, args = parser.parse_args()
    # Bail out early on malformed or invalid inputs, then run the collection.
    verifyUserInput(options)
    main(options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment