Skip to content

Instantly share code, notes, and snippets.

@iandanforth
Created October 1, 2013 19:13
Show Gist options
  • Select an option

  • Save iandanforth/6783556 to your computer and use it in GitHub Desktop.

Select an option

Save iandanforth/6783556 to your computer and use it in GitHub Desktop.
Updated, smugmug specific boto data collection script.
#-------------------------------------------------------------------------------
# Copyright (C) 2013 Numenta Inc. All rights reserved.
#
# The information and source code contained herein is the
# exclusive property of Numenta Inc. No part of this software
# may be used, reproduced, stored or distributed in any form,
# without explicit written authorization from Numenta Inc.
#-------------------------------------------------------------------------------
# Human-readable tool description; shown by optparse in the --help output.
desc = """
This tool will find instances with the tag specified in the accompanying
boto-config.yaml and then collect the last two weeks of cloudwatch data from
them."""
import optparse
import os
import boto
import sys
import csv
import yaml
from boto import ec2, rds, sqs
from boto.ec2 import cloudwatch
from datetime import timedelta, datetime
# Map of AWS region identifiers to their human-readable names.
# NOTE(review): not referenced anywhere in this file's visible code; kept
# in case an external caller or a later revision uses it.
AWS_REGIONS = {
    "ap-northeast-1": "Asia Pacific (Tokyo) Region",
    "ap-southeast-1": "Asia Pacific (Singapore) Region",
    "ap-southeast-2": "Asia Pacific (Sydney) Region",
    "eu-west-1": "EU (Ireland) Region",
    "sa-east-1": "South America (Sao Paulo) Region",
    "us-east-1": "US East (Northern Virginia) Region",
    "us-west-1": "US West (Northern California) Region",
    "us-west-2": "US West (Oregon) Region",
}
def main(options):
    """
    Collect CloudWatch metrics for ':numenta:'-tagged EC2 instances and
    write each instance/metric time series out as a CSV file named
    <datasetCode>_<region>_<role>_<instanceId>_<metricName>.csv.

    options - optparse Values with a .configFile attribute pointing at the
              boto-config.yaml configuration file.

    Raises KeyError if the AWS credential environment variables are unset.
    """
    # Load configuration. safe_load avoids instantiating arbitrary Python
    # objects that yaml.load() would execute from a tampered config file.
    with open(options.configFile, 'r') as fh:
        config = yaml.safe_load(fh)
    # Get credentials from the environment.
    AWS_ACCESS_KEY_ID = os.environ['AWS_ACCESS_KEY_ID']
    # BUGFIX: the standard AWS environment variable is AWS_SECRET_ACCESS_KEY;
    # the original read AWS_SECRET_ACCESS_KEY_ID, which the usual AWS tooling
    # never sets, so this line raised KeyError in a normally-configured shell.
    AWS_SECRET_ACCESS_KEY = os.environ['AWS_SECRET_ACCESS_KEY']
    # A code for the user whose data we're collecting
    datasetCode = config['DatasetCode']
    # The period of time we want data for (dicts of datetime kwargs)
    startTime = datetime(**config['StartTime'])
    endTime = datetime(**config['EndTime'])
    # Define the stats we want from CloudWatch
    statistics = ["Average", "Minimum", "Maximum"]
    # Which services we'll pull from. NOTE: Only EC2 at the moment
    services = config['Services']
    for region in config['Regions']:
        print("Now working on region: %s ..." % region)
        # Connect to cloudwatch
        cwConn = cloudwatch.connect_to_region(
            region_name=region,
            aws_access_key_id=AWS_ACCESS_KEY_ID,
            aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
        for k, v in services.iteritems():
            print("Working on service: %s ..." % k)
            metricNames = v['Metrics']
            tag = v['Tag']
            if not tag:
                print("WARNING: No tag specified this might pull data from *many* servers.")
                # BUGFIX: do not shadow the builtin name `input`
                answer = raw_input("Continue? [y/n]: ")
                if answer not in ['y', 'yes', 'Y', 'Yes', 'YES']:
                    sys.exit(1)
            # Connect to service NOTE: EC2 specific for now
            ec2Conn = ec2.connect_to_region(
                region_name=region,
                aws_access_key_id=AWS_ACCESS_KEY_ID,
                aws_secret_access_key=AWS_SECRET_ACCESS_KEY)
            # REMOVE KEY AS THIS IS NOT HOW SMUGMUG USES THEM
            # NOTE(review): this deliberately makes the tag filter below dead
            # code, so every instance in the region is scanned.
            tag = None
            filters = {}
            if tag:
                filters["tag-key"] = tag
            # get_all_instances returns reservations, not instances
            reservations = ec2Conn.get_all_instances(filters=filters)
            instances = []
            for res in reservations:
                for instance in res.instances:
                    role = ''
                    labeled = False
                    for tagName, tagValue in instance.tags.iteritems():
                        # Collect the tier (role) the server is labeled as
                        if tagName == 'tier':
                            role = tagValue
                        # Use this instance only if it's labeled properly
                        if ':numenta:' in tagValue:
                            labeled = True
                    # BUGFIX: the original tested `instance not in instances`,
                    # but `instances` holds (instance, role) tuples, so the
                    # check never matched: a server carrying several
                    # ':numenta:' tags was appended multiple times, possibly
                    # with an empty role if the 'tier' tag iterated later.
                    # Decide once after scanning all tags.
                    if labeled:
                        instances.append((instance, role))
            # Loop over discovered instances
            for instance, role in instances:
                print("Retrieving data for Instance: %s ..." % instance.id)
                for metricName in metricNames:
                    print("Getting %s ..." % metricName)
                    metric = cwConn.list_metrics(
                        metric_name=metricName,
                        dimensions={"InstanceId": instance.id},
                        namespace="AWS/EC2")
                    if not metric:
                        continue
                    timeBlocks = genStartAndEndTimes(startTime, endTime)
                    result = []
                    # Break up 2 weeks into chunks below API limits
                    for fromDate, toDate in timeBlocks:
                        rawdata = metric[0].query(start_time=fromDate,
                                                  end_time=toDate,
                                                  statistics=statistics,
                                                  period=300)
                        if len(rawdata) == 0:
                            continue
                        # Sort by "Timestamp"
                        rawdata.sort(key=lambda row: row["Timestamp"])
                        # Append data to results
                        result.extend(rawdata)
                    # BUGFIX: guard against an empty result set; the original
                    # indexed result[0] unconditionally and raised IndexError
                    # whenever CloudWatch returned no datapoints at all.
                    if not result:
                        continue
                    # Write out results
                    outFileName = "%s_%s_%s_%s_%s.csv" % (datasetCode,
                                                          region,
                                                          role,
                                                          instance.id,
                                                          metricName)
                    with open(outFileName, 'w') as outFh:
                        writer = csv.writer(outFh)
                        # Headers come from the first row; every row from a
                        # single metric query shares the same keys.
                        writer.writerow(result[0].keys())
                        writer.writerows([row.values() for row in result])
def genStartAndEndTimes(start, end):
    '''
    Returns a list of tuples (start, end) covering [start, end] without gaps.

    start - datetime object - The start time of the full block
    end - datetime object - The end time of the full block

    Returns an empty list when start >= end; the final block is capped at
    `end` so the tuples never overshoot the requested interval.

    The full block of time will be broken up into smaller blocks to deal with
    the request limits of the CloudWatch CLI tool.

    That limit appears to be ~1,400 records / request but is not published. Since
    we are collecting records at 5 minute intervals this means we will produce
    blocks a maximum of 5 * 1400 minutes long.
    '''
    # NOTE(review): the original also defined an output-format string `fmt`
    # that was never used anywhere; it has been removed.
    # Max time delta (minutes) to stay under the ~1,400 record API limit
    dt = 5 * 1400
    partialEnd = start
    # The blocks of time we will return
    blocks = []
    while partialEnd < end:
        # Find our new endpoint, capped at the overall end time
        partialEnd = min(start + timedelta(minutes=dt), end)
        blocks.append((start, partialEnd))
        start = partialEnd
    return blocks
def verifyUserInput(options):
    '''
    Validate command line options; prints an error and exits with status 1
    when the required config file option is missing.
    '''
    # Guard clause: a config file path is the only required option.
    if options.configFile:
        return
    print("ERROR: -c is required to specify your config file.")
    sys.exit(1)
if __name__ == '__main__':
    # Build the command line parser for this tool.
    parser = optparse.OptionParser(description=desc)
    parser.add_option("-c", "--config",
                      dest="configFile",
                      help="The configuration file to use.")
    options, args = parser.parse_args()
    # Bail out early on malformed or invalid inputs, then run the collection.
    verifyUserInput(options)
    main(options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment