@arpchaudhary
Created January 27, 2019 13:59
Utility script for getting Amazon S3 usage from CloudTrail logs
#!/usr/bin/env python3
# This script finds the top n S3 resources in a set of CloudTrail
# log files. It expects a local copy of the logs, which you can fetch with:
# aws s3 sync s3://your.s3.cloudtrail.path /your/local/directory
#
# The script also expects AWS credentials to be configured on the machine
# running it. Use at your own risk.
import argparse
import json
import os
import gzip
import boto3
import logging

def main():
    parser = argparse.ArgumentParser(description='AWS CloudTrail Object Frequency Counter')
    parser.add_argument('logfiles', nargs='+', help='logfiles to process')
    parser.add_argument("--debug", help='Print debug logs', action="store_true")
    parser.add_argument("-n", "--number", type=int, default=100, help='Show the top n results')
    parser.add_argument("-f", "--freq_barrier", type=int, default=1000,
                        help='Assets below this frequency will not be considered for bandwidth calculation')
    args = parser.parse_args()

    if args.debug:
        print("setting loglevel to debug")
        logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)
    logging.debug("Frequency barrier set to %d", args.freq_barrier)
    logging.debug("Total records requested are top %d", args.number)
    freq_counter = {}
    bandwidth_counter = {}
    record_counter = 0
    file_counter = 0
    for logfile in args.logfiles:
        if os.path.isfile(logfile):
            _, extension = os.path.splitext(logfile)
            # Select the relevant open method (CloudTrail delivers logs gzipped)
            file_opener = gzip.open if extension == ".gz" else open
            with file_opener(logfile, 'rt') as ifile:
                for line in ifile:
                    data = json.loads(line)
                    # A single log file contains an array of log records
                    for record in data['Records']:
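                        # For S3 data events, a record's "resources" field looks
                        # roughly like this (abridged, illustrative ARNs; order may vary):
                        #   "resources": [
                        #     {"type": "AWS::S3::Object", "ARN": "arn:aws:s3:::my-bucket/path/to/key"},
                        #     {"type": "AWS::S3::Bucket", "ARN": "arn:aws:s3:::my-bucket"}
                        #   ]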
                        resources = record.get('resources', [])
                        if len(resources) > 0:
                            arn = resources[0].get("ARN", "N/A")
                            freq_counter[arn] = freq_counter.get(arn, 0) + 1
                        record_counter += 1
            file_counter += 1
            logging.debug("Processed file %s. Total Records are %d", logfile, record_counter)

    logging.info("Processed %d records over %d files", record_counter, file_counter)
    s3 = boto3.client('s3')
    logging.debug("S3 client connected")

    # Track column widths so we can output the result in a tabular format
    max_arn_size = 0
    max_bandwidth = 0
    max_frequency = 0

    # Find the bandwidth of all assets accessed at least args.freq_barrier times
    logging.debug("Frequency process started")
    for arn in freq_counter:
        max_arn_size = max(max_arn_size, len(arn))
        if freq_counter[arn] >= args.freq_barrier:
            max_frequency = max(max_frequency, freq_counter[arn])
            # Guard against malformed or bucket-only ARNs that lack an object path
            if ':::' not in arn:
                continue
            _, obj_path = arn.split(':::', 1)
            if obj_path and '/' in obj_path:
                bucket_name, key = obj_path.split('/', 1)
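                # e.g. "arn:aws:s3:::my-bucket/some/key" (illustrative) splits into
                # bucket_name "my-bucket" and key "some/key"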
                # A HEAD call fetches only the metadata of the object,
                # without actually downloading and transferring the whole object
                resp = s3.head_object(Bucket=bucket_name, Key=key)
                # Bandwidth estimate: object size times access count, converted from bytes to MB
                bandwidth_counter[arn] = (resp['ContentLength'] * freq_counter[arn]) // 2**20
                max_bandwidth = max(max_bandwidth, bandwidth_counter[arn])
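                # e.g. a 5 MiB object (ContentLength 5242880) seen 2000 times
                # yields 5242880 * 2000 // 2**20 = 10000 MB of estimated transfer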
logging.debug("Records left after frequecy barrier filtration are %d", len(bandwidth_counter))
#Formatted print for easy understanding
format_str = "{:<" + str(max_arn_size) + "} {:<" + str(len(str(max_frequency)) + 2) + "} {:<" + str(len(str(max_bandwidth))) + "} MBs"
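    # With illustrative widths (max_arn_size=40, max_frequency=2000, max_bandwidth=10000)
    # format_str evaluates to "{:<40} {:<6} {:<5} MBs"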
    header = format_str.format("Resource Name", "Freq", "Bandwidth")
    print(header)
    print('-' * len(header))

    # Print the top n entries, sorted by estimated bandwidth (descending)
    top_count = 0
    for arn, bandwidth in sorted(bandwidth_counter.items(), key=lambda kv: (kv[1], kv[0]), reverse=True):
        print(format_str.format(arn, freq_counter[arn], bandwidth))
        top_count += 1
        if top_count >= args.number:
            break

if __name__ == '__main__':
    main()
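
# Example usage (hypothetical bucket and paths):
#   aws s3 sync s3://my-cloudtrail-bucket/AWSLogs/ ./ct_logs/
#   python3 cloudtrail_s3_usage.py -n 50 -f 500 ./ct_logs/*.json.gz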