Created
January 27, 2019 13:59
-
-
Save arpchaudhary/5406ea1b121f8552d873f11086c3f0c9 to your computer and use it in GitHub Desktop.
Utility file for getting Amazon S3 Usage from Cloudtrail logs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# This program finds the top n resources from a set of CloudTrail
# log files. It expects a local copy of the logs, which you can create with:
# aws s3 sync s3://your.s3.cloudtrail.path /your/local/directory
#
# This script also expects local AWS credentials to be deployed on the running machine.
# Use this at your own risk.
import argparse
import gzip
import json
import logging
import os

import boto3
def _count_resource_frequencies(logfiles):
    """Tally how often each resource ARN appears in the given log files.

    Every non-blank line of a file is parsed as one JSON document whose
    ``Records`` array holds the events. Only the first entry of a record's
    ``resources`` list is counted; a resource without an ``ARN`` key is
    tallied under ``"N/A"``. Files ending in ``.gz`` are read through gzip.

    Args:
        logfiles: Iterable of paths. Paths that are not regular files are
            skipped with a warning instead of being silently ignored.

    Returns:
        A ``(freq_counter, record_counter, file_counter)`` tuple: the
        ARN -> occurrence count mapping, the number of records seen (with or
        without resources) and the number of files read.

    Raises:
        ValueError: if a non-blank line is not valid JSON.
    """
    freq_counter = {}
    record_counter = 0
    file_counter = 0
    for logfile in logfiles:
        if not os.path.isfile(logfile):
            logging.warning("Skipping %s: not a regular file", logfile)
            continue
        _, extension = os.path.splitext(logfile)
        # Select the relevant open method; both are opened in binary mode so
        # the bytes -> text step below is identical for plain and gzip files.
        file_opener = gzip.open if extension == ".gz" else open
        with file_opener(logfile, 'rb') as ifile:
            for line in ifile:
                if not line.strip():
                    continue
                data = json.loads(line.decode('utf-8'))
                # A file without a 'Records' array contributes nothing
                # rather than aborting the whole run with a KeyError.
                for record in data.get('Records', []):
                    record_counter += 1
                    # 'resources' may be absent or null on a record.
                    resources = record.get('resources') or []
                    if resources:
                        arn = resources[0].get("ARN", "N/A")
                        freq_counter[arn] = freq_counter.get(arn, 0) + 1
        file_counter += 1
        logging.debug("Processed file %s. Total Records are %d", logfile, record_counter)
    return freq_counter, record_counter, file_counter


def _parse_s3_object_arn(arn):
    """Split an ``arn:<partition>:s3:::<bucket>/<key>`` ARN into ``(bucket, key)``.

    Returns ``None`` for anything else: the ``"N/A"`` placeholder, ARNs of
    other services, and bucket-only ARNs that carry no object key.
    """
    prefix, separator, obj_path = arn.partition(':::')
    if not separator or not prefix.endswith(':s3'):
        return None
    bucket_name, slash, key = obj_path.partition('/')
    if not (bucket_name and slash and key):
        return None
    return bucket_name, key


def _estimate_bandwidth(freq_counter, freq_barrier):
    """Estimate the transfer volume of every frequently accessed S3 object.

    For each ARN seen at least ``freq_barrier`` times, the object size is
    fetched with a HEAD request and multiplied by the access count.

    Args:
        freq_counter: Mapping of ARN -> number of occurrences.
        freq_barrier: Minimum occurrence count for an ARN to be considered.

    Returns:
        Mapping of ARN -> estimated bandwidth in whole MB (floor division, so
        totals under 1 MB are reported as 0). ARNs that are not S3 objects,
        or whose HEAD request fails, are logged and left out.
    """
    bandwidth_counter = {}
    s3 = None
    for arn, frequency in freq_counter.items():
        if frequency < freq_barrier:
            continue
        location = _parse_s3_object_arn(arn)
        if location is None:
            logging.warning("Skipping %s: not an S3 object ARN", arn)
            continue
        bucket_name, key = location
        if s3 is None:
            # Created on first use so a run with nothing to look up does not
            # need an S3 client at all.
            s3 = boto3.client('s3')
            logging.debug("S3 client connected")
        try:
            # Head call will just get the meta data of the object
            # without actually downloading and transferring the whole object.
            resp = s3.head_object(Bucket=bucket_name, Key=key)
        except s3.exceptions.ClientError as err:
            # One failing object (deleted, access denied, ...) must not
            # abort the whole report.
            logging.warning("Skipping %s: head_object failed (%s)", arn, err)
            continue
        # Converts from Bytes to MB.
        bandwidth_counter[arn] = (resp['ContentLength'] * frequency) // 2**20
    return bandwidth_counter


def _print_report(bandwidth_counter, freq_counter, limit):
    """Print the ``limit`` highest-bandwidth resources as an aligned table.

    Rows are ordered by bandwidth, then ARN, both descending. Column widths
    cover the header labels as well as the printed values, so the table stays
    aligned even when there are no rows.
    """
    ranked = sorted(bandwidth_counter.items(),
                    key=lambda item: (item[1], item[0]), reverse=True)
    rows = [(arn, freq_counter[arn], bandwidth)
            for arn, bandwidth in ranked[:max(limit, 0)]]

    labels = ("Resource Name", "Freq", "Bandwidth")
    arn_width = max([len(labels[0])] + [len(row[0]) for row in rows])
    freq_width = max([len(labels[1])] + [len(str(row[1])) for row in rows]) + 2
    bandwidth_width = max([len(labels[2])] + [len(str(row[2])) for row in rows])
    format_str = "{:<%d} {:<%d} {:<%d} MBs" % (arn_width, freq_width, bandwidth_width)

    header = format_str.format(*labels)
    print(header)
    print('-' * len(header))
    for row in rows:
        print(format_str.format(*row))


def main(argv=None):
    """Report the S3 objects responsible for the most estimated bandwidth.

    Parses the command line, counts resource ARNs across the given CloudTrail
    log files, looks up the size of every object accessed at least
    ``--freq_barrier`` times, and prints the top ``--number`` objects ranked
    by ``size * access count``.

    Args:
        argv: Optional list of command-line arguments. Defaults to ``None``,
            in which case argparse reads ``sys.argv[1:]`` as before.
    """
    parser = argparse.ArgumentParser(description='AWS Cloudtrail Object Frequency Counter')
    parser.add_argument('logfiles', nargs='+', help='logfiles to process')
    parser.add_argument("--debug", help='Print debug logs', action="store_true")
    parser.add_argument("-n", "--number", type=int, default=100, help='Will show top n results')
    parser.add_argument("-f", "--freq_barrier", type=int, default=1000, help='Assets below this frequency will not be considered for bandwidth calculation')
    args = parser.parse_args(argv)

    if args.debug:
        print("setting loglevel to debug")
    # Always configure logging: without a configured level the INFO summary
    # below is never shown.
    logging.basicConfig(format='%(asctime)s %(message)s',
                        level=logging.DEBUG if args.debug else logging.INFO)
    logging.debug("Frequency barrier set to %d", args.freq_barrier)
    logging.debug("Total records requested are top %d", args.number)

    freq_counter, record_counter, file_counter = _count_resource_frequencies(args.logfiles)
    logging.info("Processed %d records over %d files", record_counter, file_counter)

    # Find the bandwidth of all assets that have been accessed at least
    # args.freq_barrier times.
    logging.debug("Frequency process started")
    bandwidth_counter = _estimate_bandwidth(freq_counter, args.freq_barrier)
    logging.debug("Records left after frequecy barrier filtration are %d", len(bandwidth_counter))

    # Formatted print for easy understanding.
    _print_report(bandwidth_counter, freq_counter, args.number)
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment