arpchaudhary · January 27, 2019 13:59
diff --git a/cloudtrail-freq.py b/cloudtrail-freq.py
 #!/usr/bin/env python

 # This program finds the top n resources in a set of CloudTrail
 # log files. It expects a local copy of the logs, which you can create with:
 # aws s3 sync s3://your.s3.cloudtrail.path /your/local/directory
 #
 # This script also expects local AWS keys to be deployed on the machine it
 # runs on. Use it at your own risk.

 import argparse
 import json
 import os
 import gzip
 import boto3
 import logging

 def _split_s3_object_arn(arn):
 	"""Split an S3 object ARN into its bucket name and object key.

 	Args:
 		arn: ARN string taken from a log record, expected to look like
 			``arn:aws:s3:::bucket/key``.

 	Returns:
 		A ``(bucket_name, key)`` tuple, or ``None`` when ``arn`` has no
 		``:::`` separator or does not name both a bucket and a key (for
 		example the ``"N/A"`` placeholder or a bucket-only ARN).
 	"""
 	_, separator, obj_path = arn.partition(':::')
 	bucket_name, slash, key = obj_path.partition('/')
 	if not (separator and slash and bucket_name and key):
 		return None
 	return bucket_name, key


 def main():
 	"""Print the resources that consumed the most bandwidth per the given logs.

 	Command-line arguments (parsed with argparse):
 		logfiles: one or more log files; a ``.gz`` extension selects gzip.
 		--debug: enable debug logging.
 		-n/--number: maximum number of rows to print (default 100).
 		-f/--freq_barrier: minimum access count an ARN needs before its
 			bandwidth is computed (default 1000).

 	Every non-blank line of every log file is parsed as a JSON document with a
 	``Records`` array. The ARN of the first entry in each record's
 	``resources`` list is counted. For every ARN seen at least
 	``freq_barrier`` times, the object size is fetched with an S3 HEAD request
 	and multiplied by the access count to estimate bandwidth in whole MB.
 	ARNs that cannot be resolved to an S3 object are skipped with a warning
 	instead of aborting the report.
 	"""
 	parser = argparse.ArgumentParser(description='AWS Cloudtrail Object Frequency Counter')
 	parser.add_argument('logfiles', nargs='+', help='logfiles to process')
 	parser.add_argument("--debug", help='Print debug logs', action="store_true")
 	parser.add_argument("-n", "--number", type=int, default=100, help='Will show top n results')
 	parser.add_argument("-f", "--freq_barrier", type=int, default=1000, help='Assets below this frequency will not be considered for bandwidth calculation')
 	args = parser.parse_args()

 	if args.debug:
 		# Single-argument print() behaves the same on Python 2 and Python 3.
 		print("setting loglevel to debug")
 		logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)

 	logging.debug("Frequency barrier set to %d", args.freq_barrier)
 	logging.debug("Total records requested are top %d", args.number)

 	freq_counter = {}
 	bandwidth_counter = {}
 	record_counter = 0
 	file_counter = 0

 	for logfile in args.logfiles:
 		if not os.path.isfile(logfile):
 			continue
 		_, extension = os.path.splitext(logfile)

 		# Select relevant open method
 		file_opener = gzip.open if extension == ".gz" else open

 		with file_opener(logfile) as ifile:
 			for line in ifile:
 				# A trailing or stray blank line is not a JSON document.
 				if not line.strip():
 					continue
 				data = json.loads(line)

 				# In log, a single file would contain a log record array
 				for record in data['Records']:
 					# Truthiness also covers "resources": null.
 					resources = record.get('resources')
 					if resources:
 						arn = resources[0].get("ARN", "N/A")
 						freq_counter[arn] = freq_counter.get(arn, 0) + 1
 					record_counter += 1
 		file_counter += 1

 		logging.debug("Processed file %s. Total Records are %d", logfile, record_counter)
 	logging.info("Processed %d records over %d files", record_counter, file_counter)

 	# Created on first use, so runs where nothing reaches the barrier never
 	# need an S3 client.
 	s3 = None

 	# This should help us output the result in a tabular format
 	max_arn_size = 0
 	max_bandwidth = 0
 	max_frequency = 0

 	# Find the bandwidth of all assets that have been accessed at least args.freq_barrier times
 	logging.debug("Frequency process started")
 	for arn, frequency in freq_counter.items():
 		max_arn_size = max(max_arn_size, len(arn))

 		if frequency < args.freq_barrier:
 			continue
 		max_frequency = max(max_frequency, frequency)

 		location = _split_s3_object_arn(arn)
 		if location is None:
 			logging.warning("Skipping %s: not an S3 object ARN", arn)
 			continue
 		bucket_name, key = location

 		if s3 is None:
 			s3 = boto3.client('s3')
 			logging.debug("S3 client connected")

 		# Head call will just get the meta data of the object
 		# without actually making a call to download and transfer the whole object
 		try:
 			resp = s3.head_object(Bucket=bucket_name, Key=key)
 		except s3.exceptions.ClientError as error:
 			# For example the object has been deleted since it was logged;
 			# one unreachable object should not abort the whole report.
 			logging.warning("Skipping %s: HEAD request failed (%s)", arn, error)
 			continue

 		# Converts from Bytes to MB; floor division keeps whole MB on both
 		# Python 2 and Python 3.
 		bandwidth_counter[arn] = (resp['ContentLength'] * frequency) // 2**20
 		max_bandwidth = max(max_bandwidth, bandwidth_counter[arn])

 	logging.debug("Records left after frequecy barrier filtration are %d", len(bandwidth_counter))

 	# Formatted print for easy understanding
 	format_str = "{:<%d} {:<%d} {:<%d} MBs" % (
 		max_arn_size,
 		len(str(max_frequency)) + 2,
 		len(str(max_bandwidth)),
 	)
 	header = format_str.format("Resource Name", "Freq", "Bandwidth")
 	print(header)
 	print('-' * len(header))

 	# Highest bandwidth first, ties broken by ARN; a non-positive -n prints no rows.
 	ranked = sorted(bandwidth_counter.items(), key=lambda item: (item[1], item[0]), reverse=True)
 	for arn, bandwidth in ranked[:max(args.number, 0)]:
 		print(format_str.format(arn, freq_counter[arn], bandwidth))

 # Run the report only when this file is executed directly as a script,
 # not when it is imported as a module.
 if __name__ == "__main__":
 	main()
	#!/usr/bin/env python

	# This program finds the top n resources in a set of CloudTrail
	# log files. It expects a local copy of the logs, which you can create with:
	# aws s3 sync s3://your.s3.cloudtrail.path /your/local/directory
	#
	# This script also expects local AWS keys to be deployed on the machine it
	# runs on. Use it at your own risk.

	import argparse
	import json
	import os
	import gzip
	import boto3
	import logging

	def _split_s3_object_arn(arn):
		"""Split an S3 object ARN into its bucket name and object key.

		Args:
			arn: ARN string taken from a log record, expected to look like
				``arn:aws:s3:::bucket/key``.

		Returns:
			A ``(bucket_name, key)`` tuple, or ``None`` when ``arn`` has no
			``:::`` separator or does not name both a bucket and a key (for
			example the ``"N/A"`` placeholder or a bucket-only ARN).
		"""
		_, separator, obj_path = arn.partition(':::')
		bucket_name, slash, key = obj_path.partition('/')
		if not (separator and slash and bucket_name and key):
			return None
		return bucket_name, key


	def main():
		"""Print the resources that consumed the most bandwidth per the given logs.

		Command-line arguments (parsed with argparse):
			logfiles: one or more log files; a ``.gz`` extension selects gzip.
			--debug: enable debug logging.
			-n/--number: maximum number of rows to print (default 100).
			-f/--freq_barrier: minimum access count an ARN needs before its
				bandwidth is computed (default 1000).

		Every non-blank line of every log file is parsed as a JSON document with a
		``Records`` array. The ARN of the first entry in each record's
		``resources`` list is counted. For every ARN seen at least
		``freq_barrier`` times, the object size is fetched with an S3 HEAD request
		and multiplied by the access count to estimate bandwidth in whole MB.
		ARNs that cannot be resolved to an S3 object are skipped with a warning
		instead of aborting the report.
		"""
		parser = argparse.ArgumentParser(description='AWS Cloudtrail Object Frequency Counter')
		parser.add_argument('logfiles', nargs='+', help='logfiles to process')
		parser.add_argument("--debug", help='Print debug logs', action="store_true")
		parser.add_argument("-n", "--number", type=int, default=100, help='Will show top n results')
		parser.add_argument("-f", "--freq_barrier", type=int, default=1000, help='Assets below this frequency will not be considered for bandwidth calculation')
		args = parser.parse_args()

		if args.debug:
			# Single-argument print() behaves the same on Python 2 and Python 3.
			print("setting loglevel to debug")
			logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)

		logging.debug("Frequency barrier set to %d", args.freq_barrier)
		logging.debug("Total records requested are top %d", args.number)

		freq_counter = {}
		bandwidth_counter = {}
		record_counter = 0
		file_counter = 0

		for logfile in args.logfiles:
			if not os.path.isfile(logfile):
				continue
			_, extension = os.path.splitext(logfile)

			# Select relevant open method
			file_opener = gzip.open if extension == ".gz" else open

			with file_opener(logfile) as ifile:
				for line in ifile:
					# A trailing or stray blank line is not a JSON document.
					if not line.strip():
						continue
					data = json.loads(line)

					# In log, a single file would contain a log record array
					for record in data['Records']:
						# Truthiness also covers "resources": null.
						resources = record.get('resources')
						if resources:
							arn = resources[0].get("ARN", "N/A")
							freq_counter[arn] = freq_counter.get(arn, 0) + 1
						record_counter += 1
			file_counter += 1

			logging.debug("Processed file %s. Total Records are %d", logfile, record_counter)
		logging.info("Processed %d records over %d files", record_counter, file_counter)

		# Created on first use, so runs where nothing reaches the barrier never
		# need an S3 client.
		s3 = None

		# This should help us output the result in a tabular format
		max_arn_size = 0
		max_bandwidth = 0
		max_frequency = 0

		# Find the bandwidth of all assets that have been accessed at least args.freq_barrier times
		logging.debug("Frequency process started")
		for arn, frequency in freq_counter.items():
			max_arn_size = max(max_arn_size, len(arn))

			if frequency < args.freq_barrier:
				continue
			max_frequency = max(max_frequency, frequency)

			location = _split_s3_object_arn(arn)
			if location is None:
				logging.warning("Skipping %s: not an S3 object ARN", arn)
				continue
			bucket_name, key = location

			if s3 is None:
				s3 = boto3.client('s3')
				logging.debug("S3 client connected")

			# Head call will just get the meta data of the object
			# without actually making a call to download and transfer the whole object
			try:
				resp = s3.head_object(Bucket=bucket_name, Key=key)
			except s3.exceptions.ClientError as error:
				# For example the object has been deleted since it was logged;
				# one unreachable object should not abort the whole report.
				logging.warning("Skipping %s: HEAD request failed (%s)", arn, error)
				continue

			# Converts from Bytes to MB; floor division keeps whole MB on both
			# Python 2 and Python 3.
			bandwidth_counter[arn] = (resp['ContentLength'] * frequency) // 2**20
			max_bandwidth = max(max_bandwidth, bandwidth_counter[arn])

		logging.debug("Records left after frequecy barrier filtration are %d", len(bandwidth_counter))

		# Formatted print for easy understanding
		format_str = "{:<%d} {:<%d} {:<%d} MBs" % (
			max_arn_size,
			len(str(max_frequency)) + 2,
			len(str(max_bandwidth)),
		)
		header = format_str.format("Resource Name", "Freq", "Bandwidth")
		print(header)
		print('-' * len(header))

		# Highest bandwidth first, ties broken by ARN; a non-positive -n prints no rows.
		ranked = sorted(bandwidth_counter.items(), key=lambda item: (item[1], item[0]), reverse=True)
		for arn, bandwidth in ranked[:max(args.number, 0)]:
			print(format_str.format(arn, freq_counter[arn], bandwidth))

	# Run the report only when this file is executed directly as a script.
	# The call is indented under the guard; the flattened original was not valid Python.
	if __name__ == '__main__':
		main()