@nh13
Created February 26, 2018 20:08
Summarize the sizes of S3 path prefixes
#!/usr/bin/env python3
import sys
import argparse
from pathlib import Path
from collections import OrderedDict
import math
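
# Unit names used in "aws s3 ls --human-readable" output, mapped to their size in bytes.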
units_and_size = [
    ('Bytes', 1),
    ('KiB', 1024),
    ('MiB', 1024**2),
    ('GiB', 1024**3),
    ('TiB', 1024**4),
    ('PiB', 1024**5),
]
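# Also accept the singular 'Byte' when looking up unit names.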
to_bytes = OrderedDict(units_and_size + [('Byte', 1)])
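
# Format a byte count as a human-readable string, e.g. 1536 -> "1.5 KiB".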
def bytes_to_human(b):
    idx = 0
    # Scale down until the value fits the largest available unit.
    while 1024 <= b and idx < len(units_and_size) - 1:
        idx += 1
        b = b / 1024.0
    return f'{b:.1f} {units_and_size[idx][0]}'
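
# Print the size report for one bucket: prefixes grouped by depth (deepest first),
# followed by the bucket's total size.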
def print_bucket_data(bucket, bucket_data, total_size, min_size=1, line_length=60):
    def print_str(s):
        # Center the string in a line of dashes of the given total length.
        num_dashes = (line_length - len(s) - 2) / 2
        print('-' * math.floor(num_dashes) + f' {s} ' + '-' * math.ceil(num_dashes))

    print('-' * line_length)
    print_str(bucket)
    print('-' * line_length)
    print()

    def sort_key(item):
        '''Longer paths first, and then smaller sizes first.'''
        k, v = item
        return (-len(Path(k).parents), v)

    level = None
    for k, v in sorted(bucket_data.items(), key=sort_key):
        if v < min_size:
            continue
        # Start a new section whenever the prefix depth decreases.
        if level is None or level > len(k.parents):
            if level is not None:
                print('')
            level = len(k.parents)
            print_str(str(level))
        print(f'{bytes_to_human(v):<12s} : s3://{bucket}/{k}')
    print(f'{bytes_to_human(total_size):<12s} : s3://{bucket}')
    print()
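
# Parse the command line, aggregate object sizes into every parent prefix for each
# listing file, and print one report per bucket.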
def main(args=None):
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bucket-file', help='One or more files containing the output of aws s3 ls --summarize --human-readable --recursive.', nargs='+', type=Path, required=True)
    parser.add_argument('-d', '--max-depth', help='The maximum depth to display', default=math.inf, type=int)
    parser.add_argument('-m', '--min-size', help='The minimum size to display', default='1 Bytes', type=str)
    parser.add_argument('-s', '--stop-after', help='Stop after this number of objects', default=math.inf, type=int)
    args = parser.parse_args(args)

    max_depth = args.max_depth
    min_size_value, min_size_units = args.min_size.split()
    min_size = float(min_size_value) * to_bytes[min_size_units]

    for bucket_path in args.bucket_file:
        # The bucket name is taken from the listing file's name, minus its extension.
        bucket = bucket_path.with_suffix('').name
        with bucket_path.open('r') as fh:
            bucket_data = {}
            total_size = 0
            for i, line in enumerate(fh):
                line = line.rstrip('\r\n')
                # A blank line marks the start of the --summarize totals; stop there.
                if line == '':
                    break
                try:
                    date, time, value, units, path = line.split(maxsplit=4)
                except ValueError as e:
                    sys.stderr.write(f'Error: on line {i+1}: {line}\n')
                    raise e
                units = to_bytes[units]
                size = float(value) * units
                # Attribute the object's size to the object itself and every parent prefix.
                paths = [path] + list(Path(path).parents)[:-1]
                total_size += size
                if 0 < max_depth and max_depth != math.inf:
                    # Keep only the shallowest max_depth entries.
                    start_idx = len(paths) - max_depth
                    if start_idx < 0:
                        start_idx = 0
                    paths = paths[start_idx:]
                for path in paths:
                    path = Path(path)
                    if path not in bucket_data:
                        bucket_data[path] = size
                    else:
                        bucket_data[path] = bucket_data[path] + size
                if args.stop_after <= i:
                    break
        print_bucket_data(bucket=bucket, bucket_data=bucket_data, total_size=total_size, min_size=min_size)
if __name__ == '__main__':
    main()
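
For reference, a minimal usage sketch (the bucket name, listing file name, and script file name below are illustrative, not part of the gist; the script derives the bucket name from the listing file's name, so the listing for my-bucket is saved as my-bucket.txt):

# Capture a recursive, human-readable listing of the bucket, including the summary.
aws s3 ls s3://my-bucket --recursive --human-readable --summarize > my-bucket.txt

# Summarize prefix sizes down to two levels, hiding prefixes smaller than 1 GiB.
python3 summarize_s3_prefixes.py --bucket-file my-bucket.txt --max-depth 2 --min-size '1 GiB'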