Created February 26, 2018 20:08
Summarize the sizes of S3 path prefixes from the output of aws s3 ls --summarize --human-readable --recursive.
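The listing files are produced with the AWS CLI. For example, assuming a bucket named my-bucket (a placeholder), the input can be captured with:

    aws s3 ls --summarize --human-readable --recursive s3://my-bucket > my-bucket.txt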
#!/usr/bin/env python3
import sys
import argparse
from pathlib import Path
from collections import OrderedDict
import math


# Units as printed by `aws s3 ls --human-readable`, with their sizes in bytes.
units_and_size = [
    ('Bytes', 1),
    ('KiB', 1024),
    ('MiB', 1024**2),
    ('GiB', 1024**3),
    ('TiB', 1024**4),
    ('PiB', 1024**5),
]
# Map a unit name to its size in bytes; 'Byte' (singular) covers one-byte objects.
to_bytes = OrderedDict(units_and_size + [('Byte', 1)])


def bytes_to_human(b):
    '''Format a byte count using the largest unit that keeps the value below 1024.'''
    idx = 0
    while b >= 1024:
        idx += 1
        b = b / 1024.0
    return f'{b:.1f} {units_and_size[idx][0]}'


def print_bucket_data(bucket, bucket_data, total_size, min_size=1, line_length=60):
    '''Print the size of each prefix in a bucket, grouped by depth (deepest first).'''
    def print_str(s):
        # Center the string within a line of dashes of the given length.
        num_dashes = (line_length - len(s) - 2) / 2
        print('-' * math.floor(num_dashes) + f' {s} ' + '-' * math.ceil(num_dashes))

    print('-' * line_length)
    print_str(bucket)
    print('-' * line_length)
    print()

    def sort_key(item):
        '''Longer paths first, and then smaller sizes first.'''
        k, v = item
        return (-len(k.parents), v)

    level = None
    for k, v in sorted(bucket_data.items(), key=sort_key):
        if v < min_size:
            continue
        # Emit a depth header each time we move up to a shallower level.
        if level is None or level > len(k.parents):
            if level is not None:
                print()
            level = len(k.parents)
            print_str(str(level))
        print(f'{bytes_to_human(v):<12s} : s3://{bucket}/{k}')
    print(f'{bytes_to_human(total_size):<12s} : s3://{bucket}')
    print()


def main(args=None):
    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser()
    parser.add_argument('-b', '--bucket-file', help='One or more files containing the output of aws s3 ls --summarize --human-readable --recursive.', nargs='+', type=Path, required=True)
    parser.add_argument('-d', '--max-depth', help='The maximum depth to display.', default=math.inf, type=int)
    parser.add_argument('-m', '--min-size', help='The minimum size to display, e.g. "10 MiB".', default='1 Bytes', type=str)
    parser.add_argument('-s', '--stop-after', help='Stop after this number of objects.', default=math.inf, type=int)
    args = parser.parse_args(args)  # parse the given args rather than re-reading sys.argv
    max_depth = args.max_depth
    min_size_value, min_size_units = args.min_size.split()
    min_size = float(min_size_value) * to_bytes[min_size_units]
    for bucket_path in args.bucket_file:
        # The bucket name is the input file's name with its extension dropped.
        bucket = bucket_path.with_suffix('').name
        with bucket_path.open('r') as fh:
            bucket_data = {}
            total_size = 0
            for i, line in enumerate(fh):
                line = line.rstrip('\r\n')
                if line == '':
                    break  # the blank line precedes the --summarize totals
                try:
                    date, time, value, units, path = line.split(maxsplit=4)
                except ValueError as e:
                    sys.stderr.write(f'Error on line {i+1}: {line}\n')
                    raise e
                size = float(value) * to_bytes[units]
                # The object's own path plus every ancestor prefix (dropping '.').
                paths = [Path(path)] + list(Path(path).parents)[:-1]
                total_size += size
                if 0 < max_depth and max_depth != math.inf:
                    # Keep only the shallowest prefixes, i.e. those at depth at most max_depth.
                    start_idx = max(len(paths) - max_depth, 0)
                    paths = paths[start_idx:]
                # Credit this object's size to its own path and each kept ancestor prefix.
                for path in paths:
                    bucket_data[path] = bucket_data.get(path, 0) + size
                if args.stop_after <= i:
                    break
        print_bucket_data(bucket=bucket, bucket_data=bucket_data, total_size=total_size, min_size=min_size)


if __name__ == '__main__':
    main()
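A usage sketch, assuming the script above is saved as summarize_s3.py (a placeholder name) and fed the listing captured earlier:

    python summarize_s3.py -b my-bucket.txt -d 3 -m '10 MiB'

This prints the cumulative size of every prefix at depth 3 or less that holds at least 10 MiB; the bucket name in the report comes from the input file's name with its extension dropped, so my-bucket.txt is reported as s3://my-bucket.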