Skip to content

Instantly share code, notes, and snippets.

@vincentsarago
Created October 13, 2020 16:39
Show Gist options
  • Save vincentsarago/082ced9e6b1611997893cce6ae61e429 to your computer and use it in GitHub Desktop.
Save vincentsarago/082ced9e6b1611997893cce6ae61e429 to your computer and use it in GitHub Desktop.
import csv
import json
import sys
import zlib
from concurrent import futures
from urllib.parse import urlparse

import boto3
import click
# Single S3 client shared by the manifest read and every data-file download.
client = boto3.client("s3")
# Utility functions
def unzipInvent(gzip_buffer):
    """Decompress a gzip payload and return its content as text.

    Args:
        gzip_buffer: Raw bytes of a gzip file. The payload may consist of
            several gzip members concatenated back to back.

    Returns:
        The decompressed content of all members, decoded as UTF-8.

    Raises:
        zlib.error: If the data is not valid gzip, is empty, or a member is
            truncated.
        UnicodeDecodeError: If the decompressed bytes are not valid UTF-8.
    """
    gzip_magic = b"\x1f\x8b"
    chunks = []
    remaining = gzip_buffer
    while True:
        # MAX_WBITS | 16 makes zlib expect a gzip header and trailer.
        member = zlib.decompressobj(zlib.MAX_WBITS | 16)
        chunks.append(member.decompress(remaining))
        if not member.eof:
            raise zlib.error("incomplete or truncated gzip stream")
        remaining = member.unused_data
        # A one-shot zlib.decompress() stops after the first member and
        # silently drops whatever follows. Keep decoding while another gzip
        # member follows; trailing bytes that are not gzip are ignored.
        if not remaining.startswith(gzip_magic):
            break
    # Decode once at the end so multi-byte characters that straddle a member
    # boundary are handled correctly.
    return b"".join(chunks).decode()
# s3://sentinel-cogs-inventory/sentinel-cogs/sentinel-cogs/2020-10-12T00-00Z/manifest.json
def _size_column_index(manifest_doc):
    """Return the zero-based position of the ``Size`` column in the CSV data files."""
    schema = manifest_doc.get("fileSchema")
    if not schema:
        # No schema in the manifest: keep the historical assumption of
        # "Bucket, Key, Size, ..." rows.
        return 2
    columns = [name.strip() for name in schema.split(",")]
    try:
        return columns.index("Size")
    except ValueError:
        raise click.ClickException(
            f"Inventory schema has no 'Size' column: {schema!r}"
        ) from None


def _sum_object_sizes(lines, size_index):
    """Add up the size field of every CSV inventory row in *lines*."""
    total = 0
    for row in csv.reader(lines):
        # Skip blank lines and rows whose size field is empty.
        if not row or not row[size_index]:
            continue
        total += int(row[size_index])
    return total


@click.command()
@click.argument("manifest", type=str)
def main(manifest):
    """Print the total size in bytes of the objects listed in an S3 Inventory.

    MANIFEST is the s3:// URL of the inventory's manifest.json file. The data
    files it lists are read from the same bucket as the manifest.
    """
    parsed = urlparse(manifest)
    bucket = parsed.netloc
    manifest_key = parsed.path.strip("/")
    if not bucket or not manifest_key:
        raise click.BadParameter(
            "expected an S3 URL such as s3://bucket/path/manifest.json",
            param_hint="manifest",
        )

    # Read the manifest and collect the keys of the data files it lists.
    click.echo("Reading manifest...")
    response = client.get_object(Bucket=bucket, Key=manifest_key)
    manifest_doc = json.loads(response["Body"].read())
    data_keys = [entry["key"] for entry in manifest_doc.get("files") or []]
    size_index = _size_column_index(manifest_doc)
    click.echo(f"Found {len(data_keys)} data files")

    def worker(data_key):
        """Download one data file and return the summed size of its rows."""
        data_response = client.get_object(Bucket=bucket, Key=data_key)
        lines = unzipInvent(data_response["Body"].read()).splitlines()
        # e.g: "sentinel-cogs","sentinel-s2-l2a-cogs/2019/S2B_29NNG_20190819_0_L2A/SCL.tif","1367123","2020-05-25T19:54:42.000Z"
        return _sum_object_sizes(lines, size_index)

    max_workers = 10
    click.echo("Reading data files...")
    total_size = 0
    with futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        pending = [executor.submit(worker, data_key) for data_key in data_keys]
        with click.progressbar(
            futures.as_completed(pending),
            file=sys.stderr,
            length=len(data_keys),
            show_percent=True,
        ) as completed:
            # Each worker returns an integer subtotal; accumulate it as soon
            # as the download finishes. result() re-raises worker errors.
            for done in completed:
                total_size += done.result()

    click.echo(json.dumps(total_size, indent=4))
# e.g. python get_bucket_size.py s3://sentinel-cogs-inventory/sentinel-cogs/sentinel-cogs/2020-10-12T00-00Z/manifest.json
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment