Created
October 13, 2020 16:39
-
-
Save vincentsarago/082ced9e6b1611997893cce6ae61e429 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import zlib | |
import json | |
from concurrent import futures | |
from urllib.parse import urlparse | |
import boto3 | |
import click | |
# Module-level S3 client, created once and reused by every get_object call in this script.
client = boto3.client("s3")
# Utility functions
def unzipInvent(gzip_buffer):
    """Decompress a gzip-compressed byte buffer and return it as text.

    OR-ing 16 into the window size makes zlib expect a gzip header and
    trailer around the deflate stream. The decompressed bytes are decoded
    with the default codec of ``bytes.decode``.
    """
    gzip_wbits = 16 | zlib.MAX_WBITS
    raw = zlib.decompress(gzip_buffer, gzip_wbits)
    return raw.decode()
# s3://sentinel-cogs-inventory/sentinel-cogs/sentinel-cogs/2020-10-12T00-00Z/manifest.json | |
def _sum_inventory_sizes(rows, size_index):
    """Add up the size column of the CSV rows of one inventory data file.

    Args:
        rows: iterable of CSV text lines, e.g.
            "sentinel-cogs","path/SCL.tif","1367123","2020-05-25T19:54:42.000Z"
        size_index: zero-based position of the size column in each row.

    Returns:
        The sum of the size column as an int. Blank lines and rows whose
        size field is empty contribute nothing.
    """
    # Function-scope import: the csv module handles quoted fields properly,
    # unlike stripping quotes and splitting on commas.
    import csv

    total = 0
    for fields in csv.reader(rows):
        if not fields:
            continue
        size = fields[size_index]
        if size:
            total += int(size)
    return total


@click.command()
@click.argument("manifest", type=str)
def main(manifest):
    """Print the summed object size listed by an S3 inventory.

    MANIFEST is the s3:// URL of the inventory manifest.json, e.g.
    s3://sentinel-cogs-inventory/sentinel-cogs/sentinel-cogs/2020-10-12T00-00Z/manifest.json
    """
    parsed = urlparse(manifest)
    bucket = parsed.netloc
    key = parsed.path.strip("/")

    # Read the manifest.
    click.echo("Reading manifest...")
    response = client.get_object(Bucket=bucket, Key=key)
    manifest_doc = json.loads(response["Body"].read())

    # Only gzip-compressed CSV data files can be parsed below; fail early
    # with a readable message instead of an obscure zlib error.
    file_format = manifest_doc.get("fileFormat")
    if isinstance(file_format, str) and file_format.upper() != "CSV":
        raise click.ClickException(
            f"Unsupported inventory file format: {file_format} (only CSV is supported)"
        )

    # The column order of the data files is described by the manifest's
    # "fileSchema" (e.g. "Bucket, Key, Size, LastModifiedDate"). Look up the
    # Size column there rather than assuming it is always the third one.
    schema = manifest_doc.get("fileSchema")
    if isinstance(schema, str) and schema.strip():
        columns = [name.strip() for name in schema.split(",")]
        if "Size" not in columns:
            raise click.ClickException(
                "The inventory schema has no Size column; cannot compute the total size"
            )
        size_index = columns.index("Size")
    else:
        # No schema available: keep the historical Bucket, Key, Size layout.
        size_index = 2

    # Get the list of data files (a missing "files" entry means none).
    keys = [entry["key"] for entry in manifest_doc.get("files") or []]
    click.echo(f"Found {len(keys)} data files")

    def worker(key):
        """Download one data file and return the sum of its size column."""
        response = client.get_object(Bucket=bucket, Key=key)
        lines = unzipInvent(response["Body"].read()).splitlines()
        return _sum_inventory_sizes(lines, size_index)

    concurrent = 10
    total_size = 0

    click.echo("Reading data files...")
    with futures.ThreadPoolExecutor(max_workers=concurrent) as executor:
        future_work = [executor.submit(worker, key) for key in keys]
        with click.progressbar(
            futures.as_completed(future_work),
            file=sys.stderr,
            length=len(keys),
            show_percent=True,
        ) as finished:
            # Each worker returns an int, so the grand total is a plain sum.
            # result() re-raises any exception raised inside the worker.
            for done in finished:
                total_size += done.result()

    click.echo(json.dumps(total_size, indent=4))
# Example:
#   python get_bucket_size.py s3://sentinel-cogs-inventory/sentinel-cogs/sentinel-cogs/2020-10-12T00-00Z/manifest.json
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment