Last active
October 6, 2023 18:40
-
-
Save alukach/1a2b8b6366410fb94fa5cee7f72ee304 to your computer and use it in GitHub Desktop.
Parsing S3 Inventory results in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
""" | |
A utility to stream records from one or many S3 Inventory reports, with a progress bar. | |
./parse-inventory-progress s3://my-bucket/path/to/my/inventory/2019-12-15T00-00Z/manifest.json > out.csv | |
""" | |
import json | |
import csv | |
import gzip | |
import sys | |
import urllib.parse | |
import boto3 | |
from tqdm import tqdm | |
s3 = boto3.resource('s3') | |
def list_keys(bucket, manifest_key):
    """Yield every CSV row from the data files listed in an inventory manifest.

    The manifest is fetched from ``bucket`` at ``manifest_key`` and parsed as
    JSON. Each entry of its ``files`` list is then read from the same bucket
    as a gzip-compressed CSV file and streamed row by row, so no data file is
    ever held fully in memory.

    Args:
        bucket: Name of the bucket holding the manifest and its data files.
        manifest_key: Object key of the ``manifest.json`` file.

    Yields:
        One list of strings per CSV row, in manifest order.
    """
    manifest_body = s3.Object(bucket, manifest_key).get()['Body']
    try:
        manifest = json.load(manifest_body)
    finally:
        # Close the response body explicitly instead of waiting for GC.
        manifest_body.close()

    for obj in manifest['files']:
        body = s3.Object(bucket_name=bucket, key=obj['key']).get()['Body']
        try:
            # Decode explicitly as UTF-8 rather than the locale default, and
            # pass newline='' so the csv module sees line endings unmodified.
            with gzip.open(
                body, mode='rt', encoding='utf-8', newline=''
            ) as buffer:
                yield from csv.reader(buffer)
        finally:
            # gzip does not close a file object it was handed, so do it here;
            # this also runs if the consumer abandons the generator early.
            body.close()
if __name__ == '__main__':
    # Call with one or more S3 URLs; rows are written to stdout as CSV, e.g.
    #   ./parse-inventory-progress s3://my-bucket/path/to/my/inventory/2019-12-15T00-00Z/manifest.json > out.csv
    # Use a real CSV writer so fields containing spaces, commas or quotes
    # survive the round trip (plain print() would emit space-separated text).
    writer = csv.writer(sys.stdout, lineterminator='\n')
    for s3_url in sys.argv[1:]:
        url = urllib.parse.urlparse(s3_url)
        rows = tqdm(
            # netloc keeps the bucket name exactly as typed; hostname would
            # lower-case it.
            list_keys(url.netloc, url.path.lstrip('/')),
            desc=s3_url,
            dynamic_ncols=True,
        )
        for row in rows:
            # csv.reader yields [] for blank lines; skip them rather than
            # failing or emitting empty records.
            if row:
                writer.writerow(row)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3 | |
""" | |
A utility to stream records from one or many S3 Inventory reports. | |
./parse-inventory s3://my-bucket/path/to/my/inventory/2019-12-15T00-00Z/manifest.json > out.csv | |
""" | |
import json | |
import csv | |
import gzip | |
import sys | |
import urllib.parse | |
import boto3 | |
s3 = boto3.resource('s3') | |
def list_keys(bucket, manifest_key):
    """Stream the CSV rows of every data file referenced by a manifest.

    Loads the JSON manifest stored at ``manifest_key`` in ``bucket``, then,
    for each entry in its ``files`` list, opens that object from the same
    bucket as gzip-compressed CSV and yields its rows one at a time.

    Args:
        bucket: Name of the bucket holding the manifest and its data files.
        manifest_key: Object key of the ``manifest.json`` file.

    Yields:
        One list of strings per CSV row, in manifest order.
    """
    manifest_body = s3.Object(bucket, manifest_key).get()['Body']
    try:
        manifest = json.load(manifest_body)
    finally:
        # Close the response body explicitly instead of waiting for GC.
        manifest_body.close()

    for obj in manifest['files']:
        body = s3.Object(bucket_name=bucket, key=obj['key']).get()['Body']
        try:
            # Decode explicitly as UTF-8 rather than the locale default, and
            # pass newline='' so the csv module sees line endings unmodified.
            with gzip.open(
                body, mode='rt', encoding='utf-8', newline=''
            ) as buffer:
                yield from csv.reader(buffer)
        finally:
            # gzip does not close a file object it was handed, so do it here;
            # this also runs if the consumer abandons the generator early.
            body.close()
if __name__ == '__main__':
    # Call with one or more S3 URLs; rows are written to stdout as CSV, e.g.
    #   ./parse-inventory s3://my-bucket/path/to/my/inventory/2019-12-15T00-00Z/manifest.json > out.csv
    # Use a real CSV writer so fields containing spaces, commas or quotes
    # survive the round trip (plain print() would emit space-separated text).
    writer = csv.writer(sys.stdout, lineterminator='\n')
    for s3_url in sys.argv[1:]:
        url = urllib.parse.urlparse(s3_url)
        # netloc keeps the bucket name exactly as typed; hostname would
        # lower-case it.
        for row in list_keys(url.netloc, url.path.lstrip('/')):
            # csv.reader yields [] for blank lines; skip them rather than
            # failing or emitting empty records.
            if row:
                writer.writerow(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment