Skip to content

Instantly share code, notes, and snippets.

@mbiemann
Last active April 27, 2023 10:31
Show Gist options
  • Save mbiemann/a0681f20ae489a52617db8f0bc87d434 to your computer and use it in GitHub Desktop.
Save mbiemann/a0681f20ae489a52617db8f0bc87d434 to your computer and use it in GitHub Desktop.
Get all S3 objects, saving partial results so that an interrupted run can resume instead of restarting from the beginning
# List every object key under `prefix` in `bucket`, checkpointing progress to
# `file_partial` after each page so that an interrupted run can resume from the
# last saved continuation token instead of starting over.
#
# Outputs (written only when a listing is actually performed):
#   file_detail - {'files': sorted keys, 'parts': sorted partition names}
#   file_meta   - counts and update timestamps for the two lists above
#
# Expects `s3_client`, `load`/`dump`/`loads`/`dumps`/`JSONDecodeError`,
# `datetime`, `copyfile` and `remove` to already be in scope.
bucket = 'bucket_name'
prefix = 'path/folder/'
file_meta = './local/folder_meta.json'
file_detail = './local/folder_detail.json'
file_partial = './local/folder_partial.json'
# The listing ends as soon as this key has been collected.
stop_key = 'path/folder/partition9/filename9999.csv.gz'

data_meta = {}
data_detail = {}
try:
    with open(file_meta, 'r') as fh:
        data_meta = load(fh)
    with open(file_detail, 'r') as fh:
        data_detail = load(fh)
except FileNotFoundError:
    # No finished result from a previous run; it is (re)built below.
    pass

if not data_meta or not data_detail:
    files = []
    parts = set()
    nextt = ''

    # Resume from a checkpoint if one exists. When the primary checkpoint is
    # corrupt (e.g. the process died mid-write) fall back to its backup copy.
    try:
        try:
            with open(file_partial, 'r') as fh:
                d = load(fh)
        except JSONDecodeError:
            with open(f'{file_partial}.bkp', 'r') as fh:
                d = load(fh)
        files = d['files']
        parts = set(d['parts'])
        # The token is stored JSON-encoded inside the checkpoint.
        nextt = loads(d['nextt'])
    except FileNotFoundError:
        pass

    # 'STOP' marks a checkpoint whose listing had already finished.
    if nextt != 'STOP':
        params = {'Bucket': bucket, 'Prefix': prefix}
        if nextt:
            params['ContinuationToken'] = nextt

        paginator = s3_client.get_paginator('list_objects_v2')
        for page in paginator.paginate(**params):
            reached_stop = False
            # 'Contents' is absent from the response when no key matches.
            for content in page.get('Contents', []):
                key = content['Key']
                files.append(key)
                # Third path segment is taken as the partition name.
                parts.add(key.split('/')[2])
                if key == stop_key:
                    # Honour stop_key wherever it falls inside the page, not
                    # only when it happens to be the page's last key.
                    reached_stop = True
                    break

            if reached_stop:
                next_token = 'STOP'
            else:
                next_token = page.get('NextContinuationToken', 'STOP')

            # Close the checkpoint before copying it so the backup is complete.
            with open(file_partial, 'w') as fh:
                dump(
                    {
                        'files': files,
                        'parts': list(parts),
                        'nextt': dumps(next_token),
                    },
                    fh,
                )
            print(f'{datetime.now().isoformat()} - partial file {file_partial} saved.')
            copyfile(file_partial, f'{file_partial}.bkp')

            if reached_stop:
                break

    files.sort()
    parts = sorted(parts)

    data_detail['files'] = files
    data_detail['parts'] = parts
    with open(file_detail, 'w') as fh:
        dump(data_detail, fh)

    # One naive-UTC timestamp for both entries: they describe the same snapshot.
    updated_at = datetime.utcnow().isoformat()
    data_meta['files_update'] = updated_at
    data_meta['files_count'] = len(files)
    data_meta['parts_update'] = updated_at
    data_meta['parts_count'] = len(parts)
    with open(file_meta, 'w') as fh:
        dump(data_meta, fh, indent=4)

    # Drop the checkpoint and its backup; a stale backup could otherwise be
    # picked up by a later run whose first checkpoint write gets corrupted.
    for leftover in (file_partial, f'{file_partial}.bkp'):
        try:
            remove(leftover)
        except FileNotFoundError:
            pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment