@BrandonLWhite
Created May 6, 2020 21:25
Python S3 Object Listing Performance
import boto3
import logging
import os
import time

TIER = os.environ.get('TIER')
MAX_OBJECTS = 10000

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

S3_MERGE_BUCKET = "prod-transactions-merge.upside-services.com"
S3_MERGE_PENDING_PREFIX = 'pending'


def handler(event, context):
    """Lambda handler: time one of the listing demos below."""
    start_time = time.monotonic()
    logger.info(f'Loading processors for crawler in {TIER}.')

    # count = demoListObjects()
    count = demoObjectsFilter()

    print(f'Found {count} files')
    elapsed_time = time.monotonic() - start_time
    print(f'Elapsed time: {elapsed_time} s')


def count_files(pages, limit):
    """Count keys across pages, stopping once `limit` is reached."""
    count = 0
    for page in pages:
        count += len(page.get('Contents', []))  # 'Contents' is absent when a page is empty
        print(count)
        if count >= limit:
            return count
    return count


# About 6s on my system.
# WAY slower in Lambda -- like 32s
def demoListObjects():
    """List objects with the low-level client and a list_objects_v2 paginator."""
    bucket = S3_MERGE_BUCKET
    prefix = S3_MERGE_PENDING_PREFIX
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
    return count_files(pages, MAX_OBJECTS)


# This is actually slower. About 7.5s on my system.
# Similar to demoListObjects in lambda though -- 32s.
def demoObjectsFilter():
    """List objects with the resource-level Bucket.objects collection."""
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(S3_MERGE_BUCKET)
    object_infos = bucket.objects.filter(Prefix=S3_MERGE_PENDING_PREFIX).limit(count=MAX_OBJECTS)
    count = 0
    for object_info in object_infos:
        count += 1
    return count
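
For comparison, here is a minimal sketch of the same listing where the paginator itself enforces the 10k cap through PaginationConfig instead of count_files. MaxItems and PageSize are standard list_objects_v2 paginator options; the function name is mine, and the bucket/prefix constants are the ones defined above. This only restructures the call, it is not a measured speedup.

def demoListObjectsPaginationConfig():
    """Sketch: same listing, but the paginator stops at MAX_OBJECTS on its own."""
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(
        Bucket=S3_MERGE_BUCKET,
        Prefix=S3_MERGE_PENDING_PREFIX,
        PaginationConfig={
            'MaxItems': MAX_OBJECTS,  # total keys to yield across all pages
            'PageSize': 1000,         # keys per ListObjectsV2 request (1000 is the S3 maximum)
        },
    )
    return sum(len(page.get('Contents', [])) for page in pages)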
@BrandonLWhite (Author)

Python performance for listing S3 objects is very slow.
10,000 files: 32 s
177,779 files: 527 s
Both work out to roughly 3 s per 1,000-key page, so the time grows linearly with the number of objects.
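
A minimal sketch for narrowing down where that time goes: time each page of the same listing individually. If every 1,000-key page costs roughly the same wall-clock time, the bottleneck is per-request latency to S3 rather than the Python iteration. The function name is made up for illustration, and the bucket/prefix/limit constants are assumed to be the ones from the gist above.

import time
import boto3

def time_pages():
    """Sketch: print the latency of each ListObjectsV2 page."""
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    # S3_MERGE_BUCKET, S3_MERGE_PENDING_PREFIX and MAX_OBJECTS as defined in the gist above
    pages = paginator.paginate(Bucket=S3_MERGE_BUCKET, Prefix=S3_MERGE_PENDING_PREFIX)

    total = 0
    last = time.monotonic()
    for page in pages:
        now = time.monotonic()
        total += len(page.get('Contents', []))
        print(f'{total} keys so far, page took {now - last:.2f} s')
        last = now
        if total >= MAX_OBJECTS:
            break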
