Python S3 Object Listing Performance
import boto3
import logging
import os
import time

TIER = os.environ.get('TIER')
MAX_OBJECTS = 10000

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

S3_MERGE_BUCKET = "prod-transactions-merge.upside-services.com"
S3_MERGE_PENDING_PREFIX = 'pending'


def handler(event, context):
    """ Lambda handler """
    start_time = time.monotonic()
    logger.info(f'Loading processors for crawler in {TIER}.')

    # count = demoListObjects()
    count = demoObjectsFilter()
    print(f'Found {count} files')

    elapsed_time = time.monotonic() - start_time
    print(f'Elapsed time: {elapsed_time} s')


def count_files(pages, limit):
    """Count listed objects page by page, stopping once `limit` is reached."""
    count = 0
    for page in pages:
        # 'Contents' is absent from a page that matches no keys.
        count += len(page.get('Contents', []))
        print(count)
        if count >= limit:
            return count
    return count


# About 6s on my system.
# WAY slower in Lambda -- like 32s.
def demoListObjects():
    bucket = S3_MERGE_BUCKET
    prefix = S3_MERGE_PENDING_PREFIX
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket=bucket, Prefix=prefix)
    return count_files(pages, MAX_OBJECTS)


# This is actually slower: about 7.5s on my system.
# Similar to demoListObjects in Lambda though -- 32s.
def demoObjectsFilter():
    s3_resource = boto3.resource('s3')
    bucket = s3_resource.Bucket(S3_MERGE_BUCKET)
    # .limit() is lazy: iterating the collection drives the ListObjectsV2 calls.
    object_infos = bucket.objects.filter(Prefix=S3_MERGE_PENDING_PREFIX).limit(count=MAX_OBJECTS)
    return sum(1 for _ in object_infos)
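The manual counting in count_files can also be pushed into the paginator itself. A minimal sketch, reusing the bucket/prefix constants above: PaginationConfig's MaxItems caps the total number of keys returned, and PageSize sets keys per request (S3 allows at most 1,000 per call).

import boto3

def demoListObjectsCapped():
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    # MaxItems stops pagination after MAX_OBJECTS keys in total;
    # PageSize is per-request (S3 caps it at 1,000 regardless).
    pages = paginator.paginate(
        Bucket=S3_MERGE_BUCKET,
        Prefix=S3_MERGE_PENDING_PREFIX,
        PaginationConfig={'MaxItems': MAX_OBJECTS, 'PageSize': 1000},
    )
    return sum(len(page.get('Contents', [])) for page in pages)

This tidies the code but doesn't reduce round trips: 10,000 keys still means at least 10 sequential ListObjectsV2 calls.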
Python performance for listing S3 objects is very slow.
10,000 files: 32s
177,779 files: 527s
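The time grows roughly linearly because ListObjectsV2 returns at most 1,000 keys per call and the calls run one after another. One way to cut wall-clock time is to list disjoint sub-prefixes in parallel. This is only a sketch under an assumed layout: the pending/0 .. pending/f shards are hypothetical, not something this bucket is known to have.

import boto3
from concurrent.futures import ThreadPoolExecutor

# boto3 low-level clients are thread-safe, so one shared client is fine.
s3_client = boto3.client('s3')

def count_prefix(prefix):
    paginator = s3_client.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=S3_MERGE_BUCKET, Prefix=prefix)
    return sum(len(page.get('Contents', [])) for page in pages)

def count_parallel():
    # Hypothetical shard layout: keys spread under pending/0 .. pending/f.
    shards = [f'{S3_MERGE_PENDING_PREFIX}/{c}' for c in '0123456789abcdef']
    with ThreadPoolExecutor(max_workers=16) as pool:
        return sum(pool.map(count_prefix, shards))

For counts in the hundreds of thousands, an S3 Inventory report may be the better tool, since it avoids live listing entirely.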