Python script to copy S3 files between AWS accounts
import concurrent.futures

import boto3
from botocore.exceptions import ClientError


def copy_object(
    destination_client,
    source_bucket,
    src_key,
    destination_bucket,
    destination_prefix,
    source_prefix,
):
    """Copy a single S3 object between buckets."""
    # Construct the destination key while preserving the folder structure
    dest_key = src_key.replace(source_prefix, destination_prefix, 1)
    copy_source = {"Bucket": source_bucket, "Key": src_key}
    try:
        destination_client.copy_object(
            CopySource=copy_source,
            Bucket=destination_bucket,
            Key=dest_key,
            ACL="bucket-owner-full-control",  # Ensure destination account has full control
        )
        return f"Copied {src_key} to {destination_bucket}/{dest_key}"
    except ClientError as e:
        return f"Error copying {src_key}: {e}"


def copy_s3_objects(
    source_client,
    destination_client,
    source_bucket,
    source_prefix,
    destination_bucket,
    destination_prefix=None,
    max_workers=10,
):
    """Copy multiple S3 objects between buckets using parallel processing."""
    if destination_prefix is None:
        destination_prefix = source_prefix

    # Use paginator to handle more than 1,000 objects
    paginator = source_client.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=source_bucket, Prefix=source_prefix)

    # Collect all keys for progress tracking and to avoid duplicate API calls
    all_keys = []
    print("Listing objects to copy...")
    for page in page_iterator:
        if "Contents" in page:
            all_keys.extend([obj["Key"] for obj in page["Contents"]])

    total_objects = len(all_keys)
    if total_objects == 0:
        print("No objects found in the source prefix.")
        return

    print(f"Found {total_objects} objects to copy")

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                copy_object,
                destination_client,
                source_bucket,
                src_key,
                destination_bucket,
                destination_prefix,
                source_prefix,
            )
            for src_key in all_keys
        ]

        # Track progress and handle results
        completed = 0
        success = 0
        failure = 0
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            completed += 1
            if "Error" in result:
                failure += 1
            else:
                success += 1

            # Print progress periodically
            if completed % 10 == 0 or completed == total_objects:
                print(f"Progress: {completed}/{total_objects} ({completed / total_objects * 100:.1f}%)")
                print(result)

    # Print summary
    print(f"\nCopy operation completed: {success} successful, {failure} failed, {total_objects} total")


if __name__ == "__main__":
    # Define your AWS credentials for both accounts
    source_creds = {
        "aws_access_key_id": "SOURCE_ACCESS_KEY",
        "aws_secret_access_key": "SOURCE_SECRET_KEY",
        # Uncomment if using temporary credentials
        # 'aws_session_token': 'SOURCE_SESSION_TOKEN',
    }
    dest_creds = {
        "aws_access_key_id": "DEST_ACCESS_KEY",
        "aws_secret_access_key": "DEST_SECRET_KEY",
        # Uncomment if using temporary credentials
        # 'aws_session_token': 'DEST_SESSION_TOKEN',
    }

    # Define your bucket and prefix information
    destination_bucket = "demo-data"
    destination_prefix = "avue/resumes/"
    source_bucket = "tk-demo"
    source_prefix = "avue/resumes/"

    # Create a separate AWS client for the source account
    source_client = boto3.client("s3", region_name="us-east-1", **source_creds)
    # Create a separate AWS client for the destination account
    destination_client = boto3.client("s3", region_name="us-east-1", **dest_creds)

    # Run the copy operation with parallel processing
    copy_s3_objects(
        source_client,
        destination_client,
        source_bucket,
        source_prefix,
        destination_bucket,
        destination_prefix,
        max_workers=20,  # Adjust based on your needs
    )
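For the copy to work across accounts, the destination account's credentials (which issue the CopyObject call) need read access to the source bucket and write access to the destination bucket. Below is a minimal sketch of one way to grant the read side with a source-bucket policy, assuming the source bucket name from the script and a hypothetical destination account ID (111122223333); it would be applied once with source-account credentials.

import json

import boto3

# Source-account client; assumes source-account credentials are configured in the environment
source_admin_client = boto3.client("s3", region_name="us-east-1")

# Illustrative policy: let the destination account list the source bucket and read its objects
policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "AllowDestinationAccountRead",
            "Effect": "Allow",
            "Principal": {"AWS": "arn:aws:iam::111122223333:root"},  # hypothetical destination account ID
            "Action": ["s3:GetObject", "s3:ListBucket"],
            "Resource": ["arn:aws:s3:::tk-demo", "arn:aws:s3:::tk-demo/*"],
        }
    ],
}

source_admin_client.put_bucket_policy(Bucket="tk-demo", Policy=json.dumps(policy))

Also note that copy_object handles objects up to 5 GB in a single call; larger objects need a multipart copy, for example via the S3 client's managed copy() transfer method.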