@tkanhe
Last active March 24, 2025 05:22
Python script to copy S3 files between AWS accounts
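Note: the script below issues CopyObject with the destination account's credentials, so those credentials must also be allowed to read the source bucket. A minimal sketch of a source-bucket policy that grants this access is shown here; the destination account ID (111122223333) is a placeholder, and the policy would be applied using source-account credentials.

import json

import boto3

# Sketch: grant the destination account read access to the source bucket.
# The account ID is a placeholder; the bucket name matches the script's source bucket.
source_account_s3 = boto3.client("s3", region_name="us-east-1")  # source-account credentials
bucket_policy = {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Sid": "AllowDestinationAccountRead",
            "Effect": "Allow",
            "Principal": {"AWS": "arn:aws:iam::111122223333:root"},
            "Action": ["s3:ListBucket", "s3:GetObject"],
            "Resource": ["arn:aws:s3:::tk-demo", "arn:aws:s3:::tk-demo/*"],
        }
    ],
}
source_account_s3.put_bucket_policy(Bucket="tk-demo", Policy=json.dumps(bucket_policy))
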
import concurrent.futures

import boto3
from botocore.exceptions import ClientError


def copy_object(
    destination_client,
    source_bucket,
    src_key,
    destination_bucket,
    destination_prefix,
    source_prefix,
):
    """Copy a single S3 object between buckets."""
    # Construct the destination key while preserving the folder structure
    dest_key = src_key.replace(source_prefix, destination_prefix, 1)
    copy_source = {"Bucket": source_bucket, "Key": src_key}
    try:
        destination_client.copy_object(
            CopySource=copy_source,
            Bucket=destination_bucket,
            Key=dest_key,
            ACL="bucket-owner-full-control",  # Ensure destination account has full control
        )
        return f"Copied {src_key} to {destination_bucket}/{dest_key}"
    except ClientError as e:
        return f"Error copying {src_key}: {e}"

def copy_s3_objects(
    source_client,
    destination_client,
    source_bucket,
    source_prefix,
    destination_bucket,
    destination_prefix=None,
    max_workers=10,
):
    """Copy multiple S3 objects between buckets using parallel processing."""
    if destination_prefix is None:
        destination_prefix = source_prefix

    # Use paginator to handle more than 1,000 objects
    paginator = source_client.get_paginator("list_objects_v2")
    page_iterator = paginator.paginate(Bucket=source_bucket, Prefix=source_prefix)

    # Collect all keys for progress tracking and to avoid duplicate API calls
    all_keys = []
    print("Listing objects to copy...")
    for page in page_iterator:
        if "Contents" in page:
            all_keys.extend([obj["Key"] for obj in page["Contents"]])

    total_objects = len(all_keys)
    if total_objects == 0:
        print("No objects found in the source prefix.")
        return

    print(f"Found {total_objects} objects to copy")

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(
                copy_object,
                destination_client,
                source_bucket,
                src_key,
                destination_bucket,
                destination_prefix,
                source_prefix,
            )
            for src_key in all_keys
        ]

        # Track progress and handle results
        completed = 0
        success = 0
        failure = 0
        for future in concurrent.futures.as_completed(futures):
            result = future.result()
            completed += 1
            if "Error" in result:
                failure += 1
            else:
                success += 1
            # Print progress periodically
            if completed % 10 == 0 or completed == total_objects:
                print(f"Progress: {completed}/{total_objects} ({completed / total_objects * 100:.1f}%)")
            print(result)

    # Print summary
    print(f"\nCopy operation completed: {success} successful, {failure} failed, {total_objects} total")

if __name__ == "__main__":
    # Define your AWS credentials for both accounts
    source_creds = {
        "aws_access_key_id": "SOURCE_ACCESS_KEY",
        "aws_secret_access_key": "SOURCE_SECRET_KEY",
        # Uncomment if using temporary credentials
        # 'aws_session_token': 'SOURCE_SESSION_TOKEN',
    }
    dest_creds = {
        "aws_access_key_id": "DEST_ACCESS_KEY",
        "aws_secret_access_key": "DEST_SECRET_KEY",
        # Uncomment if using temporary credentials
        # 'aws_session_token': 'DEST_SESSION_TOKEN',
    }

    # Define your bucket and prefix information
    destination_bucket = "demo-data"
    destination_prefix = "avue/resumes/"
    source_bucket = "tk-demo"
    source_prefix = "avue/resumes/"

    # Create a separate AWS client for the source account
    source_client = boto3.client("s3", region_name="us-east-1", **source_creds)
    # Create a separate AWS client for the destination account
    destination_client = boto3.client("s3", region_name="us-east-1", **dest_creds)

    # Run the copy operation with parallel processing
    copy_s3_objects(
        source_client,
        destination_client,
        source_bucket,
        source_prefix,
        destination_bucket,
        destination_prefix,
        max_workers=20,  # Adjust based on your needs
    )
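
Hardcoding long-lived access keys is easy to get wrong; the same two clients can instead be created from named profiles in the local AWS config. A brief sketch, assuming profiles called "source-account" and "dest-account" exist (both names are placeholders):

import boto3

# Sketch: build both clients from named AWS CLI profiles instead of inline keys.
# "source-account" and "dest-account" are placeholder profile names.
source_session = boto3.Session(profile_name="source-account", region_name="us-east-1")
dest_session = boto3.Session(profile_name="dest-account", region_name="us-east-1")

source_client = source_session.client("s3")
destination_client = dest_session.client("s3")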