@huevos-y-bacon
Created October 25, 2024 11:44
AWS S3 - Bucket Clone and Diff
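Two small Python scripts: the first clones a bucket by copying every object from a source bucket to a destination bucket in parallel, and the second diffs two buckets by comparing object keys, sizes, and ETags (last-modified timestamps are ignored).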
#!/usr/bin/python
# -*- coding: ASCII -*-
#
# S3 Copy Bucket - Copy all objects of an S3 bucket
#
# Copyright (c) 2022 Carsten Grohmann
# License: MIT (see LICENSE.txt)
# THIS PROGRAM COMES WITH NO WARRANTY

import boto3
import sys
from multiprocessing import Pool

# if sys.version_info[0] == 2:
#     from __future__ import print_function
#     input = raw_input

bucket_src_name = sys.argv[1]
bucket_dest_name = sys.argv[2]

session = boto3.session.Session()
s3client = session.client('s3')


def copy(key):
    """Copy a single object; return the key on failure, None on success."""
    try:
        s3client.copy_object(
            Bucket=bucket_dest_name,
            Key=key,
            CopySource={'Bucket': bucket_src_name, 'Key': key},
            MetadataDirective='COPY',
        )
    except Exception as e:
        print("Exception occurred for key %s: %s" % (key, e))
        return key
    print("Copied object: %s" % key)


if __name__ == "__main__":
    print("Query objects information from s3://%s" % bucket_src_name)

    # fetch all object keys from the source bucket
    paginator = s3client.get_paginator('list_objects_v2')
    results = paginator.paginate(Bucket=bucket_src_name).build_full_result()
    keys = [i['Key'] for i in results.get('Contents', [])]

    print("Copy %d objects from s3://%s to s3://%s" % (len(keys), bucket_src_name, bucket_dest_name))
    input("Press Enter to continue or CTRL-C to abort ...")

    # copy objects with 10 worker processes
    p = Pool(10)
    res = p.map(copy, keys)

    # report keys that failed to copy
    for k in res:
        if not k:
            continue
        print("Failed copy: %s" % k)
#!/usr/bin/env python3
import boto3
import sys

bucket_src_name = sys.argv[1]
bucket_dest_name = sys.argv[2]


def get_bucket_objects(bucket_name):
    """Retrieve all objects from an S3 bucket, excluding timestamps."""
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')
    objects = {}
    for page in paginator.paginate(Bucket=bucket_name):
        for obj in page.get('Contents', []):
            objects[obj['Key']] = {
                'Size': obj['Size'],
                'ETag': obj['ETag'],
            }
    return objects


def compare_buckets(bucket1_name, bucket2_name):
    """Compare the contents of two S3 buckets, ignoring timestamps."""
    bucket1_objects = get_bucket_objects(bucket1_name)
    bucket2_objects = get_bucket_objects(bucket2_name)

    # Find objects that are only in one of the buckets
    only_in_bucket1 = set(bucket1_objects) - set(bucket2_objects)
    only_in_bucket2 = set(bucket2_objects) - set(bucket1_objects)

    # Find objects that are in both buckets but differ in size or ETag
    common_keys = bucket1_objects.keys() & bucket2_objects.keys()
    in_both_buckets_differ_size = [
        key for key in common_keys
        if bucket1_objects[key]['Size'] != bucket2_objects[key]['Size']
    ]
    in_both_buckets_differ_etag = [
        key for key in common_keys
        if bucket1_objects[key]['ETag'] != bucket2_objects[key]['ETag']
    ]

    print(f"Objects only in {bucket1_name}: {only_in_bucket1}")
    print(f"Objects only in {bucket2_name}: {only_in_bucket2}")
    print(f"Objects in both buckets but differ in size: {in_both_buckets_differ_size}")
    print(f"Objects in both buckets but differ in etag: {in_both_buckets_differ_etag}")


if __name__ == "__main__":
    print(f"Comparing objects in s3 between {bucket_src_name} and {bucket_dest_name}\n")
    compare_buckets(bucket_src_name, bucket_dest_name)
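Note: S3 ETags are plain MD5 digests only for objects uploaded in a single PUT; multipart uploads produce ETags of the form "<hash>-<part count>", so identical content can still show an ETag mismatch between buckets. A small sketch of a helper for filtering such keys out of the ETag report; the function names are illustrative and not part of the gist:

def is_multipart_etag(etag):
    # Multipart-upload ETags look like "<hash>-<part count>", so a mismatch
    # on these keys does not necessarily mean the content differs.
    return '-' in etag.strip('"')

def reliable_etag_mismatches(bucket1_objects, bucket2_objects, differing_keys):
    # Keep only keys where both ETags are plain MD5 digests, i.e. where an
    # ETag mismatch is a meaningful signal of differing content.
    return [
        key for key in differing_keys
        if not is_multipart_etag(bucket1_objects[key]['ETag'])
        and not is_multipart_etag(bucket2_objects[key]['ETag'])
    ]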