Created
October 25, 2024 11:44
-
-
Save huevos-y-bacon/cb8ccde5aa22dd2385794a7951a83c51 to your computer and use it in GitHub Desktop.
AWS S3 - Bucket Clone and Diff
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: ASCII -*- | |
# | |
# S3 Copy Bucket - Copy all objects of an S3 bucket | |
# | |
# Copyright (c) 2022 Carsten Grohmann | |
# License: MIT (see LICENSE.txt) | |
# THIS PROGRAM COMES WITH NO WARRANTY | |
import boto3 | |
import sys | |
from multiprocessing import Pool | |
# if sys.version_info[0] == 2: | |
# from __future__ import print_function | |
# input = raw_input | |
# Both bucket names are required positional arguments. Exit with a usage
# hint instead of an IndexError traceback when either one is missing.
if len(sys.argv) < 3:
    sys.exit("Usage: %s <source-bucket> <destination-bucket>" % sys.argv[0])

bucket_src_name = sys.argv[1]
bucket_dest_name = sys.argv[2]

# Module-level session and client; the rest of the script reads `s3client`
# as a global.
session = boto3.session.Session()
s3client = session.client('s3')
def copy(key):
    """Copy one object from the source bucket into the destination bucket.

    The object is written under the same key, and the request is sent with
    ``MetadataDirective='COPY'``. The outcome is reported on stdout.

    Args:
        key: Key of the object to copy.

    Returns:
        ``None`` when the copy succeeded, otherwise ``key`` so the caller
        can collect the keys that failed.
    """
    try:
        source = {'Bucket': bucket_src_name, 'Key': key}
        s3client.copy_object(
            Bucket=bucket_dest_name,
            Key=key,
            CopySource=source,
            MetadataDirective='COPY',
        )
    except Exception as err:
        # Report the problem and hand the key back instead of raising, so
        # the caller sees the failure as a return value.
        print("Exception occurred for key %s: %s" % (key, err))
        return key
    else:
        print("Copied object: %s" % key)
if __name__ == "__main__":
    print("Query objects information from s3://%s" % bucket_src_name)

    # Fetch all listing pages and merge them into a single result.
    paginator = s3client.get_paginator('list_objects_v2')
    results = paginator.paginate(Bucket=bucket_src_name).build_full_result()
    # An empty bucket produces a result without a 'Contents' entry, so fall
    # back to an empty list instead of raising KeyError.
    keys = [i['Key'] for i in results.get('Contents', [])]

    print("Copy %d objects from s3://%s to s3://%s" % (len(keys), bucket_src_name, bucket_dest_name))
    input("Press Enter to continue or CTRL-C to abort ...")

    # Run the copies in 10 worker processes and always shut the pool down,
    # even if map() raises, so no worker processes are left behind.
    p = Pool(10)
    try:
        res = p.map(copy, keys)
    finally:
        p.close()
        p.join()

    # copy() returns None on success and the key on failure.
    for k in res:
        if k:
            print("Failed copy: %s" % k)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import boto3 | |
import sys | |
# Both bucket names are required positional arguments. Exit with a usage
# hint instead of an IndexError traceback when either one is missing.
if len(sys.argv) < 3:
    sys.exit(f"Usage: {sys.argv[0]} <source-bucket> <destination-bucket>")

bucket_src_name = sys.argv[1]
bucket_dest_name = sys.argv[2]
def get_bucket_objects(bucket_name):
    """Map every object key in ``bucket_name`` to its size and ETag.

    Walks all pages of the ``list_objects_v2`` listing. Only ``Size`` and
    ``ETag`` are kept from each entry, so timestamps are not part of the
    result. A page without ``Contents`` contributes nothing.

    Args:
        bucket_name: Name of the bucket to list.

    Returns:
        A dict of the form ``{key: {'Size': ..., 'ETag': ...}}``.
    """
    client = boto3.client('s3')
    pages = client.get_paginator('list_objects_v2').paginate(Bucket=bucket_name)
    return {
        entry['Key']: {'Size': entry['Size'], 'ETag': entry['ETag']}
        for page in pages
        for entry in page.get('Contents', [])
    }
def compare_buckets(bucket1_name, bucket2_name):
    """Print how the object listings of two S3 buckets differ.

    Four lines are printed: keys found only in the first bucket, keys found
    only in the second bucket, shared keys whose sizes differ, and shared
    keys whose ETags differ. Timestamps are not considered.

    Args:
        bucket1_name: Name of the first bucket.
        bucket2_name: Name of the second bucket.
    """
    first = get_bucket_objects(bucket1_name)
    second = get_bucket_objects(bucket2_name)

    # Keys present on one side only.
    first_keys = set(first)
    second_keys = set(second)
    missing_from_second = first_keys - second_keys
    missing_from_first = second_keys - first_keys

    # Keys present on both sides: check size and ETag in a single pass.
    size_mismatches = []
    etag_mismatches = []
    for key in first.keys() & second.keys():
        left, right = first[key], second[key]
        if left['Size'] != right['Size']:
            size_mismatches.append(key)
        if left['ETag'] != right['ETag']:
            etag_mismatches.append(key)

    print(f"Objects only in {bucket1_name}: {missing_from_second}")
    print(f"Objects only in {bucket2_name}: {missing_from_first}")
    print(f"Objects in both buckets but differ in size: {size_mismatches}")
    print(f"Objects in both buckets but differ in etag: {etag_mismatches}")
if __name__ == "__main__":
    # Announce the bucket pair, then print the comparison report.
    banner = f"Comparing objects in s3 between {bucket_src_name} and {bucket_dest_name}\n"
    print(banner)
    compare_buckets(bucket_src_name, bucket_dest_name)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment