Last active
April 13, 2022 16:27
-
-
Save davidejones/9ea69eb94650fbb6697f2f1904946d11 to your computer and use it in GitHub Desktop.
Updates s3 object metadata to include an md5chksum entry
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import hashlib | |
import logging | |
import tempfile | |
import boto3 | |
def md5(file_handle, block_size=4096):
    """
    Compute the md5 hex digest of an already-open binary file.

    Reads the handle in fixed-size chunks so arbitrarily large files can
    be hashed without loading them fully into memory.

    :param file_handle: binary file handle, positioned where hashing should start
    :param block_size: number of bytes to read per chunk
    :return: hex string md5 digest of the remaining file contents
    """
    digest = hashlib.md5()
    while True:
        chunk = file_handle.read(block_size)
        if not chunk:
            break
        digest.update(chunk)
    return digest.hexdigest()
def update_md5_meta(args, client, key):
    """
    Download one s3 object, hash it, and write the hash back as metadata.

    The object is streamed into a temporary file, md5-hashed, then copied
    onto itself with MetadataDirective='REPLACE' so the new "md5chksum"
    metadata entry is persisted. Note that updating using copy_object with
    the replace directive has the side effect of fixing the ETag anyway.

    :param args: cli args bucket name, dryrun etc.
    :param client: s3 boto3 client
    :param key: name of s3 key we are interacting with
    """
    head = client.head_object(Bucket=args.bucket, Key=key)
    old_etag = head['ETag']
    meta = head["Metadata"]
    if args.dryrun:
        # Read-only mode: report the skip and touch nothing.
        logging.info(f"Skipping '{key}' copy object in dry run")
        return
    with tempfile.TemporaryFile(mode='w+b') as tmp:
        client.download_fileobj(args.bucket, key, tmp)
        # Rewind before hashing — download leaves the handle at EOF.
        tmp.seek(0)
        meta["md5chksum"] = md5(tmp)
    result = client.copy_object(
        Bucket=args.bucket,
        Key=key,
        CopySource=args.bucket + '/' + key,
        Metadata=meta,
        ContentType=head['ContentType'],
        MetadataDirective='REPLACE',
    )
    new_etag = result.get('CopyObjectResult', {}).get('ETag', '')
    logging.info(f"Updated '{key}' md5 meta to {meta['md5chksum']} ETag changed from {old_etag} to {new_etag}")
def main(args):
    """
    Walk a bucket (optionally under a prefix) and add an md5sum metadata
    entry to every object that was uploaded via multipart.

    :param args: cli args bucket name, dryrun etc.
    """
    s3 = boto3.client('s3')
    pages = s3.get_paginator('list_objects').paginate(
        Bucket=args.bucket, Prefix=args.prefix)
    # If the entity tag is not an MD5 digest of the object data, it will
    # contain one or more nonhexadecimal characters and/or will consist of
    # less than 32 or more than 32 hexadecimal digits — multipart ETags in
    # particular contain a '-', which is what this JMESPath filter matches.
    for item in pages.search("Contents[?contains(ETag, '-')]"):
        update_md5_meta(args, s3, item['Key'])
    logging.info("DONE!")
if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='Updates md5sum metadata on s3 objects')
    arg_parser.add_argument('bucket', help='name of bucket')
    arg_parser.add_argument('-p', '--prefix', default="", help='prefix path in bucket to work on')
    arg_parser.add_argument('--dryrun', action='store_true', help='dry run mode is read only no modifications')
    opts = arg_parser.parse_args()
    # Prefix every log line in dry-run mode so the read-only output is unmistakable.
    log_fmt = f"[DRYRUN] {logging.BASIC_FORMAT}" if opts.dryrun else logging.BASIC_FORMAT
    logging.basicConfig(level=logging.INFO, format=log_fmt)
    main(opts)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage
Running on all files in a bucket
Running on all files in a bucket under a certain subdirectory
Running on a single file