Last active
April 1, 2021 00:32
-
-
Save ivanlonel/b127cf717fedf76c0316bb0930ad184f to your computer and use it in GitHub Desktop.
Renaming files in AWS S3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import boto3 | |
| import json | |
| import logging | |
| import argparse | |
| import itertools | |
# Bucket.delete_objects allows a maximum of 1000 objects to be deleted in a single request
def break_in_chunks(iterable, max_size):
    """Yield successive chunks of at most *max_size* items from *iterable*.

    Contract (preserved from the original implementation):
      - max_size == 0: yield the iterable itself, unchunked, then stop.
      - max_size < 0: yield nothing.
      - otherwise: yield lists of up to max_size items; only the last chunk
        may be shorter, and an empty iterable yields nothing.

    Fix: the previous zip_longest/sentinel version yielded tuples for every
    chunk except the last, which was a generator expression — callers now
    always receive lists, and no sentinel filtering is needed.
    """
    if max_size == 0:
        # Degenerate case kept for backward compatibility: no chunking at all.
        yield iterable
        return
    if max_size < 0:
        return
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, max_size))
        if not chunk:
            return
        yield chunk
def copy_objects(dest_bucket, src_bucket_name, prefix, transform_function):
    """Copy each object under *prefix* to its transformed key and yield the
    original keys so the caller can batch-delete them afterwards.

    Args:
        dest_bucket: boto3 s3.Bucket resource the renamed copies are written to
            (and whose objects are enumerated — see NOTE below).
        src_bucket_name: bucket name used as the CopySource of each copy.
        prefix: common key prefix selecting the objects to rename.
        transform_function: callable mapping an original key to its new key;
            objects whose key is unchanged are skipped entirely.

    Yields:
        ``{'Key': original_key}`` dicts, one per successfully copied object —
        the exact element shape expected by ``Bucket.delete_objects``.
    """
    # NOTE(review): objects are LISTED from dest_bucket but COPIED from
    # src_bucket_name. These coincide only when source and destination are the
    # same bucket; with two different buckets this enumerates the destination,
    # which looks unintended — confirm against the caller before relying on it.
    for obj in dest_bucket.objects.filter(Prefix=prefix):
        original_key = obj.key
        modified_key = transform_function(original_key)
        if original_key != modified_key:
            #logging.debug((original_key, modified_key))
            try:
                dest_bucket.copy(CopySource={'Bucket': src_bucket_name, 'Key': original_key}, Key=modified_key)
            except Exception as e:
                # Broad catch is deliberate best-effort: a failed copy is logged
                # and its key is NOT yielded, so the original is never deleted.
                logging.error(f'Error copying object {original_key} to {modified_key}: {e!r}')
            else:
                # Copy succeeded — hand the old key to the caller for deletion.
                yield {'Key': original_key}
def main():
    """CLI entry point: copy-rename every object under a prefix, then
    batch-delete the originals (at most 1000 keys per delete_objects call)."""
    parser = argparse.ArgumentParser(description="Batch rename files in S3 buckets.")
    parser.add_argument('-a', '--access-key-id', required=True, help='AWS_ACCESS_KEY_ID')
    parser.add_argument('-s', '--secret-access-key', required=True, help='AWS_SECRET_ACCESS_KEY')
    parser.add_argument('-r', '--region-name', help='AWS_REGION_NAME')
    parser.add_argument('-p', '--prefix', required=True, help='Common prefix of the files to be renamed')
    parser.add_argument('-b', '--bucket-name', required=True, help='Source bucket')
    parser.add_argument('-d', '--dest-bucket-name', help='Destination bucket, if different from source')
    # SECURITY: eval() on a command-line argument executes arbitrary code. The
    # original help text below acknowledges this deliberately — never expose
    # this script to untrusted callers.
    parser.add_argument('-t', '--transform-function', type=eval, required=True, help='''
Transform function that receives the original file name and returns the new name.
The string passed will be fed to the eval() builtin function.
Valid examples are "lambda s: s.upper()" and "exec('import urllib') or urllib.parse.unquote".
Yes, this is highly exploitable.
''')
    args = parser.parse_args()
    if args.dest_bucket_name is None:
        # Default to an in-place rename within the source bucket.
        args.dest_bucket_name = args.bucket_name

    # Fix: configure logging so copy_objects' logging.error() output is
    # formatted and emitted reliably instead of depending on logging's
    # last-resort stderr handler.
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')

    session = boto3.Session(
        aws_access_key_id=args.access_key_id,
        aws_secret_access_key=args.secret_access_key,
        region_name=args.region_name,
    )
    bucket = session.resource('s3').Bucket(args.dest_bucket_name)
    # copy_objects yields the old keys of successfully copied objects;
    # break_in_chunks caps each delete_objects request at the API limit of 1000.
    for batch in break_in_chunks(copy_objects(bucket, args.bucket_name, args.prefix, args.transform_function), 1000):
        print(json.dumps(bucket.delete_objects(Delete={'Objects': list(batch)}), indent='\t'))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment