This is a rough cut of a script to download all the things from an AWS S3 bucket. I wrote it to move files from Digital Ocean to AWS proper.
Created
October 18, 2021 14:54
-
-
Save toadkicker/a0c40ab44b2702b82af3a2cdff0f7b22 to your computer and use it in GitHub Desktop.
Recursive S3 download
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import multiprocessing | |
import os | |
import sys | |
import boto3 | |
from botocore.client import Config | |
# Module-level state, filled in by initialize(). Both stay None until
# initialize() has run in the current process.
s3_client = None  # S3 client object created by initialize()
spaces = None  # list of bucket names reported by the client's list_buckets()
def initialize():
    """Create the S3 client and record the names of all visible buckets.

    Stores the client in the module-level ``s3_client`` and the bucket names
    in the module-level ``spaces``. The ``__main__`` section calls this
    directly and also passes it to ``multiprocessing.Pool`` as the worker
    initializer.
    """
    global s3_client, spaces

    # Connection settings for DigitalOcean Spaces. This is just a plain S3
    # client pointed at a custom endpoint, so ymmv.
    connection = {
        'region_name': 'nyc3',
        'endpoint_url': 'https://nyc3.digitaloceanspaces.com',
        'aws_access_key_id': '<ENTER ACCESS KEY>',
        'aws_secret_access_key': '<ENTER SECRET KEY>',
    }
    s3_client = boto3.session.Session().client('s3', **connection)

    # Collect the names locally first so `spaces` is only rebound once the
    # whole bucket listing has been read.
    bucket_names = []
    for bucket in s3_client.list_buckets()['Buckets']:
        bucket_names.append(bucket['Name'])
    spaces = bucket_names
def listObjects(bucket):
    """Return the complete object listing for ``bucket``.

    A single ``list_objects`` call returns at most 1,000 keys, so larger
    buckets were silently truncated. This walks every page with the client's
    ``list_objects`` paginator and merges the results.

    Args:
        bucket: Name of the bucket to list.

    Returns:
        A dict shaped like a ``list_objects`` response (a copy of the first
        page) whose ``'Contents'`` list holds the entries from every page.
        ``'Contents'`` is always present and is an empty list for an empty
        bucket.
    """
    paginator = s3_client.get_paginator('list_objects')

    listing = None
    for page in paginator.paginate(Bucket=bucket):
        page_entries = page.get('Contents', [])
        if listing is None:
            # Keep the first page's other fields so the result still looks
            # like an ordinary list_objects response to existing callers.
            listing = dict(page)
            listing['Contents'] = list(page_entries)
        else:
            listing['Contents'].extend(page_entries)

    if listing is None:
        return {'Contents': []}

    # Every page has been merged, so the combined listing is complete.
    listing['IsTruncated'] = False
    return listing
def downloadFile(s3Metadata, bucket=None):
    """Download one object into the current working directory.

    The object is written to ``<cwd>/<normalized key>``; the parent directory
    must already exist.

    Args:
        s3Metadata: Mapping with a ``'Key'`` entry naming the object.
        bucket: Name of the bucket to read from. Defaults to the first entry
            of the module-level ``spaces`` list, which was the only bucket
            this function could read before the parameter existed.

    Returns:
        None.
    """
    name = s3Metadata['Key']

    # Keys ending in '/' are folder placeholders, not files. Opening that
    # path for writing raises IsADirectoryError once the directory exists,
    # so there is nothing to download for them.
    if name.endswith('/'):
        return

    if bucket is None:
        bucket = spaces[0]

    destination = os.path.join(os.getcwd(), os.path.normpath(name))
    with open(destination, 'wb') as fh:
        s3_client.download_fileobj(bucket, name, fh)
    print("Downloaded %s" % destination, '\n')
def createDirs(files_response):
    """Create the local directory for every object in a listing.

    For each entry, the parent directory of ``<cwd>/<normalized key>`` is
    created if it does not exist yet. No files are written.

    Args:
        files_response: Mapping whose optional ``'Contents'`` list holds
            mappings with a ``'Key'`` entry. A missing ``'Contents'`` is
            treated as an empty listing.

    Returns:
        None.
    """
    for entry in files_response.get('Contents', []):
        target = os.path.join(os.getcwd(), os.path.normpath(entry['Key']))
        # exist_ok avoids the check-then-create race and makes repeated
        # prefixes (many keys under the same folder) a no-op.
        os.makedirs(os.path.dirname(target), exist_ok=True)
if __name__ == '__main__':
    # Populate s3_client / spaces in this process; each pool worker runs
    # initialize() again through the Pool initializer argument.
    initialize()
    pool = multiprocessing.Pool(multiprocessing.cpu_count(), initialize)
    try:
        for space in spaces:
            files_response = listObjects(space)
            # An empty bucket's listing has no (or an empty) 'Contents'
            # entry; skip it instead of failing with a KeyError.
            if not files_response.get('Contents'):
                continue
            createDirs(files_response)
            # NOTE(review): downloadFile receives only the object's metadata
            # here, not `space`; confirm it reads from the intended bucket
            # when more than one bucket is listed.
            pool.map(downloadFile, files_response['Contents'])
    finally:
        # Always shut the workers down, even when a download raised.
        pool.close()
        pool.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment