Last active
August 1, 2025 06:18
-
-
Save brayevalerien/507cb60b4cbc6e607d6eb67ca41fde5d to your computer and use it in GitHub Desktop.
Helper function for downloading and uploading files to S3 using boto3. Assumes AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set in the environment.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Helpers for downloading from and uploading to an S3 bucket. | |
Adapted from https://docs.runpod.io/serverless/storage/s3-api. | |
""" | |
import os | |
from typing import List, Optional | |
from urllib.parse import urlparse | |
from uuid import uuid4 | |
import boto3 | |
import requests | |
from botocore.exceptions import ClientError | |
def get_client(region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> boto3.client:
    """
    Create and return a boto3 S3 client configured via environment variables.

    Reads AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY from the environment.

    Args:
        region_name (str, optional): AWS region to configure the client for.
        endpoint_url (str, optional): Custom S3-compatible endpoint URL (to use Runpod datacenters, set to https://REGION_NAME-s3api.runpod.io/).

    Returns:
        boto3.client: A configured S3 client instance.

    Raises:
        EnvironmentError: If AWS credentials are missing from the environment.
        RuntimeError: If client creation fails for another reason.
    """
    access_key = os.environ.get("AWS_ACCESS_KEY_ID")
    secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
    if not access_key or not secret_key:
        raise EnvironmentError("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must be set in the environment.")
    try:
        return boto3.client(
            "s3",
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region_name,
            endpoint_url=endpoint_url,
        )
    except Exception as e:
        # Chain the cause so the original boto3 traceback is preserved for debugging.
        raise RuntimeError(f"Error creating S3 client: {e}") from e
def upload_file(s3_client: boto3.client, bucket_name: str, file_path: str, key: Optional[str] = None) -> str:
    """
    Upload a local file to an S3 bucket.

    Args:
        s3_client (boto3.client): An S3 client instance.
        bucket_name (str): Name of the S3 bucket.
        file_path (str): Path to the local file to upload.
        key (str, optional): The S3 object key under which the file will be stored.
            If None, uses the basename of file_path.

    Returns:
        str: a URL pointing to the uploaded file (bucket must be public for the URL to work)

    Raises:
        FileNotFoundError: If the specified local file does not exist.
        RuntimeError: If the upload operation fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Local file '{file_path}' does not exist.")
    object_key = key or os.path.basename(file_path)
    try:
        s3_client.upload_file(file_path, bucket_name, object_key)
        # Use object_key (not the raw `key` argument, which may be None) so the URL is
        # always valid, and https to match upload_image_from_url.
        return f"https://{bucket_name}.s3.{s3_client.meta.region_name}.amazonaws.com/{object_key}"
    except ClientError as e:
        raise RuntimeError(f"Failed to upload '{file_path}' to 's3://{bucket_name}/{object_key}': {e}") from e
def upload_image_from_url(
    s3_client: boto3.client,
    image_url: str,
    bucket_name: str,
    key: Optional[str] = None,
    key_prefix: str = "",
) -> str:
    """
    Fetch an image over HTTP(S) and store it in an S3 bucket.

    Args:
        s3_client (boto3.client): A configured S3 client.
        image_url (str): Public HTTP(s) URL of the image.
        bucket_name (str): Name of the destination S3 bucket.
        key (str, optional): S3 object key. If not provided, a UUID-based key will be generated.
        key_prefix (str, optional): Optional prefix for the key (e.g., "uploads/").

    Returns:
        str: Public S3 URL to the uploaded image.

    Raises:
        RuntimeError: If the image download or upload fails.
    """
    resp = requests.get(image_url, timeout=10)
    if resp.status_code != 200:
        raise RuntimeError(f"Failed to download image from {image_url}: {resp.status_code}")
    mime_type = resp.headers.get("Content-Type", "application/octet-stream")
    # Derive a file extension from the MIME subtype, falling back to a generic one.
    if "/" in mime_type:
        ext = mime_type.split("/")[-1]
    else:
        ext = "bin"
    object_key = key or f"{key_prefix}{uuid4()}.{ext}"
    try:
        s3_client.put_object(
            Bucket=bucket_name,
            Key=object_key,
            Body=resp.content,
            ContentType=mime_type,
        )
    except ClientError as e:
        raise RuntimeError(f"Failed to upload image to s3://{bucket_name}/{object_key}: {e}")
    return f"https://{bucket_name}.s3.{s3_client.meta.region_name}.amazonaws.com/{object_key}"
def download_file(s3_client: boto3.client, bucket_name: str, key: str, destination_path: str) -> None:
    """
    Download a single S3 object and save it to a local path.

    Args:
        s3_client (boto3.client): An S3 client instance.
        bucket_name (str): Name of the S3 bucket.
        key (str): S3 object key to download.
        destination_path (str): Local filesystem path to save the downloaded file.

    Raises:
        RuntimeError: If the download operation fails.
    """
    # Make sure the parent directory exists before writing the file.
    parent_dir = os.path.dirname(destination_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    try:
        s3_client.download_file(bucket_name, key, destination_path)
    except ClientError as e:
        raise RuntimeError(f"Failed to download 's3://{bucket_name}/{key}' to '{destination_path}': {e}")
def download_files(s3_client: boto3.client, bucket_name: str, keys: List[str], destination_dir: str) -> None:
    """
    Download several S3 objects into one local directory.

    Each object is saved under the basename of its key. All keys are attempted
    even if some fail; failures are collected and reported together.

    Args:
        s3_client (boto3.client): An S3 client instance.
        bucket_name (str): Name of the S3 bucket.
        keys (List[str]): List of S3 object keys to download.
        destination_dir (str): Local directory to save all downloaded files.

    Raises:
        RuntimeError: If any download operation fails.
    """
    os.makedirs(destination_dir, exist_ok=True)
    failures: List[str] = []
    for object_key in keys:
        target = os.path.join(destination_dir, os.path.basename(object_key))
        try:
            download_file(s3_client, bucket_name, object_key, target)
        except RuntimeError as e:
            failures.append(str(e))
    if failures:
        combined = "\n".join(failures)
        raise RuntimeError(f"Some files failed to download:\n{combined}")
def parse_s3_url(url: str) -> tuple[str, str]:
    """
    Parse an S3 URL and return the bucket name and key.

    Supported formats:
        - S3 URI style:         s3://bucket/key
        - virtual-hosted style: https://bucket.s3[.region].amazonaws.com/key
        - path style:           https://s3[.region].amazonaws.com/bucket/key

    Args:
        url (str): The S3 URL.

    Returns:
        tuple: (bucket_name, key)

    Raises:
        ValueError: If the URL format is invalid or cannot be parsed.
    """
    parsed = urlparse(url)
    # s3://bucket/key URIs: bucket is the netloc, key is the path.
    if parsed.scheme == "s3" and parsed.netloc:
        return parsed.netloc, parsed.path.lstrip("/")
    # Virtual-hosted style: bucket is the leading subdomain.
    if parsed.netloc.endswith(".amazonaws.com"):
        parts = parsed.netloc.split(".")
        # Guard on len(parts) so a malformed host like ".amazonaws.com" cannot raise IndexError.
        if len(parts) >= 3 and parts[1] == "s3":
            bucket = parts[0]
            key = parsed.path.lstrip("/")
            return bucket, key
    # Path style: bucket is the first path segment.
    if parsed.netloc.startswith("s3.") and parsed.path:
        path_parts = parsed.path.lstrip("/").split("/", 1)
        if len(path_parts) == 2:
            return path_parts[0], path_parts[1]
    raise ValueError("Invalid or unsupported S3 URL format")
if __name__ == "__main__":
    import argparse

    # Minimal CLI wrapper around the helpers above.
    arg_parser = argparse.ArgumentParser(description="Upload and download files from S3 using boto.")
    arg_parser.add_argument("--url", help="S3 URL for the object (used with upload or download)")
    arg_parser.add_argument("--region", default=None, help="AWS region")
    arg_parser.add_argument("--endpoint", default=None, help="Custom S3 endpoint URL")
    arg_parser.add_argument("--upload", help="Path to a local file to upload")
    arg_parser.add_argument("--download", help="Local path to download a file to")
    arg_parser.add_argument("--multi", nargs="+", help="List of S3 keys to download")
    arg_parser.add_argument("--outdir", help="Directory to save downloaded files")
    opts = arg_parser.parse_args()

    client = get_client(region_name=opts.region, endpoint_url=opts.endpoint)

    if opts.upload:
        if not opts.url:
            raise ValueError("--url must be provided with --upload")
        bucket, key = parse_s3_url(opts.url)
        upload_file(client, bucket, opts.upload, key=key)
        print(f"Uploaded {opts.upload} to bucket {bucket} with key {key}.")

    if opts.download:
        if not opts.url:
            raise ValueError("--url must be provided with --download")
        bucket, key = parse_s3_url(opts.url)
        download_file(client, bucket, key, opts.download)
        print(f"Downloaded {key} from bucket {bucket} to {opts.download}.")

    if opts.multi:
        if not opts.outdir:
            raise ValueError("--outdir must be provided with --multi")
        # A multi-key download cannot be expressed through --url, which names a single object.
        raise NotImplementedError("Multi-download with URL parsing is not supported; please use bucket name and keys.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment