Skip to content

Instantly share code, notes, and snippets.

@brayevalerien
Last active August 1, 2025 06:18
Show Gist options
  • Save brayevalerien/507cb60b4cbc6e607d6eb67ca41fde5d to your computer and use it in GitHub Desktop.
Helper functions for downloading and uploading files to S3 using boto3. Assumes AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY are set in the environment.
"""
Helpers for downloading from and uploading to an S3 bucket.
Adapted from https://docs.runpod.io/serverless/storage/s3-api.
"""
import os
from typing import List, Optional
from urllib.parse import urlparse
from uuid import uuid4
import boto3
import requests
from botocore.exceptions import ClientError
def get_client(region_name: Optional[str] = None, endpoint_url: Optional[str] = None) -> boto3.client:
    """
    Create and return a boto3 S3 client configured via environment variables.

    Reads AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY from the environment.

    Args:
        region_name (str, optional): AWS region to configure the client for.
        endpoint_url (str, optional): Custom S3-compatible endpoint URL (to use
            Runpod datacenters, set to https://REGION_NAME-s3api.runpod.io/).

    Returns:
        boto3.client: A configured S3 client instance.

    Raises:
        EnvironmentError: If AWS credentials are missing from the environment.
        RuntimeError: If client creation fails for another reason.
    """
    access_key = os.environ.get("AWS_ACCESS_KEY_ID")
    secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
    if not access_key or not secret_key:
        raise EnvironmentError("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY must be set in the environment.")
    try:
        return boto3.client(
            "s3",
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region_name,
            endpoint_url=endpoint_url,
        )
    except Exception as e:
        # Chain the original exception so the root cause stays visible in tracebacks.
        raise RuntimeError(f"Error creating S3 client: {e}") from e
def upload_file(s3_client: boto3.client, bucket_name: str, file_path: str, key: Optional[str] = None) -> str:
    """
    Upload a local file to an S3 bucket.

    Args:
        s3_client (boto3.client): An S3 client instance.
        bucket_name (str): Name of the S3 bucket.
        file_path (str): Path to the local file to upload.
        key (str, optional): The S3 object key under which the file will be stored.
            If None, uses the basename of file_path.

    Returns:
        str: a URL pointing to the uploaded file (bucket must be public for the URL to work)

    Raises:
        FileNotFoundError: If the specified local file does not exist.
        RuntimeError: If the upload operation fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Local file '{file_path}' does not exist.")
    object_key = key or os.path.basename(file_path)
    try:
        s3_client.upload_file(file_path, bucket_name, object_key)
        # Build the URL from object_key (not the raw `key` argument, which is
        # None when the caller omitted it) and use https, matching the URL
        # format returned by upload_image_from_url.
        return f"https://{bucket_name}.s3.{s3_client.meta.region_name}.amazonaws.com/{object_key}"
    except ClientError as e:
        raise RuntimeError(f"Failed to upload '{file_path}' to 's3://{bucket_name}/{object_key}': {e}") from e
def upload_image_from_url(
    s3_client: boto3.client,
    image_url: str,
    bucket_name: str,
    key: Optional[str] = None,
    key_prefix: str = "",
) -> str:
    """
    Downloads an image from a URL and uploads it to S3.

    Args:
        s3_client (boto3.client): A configured S3 client.
        image_url (str): Public HTTP(s) URL of the image.
        bucket_name (str): Name of the destination S3 bucket.
        key (str, optional): S3 object key. If not provided, a UUID-based key will be generated.
        key_prefix (str, optional): Optional prefix for the key (e.g., "uploads/").

    Returns:
        str: Public S3 URL to the uploaded image.

    Raises:
        RuntimeError: If the image download or upload fails.
    """
    response = requests.get(image_url, timeout=10)
    if response.status_code != 200:
        raise RuntimeError(f"Failed to download image from {image_url}: {response.status_code}")
    content_type = response.headers.get("Content-Type", "application/octet-stream")
    # Strip any media-type parameters (e.g. "image/png; charset=utf-8") before
    # deriving the file extension, otherwise the parameters leak into the key.
    media_type = content_type.split(";")[0].strip()
    extension = media_type.split("/")[-1] if "/" in media_type else "bin"
    object_key = key or f"{key_prefix}{uuid4()}.{extension}"
    try:
        s3_client.put_object(
            Bucket=bucket_name,
            Key=object_key,
            Body=response.content,
            ContentType=content_type,
        )
    except ClientError as e:
        raise RuntimeError(f"Failed to upload image to s3://{bucket_name}/{object_key}: {e}") from e
    return f"https://{bucket_name}.s3.{s3_client.meta.region_name}.amazonaws.com/{object_key}"
def download_file(s3_client: boto3.client, bucket_name: str, key: str, destination_path: str) -> None:
    """
    Download a file from an S3 bucket to a local path.

    Args:
        s3_client (boto3.client): An S3 client instance.
        bucket_name (str): Name of the S3 bucket.
        key (str): S3 object key to download.
        destination_path (str): Local filesystem path to save the downloaded file.

    Raises:
        RuntimeError: If the download operation fails.
    """
    directory = os.path.dirname(destination_path)
    if directory:
        # exist_ok makes the prior isdir() check unnecessary.
        os.makedirs(directory, exist_ok=True)
    try:
        s3_client.download_file(bucket_name, key, destination_path)
    except ClientError as e:
        raise RuntimeError(f"Failed to download 's3://{bucket_name}/{key}' to '{destination_path}': {e}") from e
def download_files(s3_client: boto3.client, bucket_name: str, keys: List[str], destination_dir: str) -> None:
    """
    Download multiple objects from S3 into a local directory.

    Attempts every key even if some fail, then reports all failures at once.

    Args:
        s3_client (boto3.client): An S3 client instance.
        bucket_name (str): Name of the S3 bucket.
        keys (List[str]): List of S3 object keys to download.
        destination_dir (str): Local directory to save all downloaded files.

    Raises:
        RuntimeError: If any download operation fails.
    """
    if not os.path.isdir(destination_dir):
        os.makedirs(destination_dir, exist_ok=True)
    failures: List[str] = []
    for object_key in keys:
        target_path = os.path.join(destination_dir, os.path.basename(object_key))
        try:
            download_file(s3_client, bucket_name, object_key, target_path)
        except RuntimeError as err:
            # Collect the message and keep going; report everything at the end.
            failures.append(str(err))
    if failures:
        raise RuntimeError("Some files failed to download:\n" + "\n".join(failures))
def parse_s3_url(url: str) -> tuple[str, str]:
    """
    Parse an S3 URL and return the bucket name and key.

    Supported formats:
        - s3://bucket/key
        - Virtual-hosted style: https://bucket.s3.REGION.amazonaws.com/key
        - Path style: https://s3.REGION.amazonaws.com/bucket/key

    Note: virtual-hosted URLs for bucket names containing dots are not
    supported and fall through to ValueError.

    Args:
        url (str): The S3 URL.

    Returns:
        tuple: (bucket_name, key)

    Raises:
        ValueError: If the URL format is invalid or cannot be parsed.
    """
    parsed = urlparse(url)
    # s3://bucket/key scheme: netloc is the bucket, path is the key.
    if parsed.scheme == "s3":
        bucket = parsed.netloc
        key = parsed.path.lstrip("/")
        if bucket and key:
            return bucket, key
        raise ValueError("Invalid or unsupported S3 URL format")
    # check for virtual-hosted style: bucket is subdomain
    if parsed.netloc.endswith(".amazonaws.com"):
        parts = parsed.netloc.split(".")
        if len(parts) > 1 and parts[1] == "s3":
            bucket = parts[0]
            key = parsed.path.lstrip("/")
            return bucket, key
    # check for path-style: bucket in the path
    if parsed.netloc.startswith("s3.") and parsed.path:
        path_parts = parsed.path.lstrip("/").split("/", 1)
        if len(path_parts) == 2:
            return path_parts[0], path_parts[1]
    raise ValueError("Invalid or unsupported S3 URL format")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Upload and download files from S3 using boto.")
    parser.add_argument("--url", help="S3 URL for the object (used with upload or download)")
    parser.add_argument("--region", default=None, help="AWS region")
    parser.add_argument("--endpoint", default=None, help="Custom S3 endpoint URL")
    parser.add_argument("--upload", help="Path to a local file to upload")
    parser.add_argument("--download", help="Local path to download a file to")
    parser.add_argument("--multi", nargs="+", help="List of S3 keys to download")
    parser.add_argument("--bucket", help="Bucket name (required with --multi, since --url is not used there)")
    parser.add_argument("--outdir", help="Directory to save downloaded files")
    args = parser.parse_args()

    s3 = get_client(region_name=args.region, endpoint_url=args.endpoint)
    if args.upload:
        if not args.url:
            raise ValueError("--url must be provided with --upload")
        bucket, key = parse_s3_url(args.url)
        upload_file(s3, bucket, args.upload, key=key)
        print(f"Uploaded {args.upload} to bucket {bucket} with key {key}.")
    if args.download:
        if not args.url:
            raise ValueError("--url must be provided with --download")
        bucket, key = parse_s3_url(args.url)
        download_file(s3, bucket, key, args.download)
        print(f"Downloaded {key} from bucket {bucket} to {args.download}.")
    if args.multi:
        # Multiple keys are given directly, so --url is not used here;
        # the bucket must be named explicitly with --bucket.
        if not args.outdir:
            raise ValueError("--outdir must be provided with --multi")
        if not args.bucket:
            raise ValueError("--bucket must be provided with --multi")
        download_files(s3, args.bucket, args.multi, args.outdir)
        print(f"Downloaded {len(args.multi)} files from bucket {args.bucket} to {args.outdir}.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment