Skip to content

Instantly share code, notes, and snippets.

@ajorg
Last active June 3, 2026 02:17
Show Gist options
  • Select an option

  • Save ajorg/bb14202ad505126898ca91dcdbc05aa6 to your computer and use it in GitHub Desktop.

Select an option

Save ajorg/bb14202ad505126898ca91dcdbc05aa6 to your computer and use it in GitHub Desktop.
Wire copies something off the Internet into S3, in a Lambda function
# Copyright Andrew Jorgensen
# SPDX-License-Identifier: MIT
"""Copies data from URL sources to S3 objects"""
import hashlib
import json
import os
import tempfile
from base64 import b64encode
from decimal import Decimal
from urllib.parse import urlparse
from urllib.request import Request, urlopen
import boto3
from botocore.exceptions import ClientError
# Identity of this tool, used to build the default User-Agent below.
NAME = "Wire"
VERSION = "1.0"

# Where to find this code; an optional CONTACT from the environment is
# appended so the User-Agent tells origin servers who to reach.
SOURCE = os.getenv(
    "SOURCE", "https://gist.github.com/ajorg/bb14202ad505126898ca91dcdbc05aa6"
)
CONTACT = os.getenv("CONTACT")
if CONTACT:
    SOURCE = f"{SOURCE}, {CONTACT}"

# Base request headers; USER_AGENT in the environment overrides the default.
HEADERS = {
    "User-Agent": os.getenv("USER_AGENT", f"{NAME}/{VERSION} ({SOURCE})"),
}

# JSON objects read from the environment, one per schedule; each defaults
# to an empty mapping when its variable is unset.
OBJECTS_WEEKLY, OBJECTS_DAILY, OBJECTS_HOURLY = (
    json.loads(os.getenv(variable, "{}"))
    for variable in ("OBJECTS_WEEKLY", "OBJECTS_DAILY", "OBJECTS_HOURLY")
)

# Number of bytes read from the origin per iteration (default 1 MiB).
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", 2**20))

# Optional SNS topic that receives error notifications.
TOPIC_ARN = os.getenv("TOPIC_ARN")

# Clients are created once at import time and shared by every call.
S3 = boto3.client("s3")
SNS = boto3.client("sns")
def wire(destination, url, *, timeout=60):
    """Copies data from a URL source to an S3 object

    Args:
        destination: URL naming the target object; its network location is
            used as the bucket and its path (leading "/" removed) as the key.
        url: Source URL to download.
        timeout: Seconds to wait on each blocking network operation while
            talking to the origin. Pass None to wait indefinitely.

    Returns:
        None. Progress and errors are reported with print().

    If the target object already exists, its "etag-origin" and
    "last-modified-origin" metadata are sent as If-None-Match and
    If-Modified-Since so an unchanged origin can be detected. The
    download is spooled to a temporary file while its MD5 is computed;
    when the hex digest equals the existing object's ETag the upload is
    skipped.

    Exceptions raised while downloading or uploading are not propagated:
    one carrying ``code == 304`` is reported as "Not Modified", anything
    else is printed and, when TOPIC_ARN is set, published to that SNS topic.
    """
    print(f"Wiring {url} to {destination}")
    parsed_d = urlparse(destination)
    key = parsed_d.path.lstrip("/")
    bucket = parsed_d.netloc

    content_types = {
        ".csv": "text/csv",
        ".json": "application/json",
        ".zip": "application/zip",
    }
    # Compare the extension case-insensitively so "DATA.CSV" is still text/csv.
    content_type = content_types.get(
        os.path.splitext(key)[1].lower(), "application/octet-stream"
    )

    headers = HEADERS.copy()
    try:
        obj = S3.head_object(Bucket=bucket, Key=key)
    except ClientError:
        # No usable existing object: fetch unconditionally and always upload.
        etag = None
    else:
        metadata = obj.get("Metadata", {})
        if v := metadata.get("etag-origin"):
            headers["If-None-Match"] = v
        if v := metadata.get("last-modified-origin"):
            headers["If-Modified-Since"] = v
        # S3 returns the ETag wrapped in double quotes.
        etag = obj.get("ETag", "").strip('"')

    try:
        # Without a timeout a stalled origin would block until the whole
        # invocation is killed, starving every remaining object.
        with urlopen(Request(url, headers=headers), timeout=timeout) as response:
            md5 = hashlib.md5()
            size_bytes = 0
            with tempfile.TemporaryFile() as f:
                # Spool to disk in chunks so large bodies never sit in memory.
                while chunk := response.read(CHUNK_SIZE):
                    f.write(chunk)
                    md5.update(chunk)
                    size_bytes += len(chunk)
                digest = md5.hexdigest()
                print(f"{Decimal(size_bytes) / 2**20:.2f}MiB, md5:{digest}")
                if etag == digest:
                    # Same bytes as the stored object; nothing to upload.
                    print(f"Matched {key}")
                    return
                f.seek(0)
                # Remember the origin's validators for the next conditional GET.
                metadata = {"Location": url}
                if v := response.headers.get("ETag"):
                    metadata["ETag-Origin"] = v
                if v := response.headers.get("Last-Modified"):
                    metadata["Last-Modified-Origin"] = v
                S3.put_object(
                    Bucket=bucket,
                    Key=key,
                    ACL="public-read",
                    Body=f,
                    ContentLength=size_bytes,
                    ContentMD5=b64encode(md5.digest()).decode("us-ascii"),
                    ContentType=content_type,
                    Metadata=metadata,
                )
                print(f"Put {key}")
    except Exception as e:
        # Deliberately broad: one failing object is reported, not fatal.
        # urllib raises HTTPError (which has .code) for a 304 response.
        if getattr(e, "code", None) == 304:
            print(f"Not Modified {key}")
        else:
            error = f"Error {url}: {e}"
            print(error)
            if TOPIC_ARN:
                SNS.publish(TopicArn=TOPIC_ARN, Subject=NAME, Message=error)
def lambda_handler(event, context):
    """AWS Lambda entry point: wire every object selected by ``event``.

    When ``event`` has a "resources" entry, the second "/"-separated
    segment of its first item (presumably the name of the scheduled rule
    that fired -- confirm against the deployed rules) selects the weekly,
    daily or hourly object set. Otherwise ``event`` itself is taken as
    the destination -> URL mapping.

    Raises:
        ValueError: if the segment taken from "resources" is not a
            recognised schedule name.
    """
    print(json.dumps(event, sort_keys=True))

    if "resources" in event:
        rule = event["resources"][0]
        schedule = rule.split("/")[1]
        if schedule == "weekly":
            objects = OBJECTS_WEEKLY
        elif schedule == "1pm-UTC" or schedule == "daily":
            objects = OBJECTS_DAILY
        elif schedule == "hourly":
            objects = OBJECTS_HOURLY
        else:
            raise ValueError(f"Unknown event source {rule}")
    else:
        # Assume raw invocation: the payload is the mapping itself.
        objects = event

    print(json.dumps(objects, sort_keys=True))

    for target, origin in objects.items():
        # A failure on one object must not stop the rest of the batch.
        try:
            wire(target, origin)
        except Exception as exc:
            print(exc)
if __name__ == "__main__":
    # Command-line use: the single positional argument is a JSON document
    # passed to the handler exactly like a raw Lambda invocation payload.
    import argparse

    cli = argparse.ArgumentParser(prog=NAME)
    cli.add_argument("objects")
    payload = json.loads(cli.parse_args().objects)
    lambda_handler(payload, None)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment