Skip to content

Instantly share code, notes, and snippets.

@rajivnarayan
Last active July 20, 2024 19:04
Show Gist options
  • Save rajivnarayan/c38f01b89de852b3e7d459cfde067f3f to your computer and use it in GitHub Desktop.
Save rajivnarayan/c38f01b89de852b3e7d459cfde067f3f to your computer and use it in GitHub Desktop.
A Pydantic model for validating S3 URLs
import re

from pydantic import BaseModel, ConfigDict, Field, ValidationError, computed_field
class S3Path(BaseModel):
    """Pydantic model that validates an ``s3://bucket/key`` URL.

    The bucket part is checked against the bucket naming rules encoded in
    ``_s3_pattern``. The optional key part may only consist of ``/``-separated,
    non-empty segments of letters, digits, ``.``, ``_`` and ``-``, with an
    optional trailing ``/``.

    Attributes:
        url: The full S3 URL (8 to 1023 characters). Leading and trailing
            whitespace is stripped via ``str_strip_whitespace``.
        bucket: Computed field; the bucket name taken from ``url``.
        key: Computed field; everything after the first ``/`` that follows
            the bucket name (empty string when there is nothing).
    """

    # ``strip_whitespace`` is not a ``Field()`` argument in Pydantic v2 (it is
    # treated as a deprecated extra keyword and has no effect), so whitespace
    # stripping is enabled through the model configuration instead.
    model_config = ConfigDict(str_strip_whitespace=True)

    # https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html
    _s3_pattern = re.compile(
        r'^s3://'
        # Bucket name must not start with a reserved prefix. The lookahead
        # runs right after "s3://", so no anchor is needed ("sthree-" also
        # covers "sthree-configurator").
        r'(?!xn--|sthree-)'
        # Bucket name must not end with -s3alias. "[^/]*" keeps the check
        # inside the bucket part so that keys ending in -s3alias stay valid.
        r'(?![^/]*-s3alias(?:/|$))'
        # No two adjacent periods. This is a bucket rule, but it is applied to
        # the whole URL, so ".." is rejected inside keys as well.
        r'(?!.*\.\.)'
        # Bucket: 3-63 characters, lowercase letters, digits, "." and "-",
        # starting and ending with a letter or digit.
        r'[a-z0-9][a-z0-9.-]{1,61}[a-z0-9]'
        # Optional key: "/" followed by non-empty segments separated by single
        # slashes, with an optional trailing slash. Written without nested
        # quantifiers so a non-matching key cannot cause exponential
        # backtracking.
        r'(?:/(?:[a-zA-Z0-9._-]+(?:/[a-zA-Z0-9._-]+)*/?)?)?$'
    )

    url: str = Field(
        ...,
        pattern=_s3_pattern,
        min_length=8,  # len('s3://') plus the 3-character minimum bucket name
        max_length=1023,
    )

    def s3_url_parts(self):
        """Split ``url`` into its bucket and key.

        Returns:
            A ``(bucket, key)`` tuple of strings. ``key`` is the text after
            the first ``/`` following the bucket name, or ``''`` if the URL
            has no such text.
        """
        # Drop the leading "s3://" and split on the first "/" only; any
        # further slashes belong to the key.
        bucket, _, key = self.url[len('s3://'):].partition('/')
        return (bucket, key)

    @computed_field
    @property
    def bucket(self) -> str:
        """Add bucket name as a computed property"""
        return self.s3_url_parts()[0]

    @computed_field
    @property
    def key(self) -> str:
        """Add key as a computed property"""
        return self.s3_url_parts()[1]
# URLs that the model is expected to accept.
good_urls = [
    's3://my-bucket/',                                     # bucket only, trailing slash
    's3://my.bucket.name/my-key',                          # periods inside the bucket name
    's3://123-bucket/another/key/path',                    # bucket starting with a digit
    's3://bucket/this/is/a/folder/',                       # key ending with a slash
    's3://my-bucket/some/key/path/file.txt',               # nested key with an extension
    's3://10-bucket-01/nested.folder/structure/file.jpg',  # periods inside key segments
    's3://my-bucket/with-key',                             # hyphen inside the key
]

# URLs that the model is expected to reject.
bad_urls = [
    's3://',                                         # no bucket at all
    's3://my',                                       # bucket shorter than 3 characters
    'https://my-bucket/',                            # wrong scheme
    's3://bucket-name/with/white space/in/key',      # space inside the key
    's3:/my-bucket/',                                # single slash after the scheme
    's3://my_bucket/',                               # underscore in the bucket name
    's3://My-Bucket/',                               # uppercase in the bucket name
    's3://123.bucketname./invalid',                  # bucket ends with a period
    's3://123.bucketname.-/invalid',                 # bucket ends with ".-"
    's3://123.bucket..name/invalid',                 # two adjacent periods
    's3://-bucket-starts-with-hyphen/',              # bucket starts with a hyphen
    's3://bucket-ends-with-hyphen-/',                # bucket ends with a hyphen
    's3://my-bucket/ends/with/slash//',              # empty key segment ("//")
    's3://toolongnameofthebucketwhichexceedssixtyfourcharacterslongforthisexample/',  # bucket longer than 63 characters
    's3://my-bucket/?query=parameters',              # "?" and "=" are not allowed in the key
    's3://my-bucket/this/path/has/a/tab/char\there',  # tab character inside the key
]
# Self-check: run every sample URL through S3Path and summarise the mismatches.

# Known-good URLs must validate; print the dumped model for each one.
print(f'Validating {len(good_urls)} good urls')
good_fails = []
for url in good_urls:
    try:
        print(S3Path(url=url).model_dump())
    except Exception as e:
        # Any error on a known-good URL is reported and recorded, not raised,
        # so the remaining URLs are still checked.
        print(f"Error for good {url}: {e}")
        good_fails.append(url)

# Known-bad URLs must be rejected with a ValidationError.
print(f'Validating {len(bad_urls)} bad urls')
bad_fails = []
for url in bad_urls:
    try:
        model = S3Path(url=url)
    except ValidationError:
        # Expected outcome. Only validation errors are swallowed here so that
        # an unrelated bug is not mistaken for a correct rejection.
        continue
    print(model.model_dump())
    print(f'Error bad url passes : {url}')
    bad_fails.append(url)

# Both lists are empty when the model behaves as expected.
print(f'Good urls that failed validation: {good_fails}')
print(f'Bad urls that passed validation: {bad_fails}')
@rajivnarayan
Copy link
Author

See pydantic/pydantic#4271 for discussions on this issue

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment