Skip to content

Instantly share code, notes, and snippets.

@Sachaa-Thanasius
Forked from kenballus/rfc_3986_regex.py
Created June 18, 2024 00:41
Show Gist options
  • Save Sachaa-Thanasius/4e17c00dfed46143275844031766784c to your computer and use it in GitHub Desktop.
Save Sachaa-Thanasius/4e17c00dfed46143275844031766784c to your computer and use it in GitHub Desktop.
A direct translation of RFC 3986's collected ABNF into Python regexes.
# Basic character classes from RFC 3986. Each pattern is wrapped in a capturing
# group so it can be interpolated into larger patterns as a single unit.

# unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
UNRESERVED_PAT: str = r"([A-Za-z0-9\-\._~])"
# pct-encoded = "%" HEXDIG HEXDIG
# ABNF string literals are case-insensitive (RFC 5234, section 2.3), so HEXDIG
# covers "a"-"f" as well as "A"-"F"; RFC 3986, section 2.1 declares the two
# cases equivalent. Accept both, otherwise inputs such as "%2f" are rejected.
PCT_ENCODED_PAT: str = r"(%[A-Fa-f0-9][A-Fa-f0-9])"
# sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
SUB_DELIMS_PAT: str = r"([!\$&'\(\)\*\+,;=])"
# pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
PCHAR_PAT: str = rf"({UNRESERVED_PAT}|{PCT_ENCODED_PAT}|{SUB_DELIMS_PAT}|:|@)"
# query = *( pchar / "/" / "?" )
# Zero or more repetitions of: a path character, a slash, or a question mark.
QUERY_PAT: str = "(" + PCHAR_PAT + r"|/|\?)*"
# fragment = *( pchar / "/" / "?" )
# The fragment production is textually the same as the query production.
FRAGMENT_PAT: str = "(" + PCHAR_PAT + r"|/|\?)*"
# scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
SCHEME_PAT: str = (
    "("
    r"[A-Za-z]"  # must start with a letter
    r"[A-Za-z0-9\+\-\.]*"  # then any run of letters, digits, "+", "-", "."
    ")"
)
# Path productions. Every non-empty pattern is a single capturing group built
# by plain string concatenation from the pieces it is defined in terms of.

# segment = *pchar
SEGMENT_PAT: str = "(" + PCHAR_PAT + "*)"
# segment-nz = 1*pchar
SEGMENT_NZ_PAT: str = "(" + PCHAR_PAT + "+)"
# path-absolute = "/" [ segment-nz *( "/" segment ) ]
PATH_ABSOLUTE_PAT: str = "".join((
    "(/(?:",
    SEGMENT_NZ_PAT,
    "(?:/",
    SEGMENT_PAT,
    ")*)?)",
))
# path-empty = 0<pchar>
# Matches nothing but the empty string, hence the empty pattern.
PATH_EMPTY_PAT: str = ""
# path-rootless = segment-nz *( "/" segment )
PATH_ROOTLESS_PAT: str = "".join((
    "(",
    SEGMENT_NZ_PAT,
    "(?:/",
    SEGMENT_PAT,
    ")*)",
))
# path-abempty = *( "/" segment )
PATH_ABEMPTY_PAT: str = "((?:/" + SEGMENT_PAT + ")*)"
# userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
# The alternatives are joined with "|" inside a non-capturing group that may
# repeat any number of times; the whole thing is one capturing group.
USERINFO_PAT: str = (
    "((?:"
    + "|".join((UNRESERVED_PAT, PCT_ENCODED_PAT, SUB_DELIMS_PAT, ":"))
    + ")*)"
)
# dec-octet = DIGIT ; 0-9
# / %x31-39 DIGIT ; 10-99
# / "1" 2DIGIT ; 100-199
# / "2" %x30-34 DIGIT ; 200-249
# / "25" %x30-35 ; 250-255
# Python's regex alternation is leftmost-first, not longest-match. The
# alternatives are therefore listed from longest to shortest so that an
# unanchored match (re.match / re.search) consumes the whole octet instead of
# stopping after its first digit. The accepted language is unchanged.
DEC_OCTET_PAT: str = r"(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9][0-9]|[0-9])"
# IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
IPV4ADDRESS_PAT: str = rf"({DEC_OCTET_PAT}\.{DEC_OCTET_PAT}\.{DEC_OCTET_PAT}\.{DEC_OCTET_PAT})"
# h16 = 1*4HEXDIG
# HEXDIG is case-insensitive in ABNF (RFC 5234, section 2.3), so lowercase
# "a"-"f" must be accepted too; otherwise addresses written in the usual
# lowercase form (e.g. "2001:db8::1") fail to match.
H16_PAT: str = r"([0-9A-Fa-f]{1,4})"
# ls32 = ( h16 ":" h16 ) / IPv4address
# Either two colon-separated h16 groups or a dotted IPv4 address.
LS32_PAT: str = "(" + H16_PAT + ":" + H16_PAT + "|" + IPV4ADDRESS_PAT + ")"
# IPv6address = 6( h16 ":" ) ls32
# / "::" 5( h16 ":" ) ls32
# / [ h16 ] "::" 4( h16 ":" ) ls32
# / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
# / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
# / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32
# / [ *4( h16 ":" ) h16 ] "::" ls32
# / [ *5( h16 ":" ) h16 ] "::" h16
# / [ *6( h16 ":" ) h16 ] "::"
# One template per ABNF alternative, in the same order as above. The
# placeholders {h16} and {ls32} are filled with H16_PAT and LS32_PAT; doubled
# braces are literal regex quantifier braces. The filled-in alternatives are
# joined with "|" and wrapped in a single capturing group.
IPV6ADDRESS_PAT: str = "(" + "|".join(
    template.format(h16=H16_PAT, ls32=LS32_PAT)
    for template in (
        r"(?:{h16}:){{6}}{ls32}",
        r"::(?:{h16}:){{5}}{ls32}",
        r"(?:{h16})?::(?:{h16}:){{4}}{ls32}",
        r"(?:(?:{h16}:){{0,1}}{h16})?::(?:{h16}:){{3}}{ls32}",
        r"(?:(?:{h16}:){{0,2}}{h16})?::(?:{h16}:){{2}}{ls32}",
        r"(?:(?:{h16}:){{0,3}}{h16})?::(?:{h16}:){{1}}{ls32}",
        r"(?:(?:{h16}:){{0,4}}{h16})?::{ls32}",
        r"(?:(?:{h16}:){{0,5}}{h16})?::{h16}",
        r"(?:(?:{h16}:){{0,6}}{h16})?::",
    )
) + ")"
# IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
# IPVFUTURE_PAT: str = rf"v[0-9A-F]+\.({UNRESERVED_PAT}|{SUB_DELIMS_PAT}|:)+"
# IP-literal = "[" ( IPv6address / IPvFuture ) "]"
# IP_LITERAL_PAT: str = rf"(\[(?:{IPV6ADDRESS_PAT}|{IPVFUTURE_PAT})\])"
# IPvFuture is often unimplemented, so it is left out here: an IP literal is
# just an IPv6 address enclosed in square brackets.
IP_LITERAL_PAT: str = r"(\[" + IPV6ADDRESS_PAT + r"\])"
# reg-name = *( unreserved / pct-encoded / sub-delims )
REG_NAME_PAT: str = (
    "((?:"
    + "|".join((UNRESERVED_PAT, PCT_ENCODED_PAT, SUB_DELIMS_PAT))
    + ")*)"
)
# host = IP-literal / IPv4address / reg-name
# The alternatives are tried in the order listed in the grammar.
HOST_PAT: str = (
    "("
    + "|".join((IP_LITERAL_PAT, IPV4ADDRESS_PAT, REG_NAME_PAT))
    + ")"
)
# port = *DIGIT
# PORT_PAT: str = r"([0-9]*)"
# WHATWG version (fits in uint16_t): any number of leading zeros followed by a
# value in 0-65535 (or nothing at all, since the RFC allows an empty port).
# The 60000-65535 alternatives come first so that an unanchored match takes
# the full five digits; the last alternative covers 0-59999 (an optional
# leading 1-5 plus up to four more digits), which includes the five-digit
# ports 10000-59999.
PORT_PAT: str = r"(0*(?:6553[0-5]|655[0-2][0-9]|65[0-4][0-9]{2}|6[0-4][0-9]{3}|[1-5]?[0-9]{0,4}))"
# userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
# Re-stated next to its only consumer; the resulting pattern is built from the
# same pieces in the same order as the earlier USERINFO_PAT definition.
USERINFO_PAT: str = (
    "((?:"
    + "|".join((UNRESERVED_PAT, PCT_ENCODED_PAT, SUB_DELIMS_PAT, ":"))
    + ")*)"
)
# authority = [ userinfo "@" ] host [ ":" port ]
# Note that the optional ":" port part is a capturing group, unlike the
# optional userinfo part.
AUTHORITY_PAT: str = "".join((
    "((?:",
    USERINFO_PAT,
    "@)?",
    HOST_PAT,
    "(:",
    PORT_PAT,
    ")?)",
))
# hier-part = "//" authority path-abempty
# / path-absolute
# / path-rootless
# / path-empty
# One entry per grammar alternative, joined with "|" inside a capturing group.
# PATH_EMPTY_PAT contributes a trailing empty alternative.
HIER_PART_PAT: str = "(" + "|".join((
    "(?://" + AUTHORITY_PAT + PATH_ABEMPTY_PAT + ")",
    PATH_ABSOLUTE_PAT,
    PATH_ROOTLESS_PAT,
    PATH_EMPTY_PAT,
)) + ")"
# URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
# Assembled piece by piece; the query and fragment parts are each wrapped in
# an optional non-capturing group together with their leading delimiter.
URI_PAT: str = "".join((
    "(",
    SCHEME_PAT,
    ":",
    HIER_PART_PAT,
    r"(?:\?",
    QUERY_PAT,
    ")?",
    "(?:#",
    FRAGMENT_PAT,
    ")?",
    ")",
))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment