Skip to content

Instantly share code, notes, and snippets.

@lilliemarck
Last active October 16, 2021 03:19
Show Gist options
  • Save lilliemarck/4515664 to your computer and use it in GitHub Desktop.
Save lilliemarck/4515664 to your computer and use it in GitHub Desktop.
URI Regular Expression
/**
* Regex for validating a URI or URI reference as defined in RFC 3986.
*
* A URI reference is either a URI or a relative reference. A relative
* reference is a URI without the scheme part. They are otherwise the same
* except that the first segment of a relative path can't contain a ':'
* (compare PATH_NOSCHEME with PATH_ROOTLESS).
*
* This syntax does not have IPv6 address support and does not include the IPv4
* rule because IPv4 addresses happen to be a subset of registered names.
* Although the final regex can be made a little bit more compact the intent is
* to follow the rules as closely as possible for readability and not introduce
* any new rules.
*
* Every macro expands to a sequence of adjacent string literals, which the
* compiler concatenates into a single pattern string. None of the patterns is
* anchored; to validate a whole string, wrap the pattern yourself, e.g.
* "^" URI_REFERENCE "$".
*/

/* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] */
#define URI SCHEME ":" HIER_PART "(\\?" QUERY ")?(#" FRAGMENT ")?"
#define HIER_PART "(//" AUTHORITY PATH_ABEMPTY "|" PATH_ABSOLUTE "|" PATH_ROOTLESS "|" PATH_EMPTY ")"

/*
 * URI-reference = URI / relative-ref
 *
 * The alternation is parenthesized so that anything concatenated around the
 * macro (anchors in particular) applies to both alternatives instead of
 * binding to only the first or the last one.
 */
#define URI_REFERENCE "(" URI "|" RELATIVE_REF ")"

/* relative-ref = relative-part [ "?" query ] [ "#" fragment ] */
#define RELATIVE_REF RELATIVE_PART "(\\?" QUERY ")?(#" FRAGMENT ")?"
#define RELATIVE_PART "(//" AUTHORITY PATH_ABEMPTY "|" PATH_ABSOLUTE "|" PATH_NOSCHEME "|" PATH_EMPTY ")"

#define SCHEME "[A-Za-z][A-Za-z0-9+.-]*"

/* authority = [ userinfo "@" ] host [ ":" port ] */
#define AUTHORITY "(" USERINFO "@)?" HOST "(:" PORT ")?"
#define USERINFO "(" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|:)*"
/* HOST is the reg-name rule only; it may match the empty string. */
#define HOST "(" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS ")*"
#define PORT "[0-9]*"

/* The path flavours. */
#define PATH_ABEMPTY "(/" SEGMENT ")*"
#define PATH_ABSOLUTE "/(" SEGMENT_NZ "(/" SEGMENT ")*)?"
#define PATH_NOSCHEME SEGMENT_NZ_NC "(/" SEGMENT ")*"
#define PATH_ROOTLESS SEGMENT_NZ "(/" SEGMENT ")*"
/* An empty group: matches the empty string. */
#define PATH_EMPTY "()"

/* Path segments: possibly empty, non-empty, and non-empty without ':'. */
#define SEGMENT PCHAR "*"
#define SEGMENT_NZ PCHAR "+"
#define SEGMENT_NZ_NC "(" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|@)+"

#define PCHAR "(" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|[:@])"
#define QUERY "(" PCHAR "|[/?])*"
#define FRAGMENT "(" PCHAR "|[/?])*"

/* Character-level rules. */
#define PCT_ENCODED "%[0-9A-Fa-f]{2}"
#define UNRESERVED "[A-Za-z0-9._~-]"
#define SUB_DELIMS "[!$&'()*+,;=]"
/**
* Syntax for a URI as defined in RFC3986.
*
* This syntax does not have IPv6 address support and does not include the IPv4
* rule because IPv4 addresses happens to be a subset of registered names.
*/
#define URI SCHEME ":" HIER_PART "(?:\\?" QUERY ")?(?:#" FRAGMENT ")?"
#define HIER_PART "(?://" AUTHORITY PATH_ABEMPTY "|" PATH_ABSOLUTE "|" PATH_ROOTLESS "|" PATH_EMPTY ")"
#define SCHEME "(?<scheme>[[:alpha:]][[:alnum:]+.-]*)"
#define AUTHORITY "(?<authority>(?:" USERINFO "@)?" HOST "(?::" PORT ")?)"
#define USERINFO "(?<userinfo>(?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|:)*)"
#define HOST "(?<host>(?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS ")*)"
#define PORT "(?<port>[0-9]*)"
#define PATH_ABEMPTY "(?<path>(?:/" SEGMENT ")*)"
#define PATH_ABSOLUTE "(?<path>/(?:" SEGMENT_NZ "(?:/" SEGMENT ")*)?)"
#define PATH_ROOTLESS "(?<path>" SEGMENT_NZ "(?:/" SEGMENT ")*)"
#define PATH_EMPTY "(?<path>)"
#define SEGMENT PCHAR "*"
#define SEGMENT_NZ PCHAR "+"
#define PCHAR "(?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|[:@])"
#define QUERY "(?<query>(?:" PCHAR "|[/?])*)"
#define FRAGMENT "(?<fragment>(?:" PCHAR "|[/?])*)"
#define PCT_ENCODED "%[[:xdigit:]]{2}"
#define UNRESERVED "[[:alnum:]._~-]"
#define SUB_DELIMS "[!$&'()*+,;=]"
@Miglecz
Copy link

Miglecz commented Sep 18, 2019

Evaluated URI reference:
[A-Za-z][A-Za-z0-9+.-]*:(//(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|:)*@)?([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=])*(:[0-9]*)?(/([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])*)*|/(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])+(/([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])*)*)?|([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])+(/([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])*)*|())(\?(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])|[/?])*)?(#(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])|[/?])*)?|(//(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|:)*@)?([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=])*(:[0-9]*)?(/([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])*)*|/(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])+(/([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])*)*)?|([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|@)+(/([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])*)*|())(\?(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])|[/?])*)?(#(([A-Za-z0-9._~-]|%[0-9A-Fa-f]{2}|[!$&'()*+,;=]|[:@])|[/?])*)?

Visualised:
link

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment