-
-
Save greggirwin/207149d46441cd48a1426e60926a7d25 to your computer and use it in GitHub Desktop.
Red [ | |
title: "RFC3986 URL parser" | |
file: %url-parser.red | |
author: "@greggirwin" | |
date: 03-Oct-2018 | |
notes: { | |
Reference: https://tools.ietf.org/html/rfc3986#page-16 | |
Most rule names are taken from the RFC, with the goal of | |
making it easy to compare to the reference. Some rules | |
are simplified in this version (e.g. IP address literals). | |
Where pct-encoded rules are listed in the RFC, they are | |
omitted from parse rules here, as the input is dehexed | |
before being parsed. | |
Relative URI path references are not yet supported. | |
} | |
] | |
url-parser: object [ | |
;-- Parse Variables | |
=scheme: =user-info: =host: =port: =path: =query: =fragment: none | |
vars: [=scheme =user-info =host =port =path =query =fragment] | |
;-- General Character Sets | |
alpha: charset [#"a" - #"z" #"A" - #"Z"] | |
digit: charset "0123456789" | |
alpha-num: union alpha digit | |
;-- URL Character Sets | |
; The purpose of reserved characters is to provide a set of delimiting | |
; characters that are distinguishable from other data within a URI. | |
gen-delims: charset ":/?#[]@" | |
sub-delims: charset "!$&'()*+,;=" | |
reserved: [gen-delims | sub-delims] | |
unreserved: compose [alpha | digit | (charset "-._~")] | |
; Helper func for extending alpha-num | |
alpha-num+: func [more [string!]][union alpha-num charset more] | |
scheme-char: alpha-num+ "+-." | |
;-- URL Grammar | |
url-rules: [scheme-part hier-part opt query opt fragment] | |
scheme-part: [copy =scheme [alpha some scheme-char] #":"] | |
hier-part: ["//" authority path-abempty | path-absolute | path-rootless | path-empty] | |
; The authority component is preceded by a double slash ("//") and is | |
; terminated by the next slash ("/"), question mark ("?"), or number | |
; sign ("#") character, or by the end of the URI. | |
authority: [opt user-info host opt [":" port]] | |
; "user:password" format for user-info is deprecated. | |
user-info: [copy =user-info [any [unreserved | sub-delims | #":"] #"@"]] | |
; Host is not detailed per the RFC yet. It covers IPv6 addresses, which go in | |
; square brackets, making them a non-loadable URL in Red. They can also contain | |
; colons, which makes finding the port marker more involved. | |
IP-literal: [copy =IP-literal ["[" thru "]"]] ; simplified from [IPv6address | IPvFuture] | |
host: [ | |
IP-literal (=host: =IP-literal) | |
| copy =host any [unreserved | sub-delims] | |
] | |
port: [copy =port [1 5 digit]] | |
; path-abempty ; begins with "/" or is empty | |
; path-absolute ; begins with "/" but not "//" | |
; path-noscheme ; begins with a non-colon segment | |
; path-rootless ; begins with a segment | |
; path-empty ; zero characters | |
path-abempty: [copy =path any-segments | path-empty] ; (print ["path:" mold =path]) | |
path-absolute: [copy =path [#"/" opt [segment-nz any-segments]]] ; (print ["path-abs:" mold =path]) | |
;!! path-noscheme is only used in relative URIs, which aren't supported here yet. | |
;path-noscheme: [copy =path [segment-nz-nc any-segments]] ; (print ["path-no-scheme:" mold =path]) | |
path-rootless: [copy =path [segment-nz any-segments]] ; (print ["path-rootless:" mold =path]) | |
path-empty: [none] | |
any-segments: [any [#"/" segment]] | |
segment: [any pchar] | |
segment-nz: [some pchar] | |
segment-nz-nc: [some [unreserved | sub-delims | #"@"]] ; non-zero-length segment with no colon | |
pchar: [unreserved | sub-delims | #":" | #"@"] ; path characters | |
query: ["?" copy =query any [pchar | slash | #"?"]] | |
fragment: ["#" copy =fragment any [pchar | slash | #"?"]] | |
;-- Parse Function | |
parse-url: function [ | |
"Return object with URL components, or cause an error if not a valid URL" | |
url [url! string!] | |
/throw-error "Throw an error, instead of returning NONE." | |
/extern vars =path =host | |
][ | |
set vars none ; clear object level parse variables | |
either parse dehex url url-rules [ | |
if empty? =host [=host: none] | |
=path: either all [=path not empty? =path][ | |
split-path to file! =path | |
][ | |
[#[none] #[none]] | |
] | |
object [ | |
scheme: to word! =scheme | |
user-info: =user-info | |
host: =host | |
port: if =port [to integer! =port] | |
path: first =path | |
target: second =path | |
query: =query | |
fragment: =fragment | |
] | |
][ | |
if throw-error [ | |
make error! rejoin ["URL error: " url] | |
] | |
] | |
] | |
; Exported function (Rebol compatible name) | |
set 'decode-url function [ | |
"Decode a URL into an object containing its constituent parts" | |
url [url! string!] | |
][ | |
parse-url url | |
] | |
] | |
test-urls: reduce [ | |
foo:// ; no path | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: none | |
port: none | |
path: none | |
target: none | |
query: none | |
fragment: none | |
] | |
foo:/a/b/c ; path-absolute | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: none | |
port: none | |
path: %/a/b/ | |
target: %c | |
query: none | |
fragment: none | |
] | |
foo://example.com:8042/over/there?name=ferret#nose | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: "example.com" | |
port: 8042 | |
path: %/over/ | |
target: %there | |
query: "name=ferret" | |
fragment: "nose" | |
] | |
ftp://ftp.is.co.za/rfc/rfc1808.txt | |
object [ | |
scheme: 'ftp | |
user-info: none | |
host: "ftp.is.co.za" | |
port: none | |
path: %/rfc/ | |
target: %rfc1808.txt | |
query: none | |
fragment: none | |
] | |
http://www.ietf.org/rfc/rfc2396.txt | |
object [ | |
scheme: 'http | |
user-info: none | |
host: "www.ietf.org" | |
port: none | |
path: %/rfc/ | |
target: %rfc2396.txt | |
query: none | |
fragment: none | |
] | |
to url! "ldap://[2001:db8::7]/c=GB?objectClass?one" ; not loadable Red | |
object [ | |
scheme: 'ldap | |
user-info: none | |
host: "[2001:db8::7]" | |
port: none | |
path: %/ | |
target: %c=GB | |
query: "objectClass?one" | |
fragment: none | |
] | |
mailto:[email protected] | |
object [ | |
scheme: 'mailto | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %"[email protected]" | |
query: none | |
fragment: none | |
] | |
news:comp.infosystems.www.servers.unix | |
object [ | |
scheme: 'news | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %comp.infosystems.www.servers.unix | |
query: none | |
fragment: none | |
] | |
tel:+1-816-555-1212 | |
object [ | |
scheme: 'tel | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %+1-816-555-1212 | |
query: none | |
fragment: none | |
] | |
telnet://192.0.2.16:80/ | |
object [ | |
scheme: 'telnet | |
user-info: none | |
host: "192.0.2.16" | |
port: 80 | |
path: %/ | |
target: none | |
query: none | |
fragment: none | |
] | |
urn:oasis:names:specification:docbook:dtd:xml:4.1.2 | |
object [ | |
scheme: 'urn | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %"oasis:names:specification:docbook:dtd:xml:4.1.2" | |
query: none | |
fragment: none | |
] | |
] | |
test-url-parser: function [input expected-result][ | |
if expected-result <> res: decode-url input [ | |
print [ | |
"parse-url failed for url:" mold url newline | |
"Expected:" mold expected-result newline | |
"Got:" mold res | |
] | |
] | |
] | |
foreach [url obj] test-urls [test-url-parser url obj] |
L100: using dehex
here is a problem (note that for url!
values, Red currently does it for you, which is also wrong).
Practical example: say you have a user name "[email protected]" and a password of "pass" for you FTP access. (This is a pretty common scenario with virtual hosting etc.). You want to express that as a URL, so you use:
ftp://me%40domain.com:[email protected]/
Red, and your code above if you pass a string!
, will convert it to ftp://[email protected]:[email protected]/
before parsing it which doesn't work.
The standard states that "the components and subcomponents significant to the scheme-specific dereferencing process (if any) must be parsed and separated before the percent-encoded octets within those components can be safely decoded", with the exception of "percent-encoded octets corresponding to characters in the unreserved set, which can be decoded at any time".
In my parser I solved this problem by having separate parse-uri
and decode-uri-fields
functions (separate because the latter makes assumptions that are not valid in general for all URIs, but are valid for common schemes like HTTP, FTP and so on; it could be argued that decode-uri-fields
is the most common case, and perhaps should happen by default, and there should be a refinement to turn it off).
L119:
reform
is not defined in Red.