-
-
Save greggirwin/207149d46441cd48a1426e60926a7d25 to your computer and use it in GitHub Desktop.
| Red [ | |
| title: "RFC3986 URL parser" | |
| file: %url-parser.red | |
| author: "@greggirwin" | |
| date: 03-Oct-2018 | |
| notes: { | |
| Reference: https://tools.ietf.org/html/rfc3986#page-16 | |
| Most rule names are taken from the RFC, with the goal of | |
| making it easy to compare to the reference. Some rules | |
| are simplified in this version (e.g. IP address literals). | |
| Where pct-encoded rules are listed in the RFC, they are | |
| omitted from parse rules here, as the input is dehexed | |
| before being parsed. | |
| Relative URI path references are not yet supported. | |
| } | |
| ] | |
| url-parser: object [ | |
| ;-- Parse Variables | |
| =scheme: =user-info: =host: =port: =path: =query: =fragment: none | |
| vars: [=scheme =user-info =host =port =path =query =fragment] | |
| ;-- General Character Sets | |
| alpha: charset [#"a" - #"z" #"A" - #"Z"] | |
| digit: charset "0123456789" | |
| alpha-num: union alpha digit | |
| ;-- URL Character Sets | |
| ; The purpose of reserved characters is to provide a set of delimiting | |
| ; characters that are distinguishable from other data within a URI. | |
| gen-delims: charset ":/?#[]@" | |
| sub-delims: charset "!$&'()*+,;=" | |
| reserved: [gen-delims | sub-delims] | |
| unreserved: compose [alpha | digit | (charset "-._~")] | |
| ; Helper func for extending alpha-num | |
| alpha-num+: func [more [string!]][union alpha-num charset more] | |
| scheme-char: alpha-num+ "+-." | |
| ;-- URL Grammar | |
| url-rules: [scheme-part hier-part opt query opt fragment] | |
| scheme-part: [copy =scheme [alpha some scheme-char] #":"] | |
| hier-part: ["//" authority path-abempty | path-absolute | path-rootless | path-empty] | |
| ; The authority component is preceded by a double slash ("//") and is | |
| ; terminated by the next slash ("/"), question mark ("?"), or number | |
| ; sign ("#") character, or by the end of the URI. | |
| authority: [opt user-info host opt [":" port]] | |
| ; "user:password" format for user-info is deprecated. | |
| user-info: [copy =user-info [any [unreserved | sub-delims | #":"] #"@"]] | |
| ; Host is not detailed per the RFC yet. It covers IPv6 addresses, which go in | |
| ; square brackets, making them a non-loadable URL in Red. They can also contain | |
| ; colons, which makes finding the port marker more involved. | |
| IP-literal: [copy =IP-literal ["[" thru "]"]] ; simplified from [IPv6address | IPvFuture] | |
| host: [ | |
| IP-literal (=host: =IP-literal) | |
| | copy =host any [unreserved | sub-delims] | |
| ] | |
| port: [copy =port [1 5 digit]] | |
| ; path-abempty ; begins with "/" or is empty | |
| ; path-absolute ; begins with "/" but not "//" | |
| ; path-noscheme ; begins with a non-colon segment | |
| ; path-rootless ; begins with a segment | |
| ; path-empty ; zero characters | |
| path-abempty: [copy =path any-segments | path-empty] ; (print ["path:" mold =path]) | |
| path-absolute: [copy =path [#"/" opt [segment-nz any-segments]]] ; (print ["path-abs:" mold =path]) | |
| ;!! path-noscheme is only used in relative URIs, which aren't supported here yet. | |
| ;path-noscheme: [copy =path [segment-nz-nc any-segments]] ; (print ["path-no-scheme:" mold =path]) | |
| path-rootless: [copy =path [segment-nz any-segments]] ; (print ["path-rootless:" mold =path]) | |
| path-empty: [none] | |
| any-segments: [any [#"/" segment]] | |
| segment: [any pchar] | |
| segment-nz: [some pchar] | |
| segment-nz-nc: [some [unreserved | sub-delims | #"@"]] ; non-zero-length segment with no colon | |
| pchar: [unreserved | sub-delims | #":" | #"@"] ; path characters | |
| query: ["?" copy =query any [pchar | slash | #"?"]] | |
| fragment: ["#" copy =fragment any [pchar | slash | #"?"]] | |
| ;-- Parse Function | |
| parse-url: function [ | |
| "Return object with URL components, or cause an error if not a valid URL" | |
| url [url! string!] | |
| /throw-error "Throw an error, instead of returning NONE." | |
| /extern vars =path =host | |
| ][ | |
| set vars none ; clear object level parse variables | |
| either parse dehex url url-rules [ | |
| if empty? =host [=host: none] | |
| =path: either all [=path not empty? =path][ | |
| split-path to file! =path | |
| ][ | |
| [#[none] #[none]] | |
| ] | |
| object [ | |
| scheme: to word! =scheme | |
| user-info: =user-info | |
| host: =host | |
| port: if =port [to integer! =port] | |
| path: first =path | |
| target: second =path | |
| query: =query | |
| fragment: =fragment | |
| ] | |
| ][ | |
| if throw-error [ | |
| make error! rejoin ["URL error: " url] | |
| ] | |
| ] | |
| ] | |
| ; Exported function (Rebol compatible name) | |
| set 'decode-url function [ | |
| "Decode a URL into an object containing its constituent parts" | |
| url [url! string!] | |
| ][ | |
| parse-url url | |
| ] | |
| ] | |
| test-urls: reduce [ | |
| foo:// ; no path | |
| object [ | |
| scheme: 'foo | |
| user-info: none | |
| host: none | |
| port: none | |
| path: none | |
| target: none | |
| query: none | |
| fragment: none | |
| ] | |
| foo:/a/b/c ; path-absolute | |
| object [ | |
| scheme: 'foo | |
| user-info: none | |
| host: none | |
| port: none | |
| path: %/a/b/ | |
| target: %c | |
| query: none | |
| fragment: none | |
| ] | |
| foo://example.com:8042/over/there?name=ferret#nose | |
| object [ | |
| scheme: 'foo | |
| user-info: none | |
| host: "example.com" | |
| port: 8042 | |
| path: %/over/ | |
| target: %there | |
| query: "name=ferret" | |
| fragment: "nose" | |
| ] | |
| ftp://ftp.is.co.za/rfc/rfc1808.txt | |
| object [ | |
| scheme: 'ftp | |
| user-info: none | |
| host: "ftp.is.co.za" | |
| port: none | |
| path: %/rfc/ | |
| target: %rfc1808.txt | |
| query: none | |
| fragment: none | |
| ] | |
| http://www.ietf.org/rfc/rfc2396.txt | |
| object [ | |
| scheme: 'http | |
| user-info: none | |
| host: "www.ietf.org" | |
| port: none | |
| path: %/rfc/ | |
| target: %rfc2396.txt | |
| query: none | |
| fragment: none | |
| ] | |
| to url! "ldap://[2001:db8::7]/c=GB?objectClass?one" ; not loadable Red | |
| object [ | |
| scheme: 'ldap | |
| user-info: none | |
| host: "[2001:db8::7]" | |
| port: none | |
| path: %/ | |
| target: %c=GB | |
| query: "objectClass?one" | |
| fragment: none | |
| ] | |
| mailto:[email protected] | |
| object [ | |
| scheme: 'mailto | |
| user-info: none | |
| host: none | |
| port: none | |
| path: %./ | |
| target: %"[email protected]" | |
| query: none | |
| fragment: none | |
| ] | |
| news:comp.infosystems.www.servers.unix | |
| object [ | |
| scheme: 'news | |
| user-info: none | |
| host: none | |
| port: none | |
| path: %./ | |
| target: %comp.infosystems.www.servers.unix | |
| query: none | |
| fragment: none | |
| ] | |
| tel:+1-816-555-1212 | |
| object [ | |
| scheme: 'tel | |
| user-info: none | |
| host: none | |
| port: none | |
| path: %./ | |
| target: %+1-816-555-1212 | |
| query: none | |
| fragment: none | |
| ] | |
| telnet://192.0.2.16:80/ | |
| object [ | |
| scheme: 'telnet | |
| user-info: none | |
| host: "192.0.2.16" | |
| port: 80 | |
| path: %/ | |
| target: none | |
| query: none | |
| fragment: none | |
| ] | |
| urn:oasis:names:specification:docbook:dtd:xml:4.1.2 | |
| object [ | |
| scheme: 'urn | |
| user-info: none | |
| host: none | |
| port: none | |
| path: %./ | |
| target: %"oasis:names:specification:docbook:dtd:xml:4.1.2" | |
| query: none | |
| fragment: none | |
| ] | |
| ] | |
| test-url-parser: function [input expected-result][ | |
| if expected-result <> res: decode-url input [ | |
| print [ | |
| "parse-url failed for url:" mold url newline | |
| "Expected:" mold expected-result newline | |
| "Got:" mold res | |
| ] | |
| ] | |
| ] | |
| foreach [url obj] test-urls [test-url-parser url obj] |
L100: using dehex here is a problem (note that for url! values, Red currently does it for you, which is also wrong).
Practical example: say you have a user name "[email protected]" and a password of "pass" for you FTP access. (This is a pretty common scenario with virtual hosting etc.). You want to express that as a URL, so you use:
ftp://me%40domain.com:[email protected]/
Red, and your code above if you pass a string!, will convert it to ftp://[email protected]:[email protected]/ before parsing it which doesn't work.
The standard states that "the components and subcomponents significant to the scheme-specific dereferencing process (if any) must be parsed and separated before the percent-encoded octets within those components can be safely decoded", with the exception of "percent-encoded octets corresponding to characters in the unreserved set, which can be decoded at any time".
In my parser I solved this problem by having separate parse-uri and decode-uri-fields functions (separate because the latter makes assumptions that are not valid in general for all URIs, but are valid for common schemes like HTTP, FTP and so on; it could be argued that decode-uri-fields is the most common case, and perhaps should happen by default, and there should be a refinement to turn it off).
L119:
reformis not defined in Red.