Last active
August 14, 2019 21:29
-
-
Save greggirwin/207149d46441cd48a1426e60926a7d25 to your computer and use it in GitHub Desktop.
Red URL Parser (now included in Red: %environment/networking.red)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Red [ | |
title: "RFC3986 URL parser" | |
file: %url-parser.red | |
author: "@greggirwin" | |
date: 03-Oct-2018 | |
notes: { | |
Reference: https://tools.ietf.org/html/rfc3986#page-16 | |
Most rule names are taken from the RFC, with the goal of | |
making it easy to compare to the reference. Some rules | |
are simplified in this version (e.g. IP address literals). | |
Where pct-encoded rules are listed in the RFC, they are | |
omitted from parse rules here, as the input is dehexed | |
before being parsed. | |
Relative URI path references are not yet supported. | |
} | |
] | |
url-parser: object [ | |
;-- Parse Variables | |
=scheme: =user-info: =host: =port: =path: =query: =fragment: none | |
vars: [=scheme =user-info =host =port =path =query =fragment] | |
;-- General Character Sets | |
alpha: charset [#"a" - #"z" #"A" - #"Z"] | |
digit: charset "0123456789" | |
alpha-num: union alpha digit | |
;-- URL Character Sets | |
; The purpose of reserved characters is to provide a set of delimiting | |
; characters that are distinguishable from other data within a URI. | |
gen-delims: charset ":/?#[]@" | |
sub-delims: charset "!$&'()*+,;=" | |
reserved: [gen-delims | sub-delims] | |
unreserved: compose [alpha | digit | (charset "-._~")] | |
; Helper func for extending alpha-num | |
alpha-num+: func [more [string!]][union alpha-num charset more] | |
scheme-char: alpha-num+ "+-." | |
;-- URL Grammar | |
url-rules: [scheme-part hier-part opt query opt fragment] | |
scheme-part: [copy =scheme [alpha some scheme-char] #":"] | |
hier-part: ["//" authority path-abempty | path-absolute | path-rootless | path-empty] | |
; The authority component is preceded by a double slash ("//") and is | |
; terminated by the next slash ("/"), question mark ("?"), or number | |
; sign ("#") character, or by the end of the URI. | |
authority: [opt user-info host opt [":" port]] | |
; "user:password" format for user-info is deprecated. | |
user-info: [copy =user-info [any [unreserved | sub-delims | #":"] #"@"]] | |
; Host is not detailed per the RFC yet. It covers IPv6 addresses, which go in | |
; square brackets, making them a non-loadable URL in Red. They can also contain | |
; colons, which makes finding the port marker more involved. | |
IP-literal: [copy =IP-literal ["[" thru "]"]] ; simplified from [IPv6address | IPvFuture] | |
host: [ | |
IP-literal (=host: =IP-literal) | |
| copy =host any [unreserved | sub-delims] | |
] | |
port: [copy =port [1 5 digit]] | |
; path-abempty ; begins with "/" or is empty | |
; path-absolute ; begins with "/" but not "//" | |
; path-noscheme ; begins with a non-colon segment | |
; path-rootless ; begins with a segment | |
; path-empty ; zero characters | |
path-abempty: [copy =path any-segments | path-empty] ; (print ["path:" mold =path]) | |
path-absolute: [copy =path [#"/" opt [segment-nz any-segments]]] ; (print ["path-abs:" mold =path]) | |
;!! path-noscheme is only used in relative URIs, which aren't supported here yet. | |
;path-noscheme: [copy =path [segment-nz-nc any-segments]] ; (print ["path-no-scheme:" mold =path]) | |
path-rootless: [copy =path [segment-nz any-segments]] ; (print ["path-rootless:" mold =path]) | |
path-empty: [none] | |
any-segments: [any [#"/" segment]] | |
segment: [any pchar] | |
segment-nz: [some pchar] | |
segment-nz-nc: [some [unreserved | sub-delims | #"@"]] ; non-zero-length segment with no colon | |
pchar: [unreserved | sub-delims | #":" | #"@"] ; path characters | |
query: ["?" copy =query any [pchar | slash | #"?"]] | |
fragment: ["#" copy =fragment any [pchar | slash | #"?"]] | |
;-- Parse Function | |
parse-url: function [ | |
"Return object with URL components, or cause an error if not a valid URL" | |
url [url! string!] | |
/throw-error "Throw an error, instead of returning NONE." | |
/extern vars =path =host | |
][ | |
set vars none ; clear object level parse variables | |
either parse dehex url url-rules [ | |
if empty? =host [=host: none] | |
=path: either all [=path not empty? =path][ | |
split-path to file! =path | |
][ | |
[#[none] #[none]] | |
] | |
object [ | |
scheme: to word! =scheme | |
user-info: =user-info | |
host: =host | |
port: if =port [to integer! =port] | |
path: first =path | |
target: second =path | |
query: =query | |
fragment: =fragment | |
] | |
][ | |
if throw-error [ | |
make error! rejoin ["URL error: " url] | |
] | |
] | |
] | |
; Exported function (Rebol compatible name) | |
set 'decode-url function [ | |
"Decode a URL into an object containing its constituent parts" | |
url [url! string!] | |
][ | |
parse-url url | |
] | |
] | |
test-urls: reduce [ | |
foo:// ; no path | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: none | |
port: none | |
path: none | |
target: none | |
query: none | |
fragment: none | |
] | |
foo:/a/b/c ; path-absolute | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: none | |
port: none | |
path: %/a/b/ | |
target: %c | |
query: none | |
fragment: none | |
] | |
foo://example.com:8042/over/there?name=ferret#nose | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: "example.com" | |
port: 8042 | |
path: %/over/ | |
target: %there | |
query: "name=ferret" | |
fragment: "nose" | |
] | |
ftp://ftp.is.co.za/rfc/rfc1808.txt | |
object [ | |
scheme: 'ftp | |
user-info: none | |
host: "ftp.is.co.za" | |
port: none | |
path: %/rfc/ | |
target: %rfc1808.txt | |
query: none | |
fragment: none | |
] | |
http://www.ietf.org/rfc/rfc2396.txt | |
object [ | |
scheme: 'http | |
user-info: none | |
host: "www.ietf.org" | |
port: none | |
path: %/rfc/ | |
target: %rfc2396.txt | |
query: none | |
fragment: none | |
] | |
to url! "ldap://[2001:db8::7]/c=GB?objectClass?one" ; not loadable Red | |
object [ | |
scheme: 'ldap | |
user-info: none | |
host: "[2001:db8::7]" | |
port: none | |
path: %/ | |
target: %c=GB | |
query: "objectClass?one" | |
fragment: none | |
] | |
mailto:[email protected] | |
object [ | |
scheme: 'mailto | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %"[email protected]" | |
query: none | |
fragment: none | |
] | |
news:comp.infosystems.www.servers.unix | |
object [ | |
scheme: 'news | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %comp.infosystems.www.servers.unix | |
query: none | |
fragment: none | |
] | |
tel:+1-816-555-1212 | |
object [ | |
scheme: 'tel | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %+1-816-555-1212 | |
query: none | |
fragment: none | |
] | |
telnet://192.0.2.16:80/ | |
object [ | |
scheme: 'telnet | |
user-info: none | |
host: "192.0.2.16" | |
port: 80 | |
path: %/ | |
target: none | |
query: none | |
fragment: none | |
] | |
urn:oasis:names:specification:docbook:dtd:xml:4.1.2 | |
object [ | |
scheme: 'urn | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %"oasis:names:specification:docbook:dtd:xml:4.1.2" | |
query: none | |
fragment: none | |
] | |
] | |
test-url-parser: function [input expected-result][ | |
if expected-result <> res: decode-url input [ | |
print [ | |
"parse-url failed for url:" mold url newline | |
"Expected:" mold expected-result newline | |
"Got:" mold res | |
] | |
] | |
] | |
foreach [url obj] test-urls [test-url-parser url obj] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
L100: using
dehex
here is a problem (note that forurl!
values, Red currently does it for you, which is also wrong).Practical example: say you have a user name "[email protected]" and a password of "pass" for you FTP access. (This is a pretty common scenario with virtual hosting etc.). You want to express that as a URL, so you use:
ftp://me%40domain.com:[email protected]/
Red, and your code above if you pass a
string!
, will convert it toftp://[email protected]:[email protected]/
before parsing it which doesn't work.The standard states that "the components and subcomponents significant to the scheme-specific dereferencing process (if any) must be parsed and separated before the percent-encoded octets within those components can be safely decoded", with the exception of "percent-encoded octets corresponding to characters in the unreserved set, which can be decoded at any time".
In my parser I solved this problem by having separate
parse-uri
anddecode-uri-fields
functions (separate because the latter makes assumptions that are not valid in general for all URIs, but are valid for common schemes like HTTP, FTP and so on; it could be argued thatdecode-uri-fields
is the most common case, and perhaps should happen by default, and there should be a refinement to turn it off).