Last active
October 24, 2018 21:47
-
-
Save greggirwin/b08ffb5c9fa54a9b9387248387baf46d to your computer and use it in GitHub Desktop.
RFC3986 URL parser and separate tests
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Red [] | |
do %url-parser.red | |
test-urls: reduce [ | |
foo:// ; no path | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: "" ; none ; this is tricky, the // means we'll always have a host, even if empty. | |
port: none | |
path: none | |
target: none | |
query: none | |
fragment: none | |
] | |
foo:/a/b/c ; path-absolute | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: none | |
port: none | |
path: %/a/b/ | |
target: %c | |
query: none | |
fragment: none | |
] | |
foo://example.com:8042/over/there?name=ferret#nose | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: "example.com" | |
port: 8042 | |
path: %/over/ | |
target: %there | |
query: "name=ferret" | |
fragment: "nose" | |
] | |
foo://example.com:8042/over/there?# | |
object [ | |
scheme: 'foo | |
user-info: none | |
host: "example.com" | |
port: 8042 | |
path: %/over/ | |
target: %there | |
query: "" | |
fragment: "" | |
] | |
; %2F gets decoded before we get it if in user-info | |
;foo://DEAD%[email protected]:8042/over/there?name=ferret%20face#nose%20ring | |
foo://[email protected]:8042/over/there?name=ferret%20face#nose%20ring | |
object [ | |
scheme: 'foo | |
user-info: "DEADBEEF==" | |
host: "example.com" | |
port: 8042 | |
path: %/over/ | |
target: %there | |
query: "name=ferret face" | |
fragment: "nose ring" | |
] | |
ftp://ftp.is.co.za/rfc/rfc1808.txt | |
object [ | |
scheme: 'ftp | |
user-info: none | |
host: "ftp.is.co.za" | |
port: none | |
path: %/rfc/ | |
target: %rfc1808.txt | |
query: none | |
fragment: none | |
] | |
http://www.ietf.org/rfc/rfc2396.txt | |
object [ | |
scheme: 'http | |
user-info: none | |
host: "www.ietf.org" | |
port: none | |
path: %/rfc/ | |
target: %rfc2396.txt | |
query: none | |
fragment: none | |
] | |
to url! "ldap://[2001:db8::7]/c=GB?objectClass?one" ; not loadable Red | |
object [ | |
scheme: 'ldap | |
user-info: none | |
host: "[2001:db8::7]" | |
port: none | |
path: %/ | |
target: %c=GB | |
query: "objectClass?one" | |
fragment: none | |
] | |
mailto:[email protected] | |
object [ | |
scheme: 'mailto | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %"[email protected]" | |
query: none | |
fragment: none | |
] | |
news:comp.infosystems.www.servers.unix | |
object [ | |
scheme: 'news | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %comp.infosystems.www.servers.unix | |
query: none | |
fragment: none | |
] | |
tel:+1-816-555-1212 | |
object [ | |
scheme: 'tel | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %+1-816-555-1212 | |
query: none | |
fragment: none | |
] | |
telnet://192.0.2.16:80/ | |
object [ | |
scheme: 'telnet | |
user-info: none | |
host: "192.0.2.16" | |
port: 80 | |
path: %/ | |
target: none | |
query: none | |
fragment: none | |
] | |
urn:oasis:names:specification:docbook:dtd:xml:4.1.2 | |
object [ | |
scheme: 'urn | |
user-info: none | |
host: none | |
port: none | |
path: %./ | |
target: %"oasis:names:specification:docbook:dtd:xml:4.1.2" | |
query: none | |
fragment: none | |
] | |
http://www.rebol.com/ | |
object [ | |
scheme: 'http | |
user-info: none | |
host: "www.rebol.com" | |
port: none | |
path: %/ | |
target: none | |
query: none | |
fragment: none | |
] | |
"http://www.rebol.com/" | |
object [ | |
scheme: 'http | |
user-info: none | |
host: "www.rebol.com" | |
port: none | |
path: %/ | |
target: none | |
query: none | |
fragment: none | |
] | |
] | |
test-url-parser: function [input expected-result][ | |
if expected-result <> res: decode-url input [ | |
print [ | |
"parse-url failed for url:" mold input newline | |
"Expected:" mold expected-result newline | |
"Got:" mold res | |
] | |
] | |
if input <> new-url: encode-url res [ | |
print [ | |
"encode-url failed for url:" mold res newline | |
"Expected:" mold input newline | |
"Got:" mold new-url | |
] | |
] | |
] | |
foreach [url obj] test-urls [test-url-parser url obj] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Red [ | |
title: "RFC3986 URL parser" | |
file: %url-parser.red | |
author: "@greggirwin" | |
date: 03-Oct-2018 | |
notes: { | |
Reference: https://tools.ietf.org/html/rfc3986#page-16 | |
Most rule names are taken from the RFC, with the goal of | |
making it easy to compare to the reference. Some rules | |
are simplified in this version (e.g. IP address literals). | |
Where pct-encoded rules are listed in the RFC, they are | |
omitted from parse rules here, as the input is dehexed | |
before being parsed. | |
Relative URI path references are not yet supported. | |
} | |
] | |
url-parser: object [ | |
;-- Parse Variables | |
=scheme: =user-info: =host: =port: =path: =query: =fragment: none | |
vars: [=scheme =user-info =host =port =path =query =fragment] | |
;-- General Character Sets | |
alpha: charset [#"a" - #"z" #"A" - #"Z"] | |
digit: charset "0123456789" | |
alpha-num: union alpha digit | |
hex-digit: union digit charset [#"a" - #"f" #"A" - #"F"] | |
;-- URL Character Sets | |
; The purpose of reserved characters is to provide a set of delimiting | |
; characters that are distinguishable from other data within a URI. | |
gen-delims: charset ":/?#[]@" | |
sub-delims: charset "!$&'()*+,;=" | |
reserved: [gen-delims | sub-delims] | |
unreserved: compose [alpha | digit | (charset "-._~")] | |
pct-encoded: [#"%" 2 hex-digit] | |
; Helper func for extending alpha-num | |
alpha-num+: func [more [string!]][union alpha-num charset more] | |
scheme-char: alpha-num+ "+-." | |
;-- URL Grammar | |
url-rules: [scheme-part hier-part opt query opt fragment] ; mark: (print mark) | |
scheme-part: [copy =scheme [alpha some scheme-char] #":"] | |
hier-part: ["//" authority path-abempty | path-absolute | path-rootless | path-empty] | |
; The authority component is preceded by a double slash ("//") and is | |
; terminated by the next slash ("/"), question mark ("?"), or number | |
; sign ("#") character, or by the end of the URI. | |
authority: [opt user-info host opt [":" port]] | |
; "user:password" format for user-info is deprecated. | |
user-info: [ | |
;mark: (print mold mark) | |
copy =user-info [any [unreserved | pct-encoded | sub-delims | #":"] #"@"] | |
;(print mold =user-info) | |
(take/last =user-info) | |
] | |
; Host is not detailed per the RFC yet. It covers IPv6 addresses, which go in | |
; square brackets, making them a non-loadable URL in Red. They can also contain | |
; colons, which makes finding the port marker more involved. | |
; The percent encoded options for brackets here are a bit of a hack as well, | |
; because Red encodes them in URLs, even in the IP literal segment. | |
IP-literal: [copy =IP-literal [[#"[" | "%5B"] thru [#"]" | "%5D"]]] ; simplified from [IPv6address | IPvFuture] | |
host: [ | |
IP-literal (=host: =IP-literal) | |
| copy =host any [unreserved | pct-encoded | sub-delims] | |
;(print ["host:" mold =host]) | |
] | |
port: [copy =port [1 5 digit]] | |
; path-abempty ; begins with "/" or is empty | |
; path-absolute ; begins with "/" but not "//" | |
; path-noscheme ; begins with a non-colon segment | |
; path-rootless ; begins with a segment | |
; path-empty ; zero characters | |
path-abempty: [copy =path any-segments | path-empty] ; (print ["path:" mold =path]) | |
path-absolute: [copy =path [#"/" opt [segment-nz any-segments]]] ; (print ["path-abs:" mold =path]) | |
;!! path-noscheme is only used in relative URIs, which aren't supported here yet. | |
;path-noscheme: [copy =path [segment-nz-nc any-segments]] ; (print ["path-no-scheme:" mold =path]) | |
path-rootless: [copy =path [segment-nz any-segments]] ; (print ["path-rootless:" mold =path]) | |
path-empty: [none] | |
any-segments: [any [#"/" segment]] | |
segment: [any pchar] | |
segment-nz: [some pchar] | |
segment-nz-nc: [some [unreserved | pct-encoded | sub-delims | #"@"]] ; non-zero-length segment with no colon | |
pchar: [unreserved | pct-encoded | sub-delims | #":" | #"@"] ; path characters | |
query: ["?" copy =query any [pchar | slash | #"?"]] | |
fragment: ["#" copy =fragment any [pchar | slash | #"?"]] | |
;-- Parse Function | |
parse-url: function [ | |
"Return object with URL components, or cause an error if not a valid URL" | |
url [url! string!] | |
/throw-error "Throw an error, instead of returning NONE." | |
/extern vars =path =host | |
][ | |
set vars none ; clear object level parse variables | |
; We can't dehex before parsing, or invalid chars will show up which | |
; don't match the rules. Even forming the url messes it up. Only | |
; MOLD preserves the percent encoding. | |
;print ['input mold url] | |
if url? url [url: mold url] | |
either parse url url-rules [ | |
;if empty? =host [=host: none] | |
;if empty? =user-info [=user-info: none] | |
=path: either all [=path not empty? =path][ | |
split-path to file! dehex =path | |
][ | |
[#[none] #[none]] | |
] | |
;set 'dbg =path | |
;print ['scheme mold =scheme type? =scheme] | |
object [ | |
scheme: to word! form =scheme | |
user-info: if =user-info [dehex =user-info] | |
host: if =host [dehex =host] | |
port: if =port [to integer! =port] | |
path: first =path | |
target: second =path | |
query: if =query [dehex =query] | |
fragment: if =fragment [dehex =fragment] | |
] | |
][ | |
if throw-error [ | |
make error! reform ["URL error:" url] | |
] | |
] | |
] | |
; Exported function (Rebol compatible name) | |
set 'decode-url function [ | |
"Decode a URL into an object containing its constituent parts" | |
url [url! string!] | |
][ | |
parse-url url | |
] | |
; Note that we are careful to preserve the distinction between a component | |
; that is undefined, meaning that its separator was not present in the | |
; reference, and a component that is empty, meaning that the separator was | |
; present and was immediately followed by the next component separator or | |
; the end of the reference. | |
set 'encode-url function [url-obj [object!] "What you'd get from decode-url"][ | |
;result: make url! 256 ; pre-allocate for reasonable sized-urls | |
result: clear url-buffer:// | |
if url-obj/scheme [ | |
append result url-obj/scheme | |
append result #":" | |
] | |
; authority: user-info, host opt port | |
if url-obj/host [ | |
append result "//" | |
if url-obj/user-info [ | |
append result url-obj/user-info | |
append result #"@" | |
] | |
append result url-obj/host | |
if url-obj/port [ | |
append result #":" | |
append result url-obj/port | |
] | |
] | |
if all [url-obj/path url-obj/path <> %./] [ | |
append result url-obj/path | |
] | |
if url-obj/target [ | |
append result url-obj/target | |
] | |
if url-obj/query [ | |
append result #"?" | |
append result url-obj/query | |
] | |
if url-obj/fragment [ | |
append result #"#" | |
append result url-obj/fragment | |
] | |
copy result | |
] | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment