Skip to content

Instantly share code, notes, and snippets.

@greggirwin
Last active October 24, 2018 21:47
Show Gist options
  • Save greggirwin/b08ffb5c9fa54a9b9387248387baf46d to your computer and use it in GitHub Desktop.
Save greggirwin/b08ffb5c9fa54a9b9387248387baf46d to your computer and use it in GitHub Desktop.
RFC3986 URL parser and separate tests
Red []
do %url-parser.red
test-urls: reduce [
foo:// ; no path
object [
scheme: 'foo
user-info: none
host: "" ; none ; this is tricky, the // means we'll always have a host, even if empty.
port: none
path: none
target: none
query: none
fragment: none
]
foo:/a/b/c ; path-absolute
object [
scheme: 'foo
user-info: none
host: none
port: none
path: %/a/b/
target: %c
query: none
fragment: none
]
foo://example.com:8042/over/there?name=ferret#nose
object [
scheme: 'foo
user-info: none
host: "example.com"
port: 8042
path: %/over/
target: %there
query: "name=ferret"
fragment: "nose"
]
foo://example.com:8042/over/there?#
object [
scheme: 'foo
user-info: none
host: "example.com"
port: 8042
path: %/over/
target: %there
query: ""
fragment: ""
]
; %2F gets decoded before we get it if in user-info
;foo://DEAD%[email protected]:8042/over/there?name=ferret%20face#nose%20ring
foo://[email protected]:8042/over/there?name=ferret%20face#nose%20ring
object [
scheme: 'foo
user-info: "DEADBEEF=="
host: "example.com"
port: 8042
path: %/over/
target: %there
query: "name=ferret face"
fragment: "nose ring"
]
ftp://ftp.is.co.za/rfc/rfc1808.txt
object [
scheme: 'ftp
user-info: none
host: "ftp.is.co.za"
port: none
path: %/rfc/
target: %rfc1808.txt
query: none
fragment: none
]
http://www.ietf.org/rfc/rfc2396.txt
object [
scheme: 'http
user-info: none
host: "www.ietf.org"
port: none
path: %/rfc/
target: %rfc2396.txt
query: none
fragment: none
]
to url! "ldap://[2001:db8::7]/c=GB?objectClass?one" ; not loadable Red
object [
scheme: 'ldap
user-info: none
host: "[2001:db8::7]"
port: none
path: %/
target: %c=GB
query: "objectClass?one"
fragment: none
]
mailto:[email protected]
object [
scheme: 'mailto
user-info: none
host: none
port: none
path: %./
target: %"[email protected]"
query: none
fragment: none
]
news:comp.infosystems.www.servers.unix
object [
scheme: 'news
user-info: none
host: none
port: none
path: %./
target: %comp.infosystems.www.servers.unix
query: none
fragment: none
]
tel:+1-816-555-1212
object [
scheme: 'tel
user-info: none
host: none
port: none
path: %./
target: %+1-816-555-1212
query: none
fragment: none
]
telnet://192.0.2.16:80/
object [
scheme: 'telnet
user-info: none
host: "192.0.2.16"
port: 80
path: %/
target: none
query: none
fragment: none
]
urn:oasis:names:specification:docbook:dtd:xml:4.1.2
object [
scheme: 'urn
user-info: none
host: none
port: none
path: %./
target: %"oasis:names:specification:docbook:dtd:xml:4.1.2"
query: none
fragment: none
]
http://www.rebol.com/
object [
scheme: 'http
user-info: none
host: "www.rebol.com"
port: none
path: %/
target: none
query: none
fragment: none
]
"http://www.rebol.com/"
object [
scheme: 'http
user-info: none
host: "www.rebol.com"
port: none
path: %/
target: none
query: none
fragment: none
]
]
test-url-parser: function [input expected-result][
if expected-result <> res: decode-url input [
print [
"parse-url failed for url:" mold input newline
"Expected:" mold expected-result newline
"Got:" mold res
]
]
if input <> new-url: encode-url res [
print [
"encode-url failed for url:" mold res newline
"Expected:" mold input newline
"Got:" mold new-url
]
]
]
foreach [url obj] test-urls [test-url-parser url obj]
Red [
title: "RFC3986 URL parser"
file: %url-parser.red
author: "@greggirwin"
date: 03-Oct-2018
notes: {
Reference: https://tools.ietf.org/html/rfc3986#page-16
Most rule names are taken from the RFC, with the goal of
making it easy to compare to the reference. Some rules
are simplified in this version (e.g. IP address literals).
Where pct-encoded rules are listed in the RFC, they are
omitted from parse rules here, as the input is dehexed
before being parsed.
Relative URI path references are not yet supported.
}
]
url-parser: object [
;-- Parse Variables
=scheme: =user-info: =host: =port: =path: =query: =fragment: none
vars: [=scheme =user-info =host =port =path =query =fragment]
;-- General Character Sets
alpha: charset [#"a" - #"z" #"A" - #"Z"]
digit: charset "0123456789"
alpha-num: union alpha digit
hex-digit: union digit charset [#"a" - #"f" #"A" - #"F"]
;-- URL Character Sets
; The purpose of reserved characters is to provide a set of delimiting
; characters that are distinguishable from other data within a URI.
gen-delims: charset ":/?#[]@"
sub-delims: charset "!$&'()*+,;="
reserved: [gen-delims | sub-delims]
unreserved: compose [alpha | digit | (charset "-._~")]
pct-encoded: [#"%" 2 hex-digit]
; Helper func for extending alpha-num
alpha-num+: func [more [string!]][union alpha-num charset more]
scheme-char: alpha-num+ "+-."
;-- URL Grammar
url-rules: [scheme-part hier-part opt query opt fragment] ; mark: (print mark)
scheme-part: [copy =scheme [alpha some scheme-char] #":"]
hier-part: ["//" authority path-abempty | path-absolute | path-rootless | path-empty]
; The authority component is preceded by a double slash ("//") and is
; terminated by the next slash ("/"), question mark ("?"), or number
; sign ("#") character, or by the end of the URI.
authority: [opt user-info host opt [":" port]]
; "user:password" format for user-info is deprecated.
user-info: [
;mark: (print mold mark)
copy =user-info [any [unreserved | pct-encoded | sub-delims | #":"] #"@"]
;(print mold =user-info)
(take/last =user-info)
]
; Host is not detailed per the RFC yet. It covers IPv6 addresses, which go in
; square brackets, making them a non-loadable URL in Red. They can also contain
; colons, which makes finding the port marker more involved.
; The percent encoded options for brackets here are a bit of a hack as well,
; because Red encodes them in URLs, even in the IP literal segment.
IP-literal: [copy =IP-literal [[#"[" | "%5B"] thru [#"]" | "%5D"]]] ; simplified from [IPv6address | IPvFuture]
host: [
IP-literal (=host: =IP-literal)
| copy =host any [unreserved | pct-encoded | sub-delims]
;(print ["host:" mold =host])
]
port: [copy =port [1 5 digit]]
; path-abempty ; begins with "/" or is empty
; path-absolute ; begins with "/" but not "//"
; path-noscheme ; begins with a non-colon segment
; path-rootless ; begins with a segment
; path-empty ; zero characters
path-abempty: [copy =path any-segments | path-empty] ; (print ["path:" mold =path])
path-absolute: [copy =path [#"/" opt [segment-nz any-segments]]] ; (print ["path-abs:" mold =path])
;!! path-noscheme is only used in relative URIs, which aren't supported here yet.
;path-noscheme: [copy =path [segment-nz-nc any-segments]] ; (print ["path-no-scheme:" mold =path])
path-rootless: [copy =path [segment-nz any-segments]] ; (print ["path-rootless:" mold =path])
path-empty: [none]
any-segments: [any [#"/" segment]]
segment: [any pchar]
segment-nz: [some pchar]
segment-nz-nc: [some [unreserved | pct-encoded | sub-delims | #"@"]] ; non-zero-length segment with no colon
pchar: [unreserved | pct-encoded | sub-delims | #":" | #"@"] ; path characters
query: ["?" copy =query any [pchar | slash | #"?"]]
fragment: ["#" copy =fragment any [pchar | slash | #"?"]]
;-- Parse Function
parse-url: function [
"Return object with URL components, or cause an error if not a valid URL"
url [url! string!]
/throw-error "Throw an error, instead of returning NONE."
/extern vars =path =host
][
set vars none ; clear object level parse variables
; We can't dehex before parsing, or invalid chars will show up which
; don't match the rules. Even forming the url messes it up. Only
; MOLD preserves the percent encoding.
;print ['input mold url]
if url? url [url: mold url]
either parse url url-rules [
;if empty? =host [=host: none]
;if empty? =user-info [=user-info: none]
=path: either all [=path not empty? =path][
split-path to file! dehex =path
][
[#[none] #[none]]
]
;set 'dbg =path
;print ['scheme mold =scheme type? =scheme]
object [
scheme: to word! form =scheme
user-info: if =user-info [dehex =user-info]
host: if =host [dehex =host]
port: if =port [to integer! =port]
path: first =path
target: second =path
query: if =query [dehex =query]
fragment: if =fragment [dehex =fragment]
]
][
if throw-error [
make error! reform ["URL error:" url]
]
]
]
; Exported function (Rebol compatible name)
set 'decode-url function [
"Decode a URL into an object containing its constituent parts"
url [url! string!]
][
parse-url url
]
; Note that we are careful to preserve the distinction between a component
; that is undefined, meaning that its separator was not present in the
; reference, and a component that is empty, meaning that the separator was
; present and was immediately followed by the next component separator or
; the end of the reference.
set 'encode-url function [url-obj [object!] "What you'd get from decode-url"][
;result: make url! 256 ; pre-allocate for reasonable sized-urls
result: clear url-buffer://
if url-obj/scheme [
append result url-obj/scheme
append result #":"
]
; authority: user-info, host opt port
if url-obj/host [
append result "//"
if url-obj/user-info [
append result url-obj/user-info
append result #"@"
]
append result url-obj/host
if url-obj/port [
append result #":"
append result url-obj/port
]
]
if all [url-obj/path url-obj/path <> %./] [
append result url-obj/path
]
if url-obj/target [
append result url-obj/target
]
if url-obj/query [
append result #"?"
append result url-obj/query
]
if url-obj/fragment [
append result #"#"
append result url-obj/fragment
]
copy result
]
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment