Created
March 5, 2018 21:33
-
-
Save meijeru/202c25f124e075ff5be85dff1beeda15 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REBOL [ | |
Title: "Red Lexical Scanner" | |
Author: "Nenad Rakocevic" | |
File: %lexer.r | |
Tabs: 4 | |
Rights: "Copyright (C) 2011-2015 Nenad Rakocevic. All rights reserved." | |
License: "BSD-3 - https://github.com/red/red/blob/master/BSD-3-License.txt" | |
Comments: {Adapted by Rudolf W. Meijer for making the Red concordance. | |
This involves parsing Red, Red/System and REBOL programs by one lexer. | |
The changes are: | |
- the header rule accepts any word or path followed by a block | |
- the escaped rule accepts anything within #[ and ] | |
- throw-error prints the filename; global source-file must be set | |
It is a testimony to the syntactic similarity of all three languages | |
that the changes are so few ... | |
}] | |
;-- Patch NEW-LINE and NEW-LINE? natives to accept paren! -- | |
append first find third :new-line block! paren! | |
append first find third :new-line? block! paren! | |
lexer: context [ | |
verbose: 0 | |
old-line: none | |
line: none ;-- source code lines counter | |
count?: yes ;-- if TRUE, lines counter is enabled | |
cnt: none ;-- counts nested {} in multi-line strings | |
pos: none ;-- source input position (error reporting) | |
mark: none ;-- use for keeping input cursor at same position | |
path: none ;-- path input position (error reporting) | |
s: none ;-- mark start position of new value | |
e: none ;-- mark end position of new value | |
series: none ;-- temporary hold last stack series | |
value: none ;-- new value | |
value2: none ;-- secondary new value | |
fail?: none ;-- used for failing some parsing rules | |
type: none ;-- define the type of the new value | |
rs?: no ;-- if TRUE, do lexing for Red/System | |
neg?: no ;-- if TRUE, denotes a negative number value | |
base: 16 ;-- binary base | |
otag: none | |
ot: none | |
ct: none | |
sep: none | |
year: none | |
month: none | |
day: none | |
hour: none | |
mn: none | |
sec: none | |
date: none | |
ee: none | |
;====== Parsing rules ====== | |
four: charset "01234" | |
half: charset "012345" | |
non-zero: charset "123456789" | |
digit: union non-zero charset "0" | |
dot: #"." | |
comma: #"," | |
byte: [ | |
"25" half | |
| #"2" four digit | |
| #"1" digit digit | |
| opt #"0" non-zero digit | |
| 0 2 #"0" digit | |
| 1 2 #"0" | |
] | |
hexa: union digit charset "ABCDEF" | |
hexa-char: union hexa charset "abcdef" | |
alpha: charset [#"A" - #"Z" #"a" - #"z"] | |
base64-char: union digit union alpha charset "+/=" | |
;-- UTF-8 encoding rules from: http://tools.ietf.org/html/rfc3629#section-4 | |
UTF-8-BOM: #{EFBBBF} | |
ws-ASCII: charset " ^-^M" ;-- ASCII common whitespaces | |
ws-U+2k: charset [#"^(80)" - #"^(8A)"] ;-- Unicode spaces in the U+2000-U+200A range | |
UTF8-tail: charset [#"^(80)" - #"^(BF)"] | |
UTF8-1: charset [#"^(00)" - #"^(7F)"] | |
UTF8-2: reduce [ | |
charset [#"^(C2)" - #"^(DF)"] | |
UTF8-tail | |
] | |
UTF8-3: reduce [ | |
#{E0} charset [#"^(A0)" - #"^(BF)"] UTF8-tail | |
'| charset [#"^(E1)" - #"^(EC)"] 2 UTF8-tail | |
'| #{ED} charset [#"^(80)" - #"^(9F)"] UTF8-tail | |
'| charset [#"^(EE)" - #"^(EF)"] 2 UTF8-tail | |
] | |
UTF8-4: reduce [ | |
#{F0} charset [#"^(90)" - #"^(BF)"] 2 UTF8-tail | |
'| charset [#"^(F1)" - #"^(F3)"] 3 UTF8-tail | |
'| #{F4} charset [#"^(80)" - #"^(8F)"] 2 UTF8-tail | |
] | |
UTF8-char: [pos: UTF8-1 | UTF8-2 | UTF8-3 | UTF8-4] | |
not-word-char: charset {/\^^,[](){}"#%$@:;} | |
not-word-1st: union union not-word-char digit charset {'} | |
not-file-char: charset {[](){}"@:;} | |
not-url-char: charset {[](){}";} | |
not-email-char: union not-file-char union ws-ASCII charset "<^/" | |
not-str-char: #"^"" | |
not-mstr-char: #"}" | |
not-tag-1st: complement union ws-ASCII charset "=><[](){};^"" | |
not-tag-char: complement charset ">" | |
tag-char: charset "<>" | |
caret-char: charset [#"^(40)" - #"^(5F)"] | |
non-printable-char: charset [#"^(00)" - #"^(1F)"] | |
pair-end: charset {^{"[]();:} | |
integer-end: charset {^{"[]();:xX} | |
path-end: charset {^{"[]();} | |
file-end: charset {^{[]();} | |
date-sep: charset "/-" | |
time-sep: charset "T/" | |
stop: none | |
control-char: reduce [ ;-- Control characters | |
charset [#"^(00)" - #"^(1F)"] ;-- C0 control codes | |
'| #"^(C2)" charset [#"^(80)" - #"^(9F)"] ;-- C1 control codes (UTF-8 encoded) | |
] | |
UTF8-filtered-char: [ | |
[pos: stop :pos (fail?: [end skip]) | UTF8-char e: (fail?: none)] | |
fail? | |
] | |
UTF8-printable: [ | |
[non-printable-char | not-str-char (fail?: [end skip]) | UTF8-char (fail?: none)] | |
fail? | |
] | |
;-- Whitespaces list from: http://en.wikipedia.org/wiki/Whitespace_character | |
ws: [ | |
#"^/" ( | |
if count? [ | |
line: line + 1 | |
stack/nl?: yes | |
] | |
) | |
| ws-ASCII ;-- only the common whitespaces are matched | |
| #{C2} [ | |
#{85} ;-- U+0085 (Newline) | |
| #{A0} ;-- U+00A0 (No-break space) | |
] | |
;| #{E1} [ | |
; #{9A80} ;-- U+1680 (Ogham space mark) | |
; | #{A08E} ;-- U+180E (Mongolian vowel separator) | |
;] | |
;| #{E2} [ | |
; #{80} [ | |
; ws-U+2k ;-- U+2000-U+200A range | |
; | #{A8} ;-- U+2028 (Line separator) | |
; | #{A9} ;-- U+2029 (Paragraph separator) | |
; | #{AF} ;-- U+202F (Narrow no-break space) | |
; ] | |
; | #{819F} ;-- U+205F (Medium mathematical space) | |
;] | |
;| #{E38080} ;-- U+3000 (Ideographic space) | |
] | |
newline-char: [ | |
#"^/" | |
| #{C285} ;-- U+0085 (Newline) | |
| #{E280} [ | |
#{A8} ;-- U+2028 (Line separator) | |
| #{A9} ;-- U+2029 (Paragraph separator) | |
] | |
] | |
counted-newline: [pos: #"^/" (line: line + 1)] | |
ws-no-count: [(count?: no) ws (count?: yes)] | |
any-ws: [pos: any ws] | |
symbol-rule: [ | |
(stop: [not-word-char | ws-no-count | control-char | tag-char] otag: #"<" ot: none) | |
some [ | |
otag ot: [#"/" (otag: [end skip] ot: back ot) :ot | none] ;-- a</b> | |
| #">" ct: (if ot [otag: [end skip] ct: back ot]) :ct ;-- a<b> | |
| UTF8-filtered-char | |
] e: | |
] | |
begin-symbol-rule: [ ;-- 1st char in symbols is restricted | |
(stop: [not-word-1st | ws-no-count | control-char]) | |
UTF8-filtered-char | |
opt symbol-rule | |
] | |
path-rule: [ | |
pos: slash :pos ( ;-- path detection barrier | |
stack/allocate block! 4 | |
stack/push to type copy/part s e ;-- push 1st path element | |
) | |
some [ | |
slash | |
s: [ | |
integer-number-rule | |
| begin-symbol-rule (type: word!) | |
| paren-rule (type: paren!) | |
| #":" s: begin-symbol-rule (type: get-word!) | |
;@@ add more datatypes here | |
] ( | |
stack/push either type = paren! [ ;-- append path element | |
value | |
][ | |
to type copy/part s e | |
] | |
type: path! | |
) | |
] | |
opt [#":" (type: set-path!)] | |
e: [path-end | ws-no-count | end | (pos: path throw-error)] :e ;-- detect invalid tail characters | |
(value: stack/pop type) | |
] | |
word-rule: [ | |
(type: word!) | |
#"%" [ws-no-count | pos: file-end :pos | end] (value: "%") ;-- special case for remainder op! | |
| path: s: begin-symbol-rule [ | |
url-rule | |
| path-rule ;-- path matched | |
| (value: copy/part s e) ;-- word matched | |
opt [#":" (type: set-word!)] | |
] | |
] | |
get-word-rule: [ | |
#":" (type: get-word!) s: begin-symbol-rule [ | |
path-rule ( | |
value/1: to get-word! value/1 ;-- workaround missing get-path! in R2 | |
) | |
| ( | |
type: get-word! | |
value: copy/part s e ;-- word matched | |
) | |
] | |
] | |
lit-word-rule: [ | |
#"'" (type: word!) [ | |
#"/" (type: lit-word! value: "/") | |
| s: begin-symbol-rule [ | |
path-rule (type: lit-path!) ;-- path matched | |
| ( | |
type: lit-word! | |
value: copy/part s e ;-- word matched | |
) | |
] | |
][s: #":" :s (throw-error) | none] | |
] | |
map-rule: [ | |
"#(" (stack/allocate block! 10) any-value #")" ( | |
stack/prefix #!map! | |
value: stack/pop block! | |
) | |
] | |
issue-rule: [#"#" (type: issue!) s: symbol-rule] | |
refinement-rule: [slash (type: refinement!) s: symbol-rule] | |
slash-rule: [s: [slash opt slash] e:] | |
hexa-rule: [2 8 hexa e: #"h" pos: [integer-end | ws-no-count | end ] :pos (type: integer!)] | |
sticky-word-rule: [ ;-- protect from sticky words typos | |
mark: [integer-end | ws-no-count | end | (pos: s throw-error)] :mark | |
] | |
tuple-value-rule: [ | |
(type: tuple!) | |
byte dot byte 1 12 [dot byte] e: | |
] | |
tuple-rule: [tuple-value-rule sticky-word-rule] | |
time-rule: [ | |
s: positive-integer-rule [ | |
decimal-number-rule (value: as-time 0 value load-number copy/part s e neg?) ;-- mm:ss.dd | |
| (value2: load-number copy/part s e) [ | |
#":" s: positive-integer-rule opt decimal-number-rule | |
(value: as-time value value2 load-number copy/part s e neg?) ;-- hh:mm:ss[.dd] | |
| (value: as-time value value2 0 neg?) ;-- hh:mm | |
] | |
] (type: time!) | |
] | |
month-rule: [(m: none) | |
"January" (m: 1) | |
| "February" (m: 2) | |
| "March" (m: 3) | |
| "April" (m: 4) | |
| "May" (m: 5) | |
| "June" (m: 6) | |
| "July" (m: 7) | |
| "August" (m: 8) | |
| "September" (m: 9) | |
| "October" (m: 10) | |
| "November" (m: 11) | |
| "December" (m: 12) | |
] | |
mon-rule: [(m: none) | |
"Jan" (m: 1) | |
| "Feb" (m: 2) | |
| "Mar" (m: 3) | |
| "Apr" (m: 4) | |
| "May" (m: 5) | |
| "Jun" (m: 6) | |
| "Jul" (m: 7) | |
| "Aug" (m: 8) | |
| "Sep" (m: 9) | |
| "Oct" (m: 10) | |
| "Nov" (m: 11) | |
| "Dec" (m: 12) | |
] | |
day-year-rule: [ | |
(neg?: no) opt [#"-" (neg?: yes)] | |
s: 3 4 digit e: (year: load-number copy/part s e if neg? [year: 65536 - year]) | |
| 1 2 digit e: ( | |
value: load-number copy/part s e no | |
either day [year: value + pick [2000 1900] 50 > value][day: value] | |
) | |
] | |
date-rule: [ | |
pos: [opt #"-" 1 4 digit date-sep | 8 digit #"T"] :pos [ ;-- quick lookhead | |
s: 8 digit ee: #"T" ( ;-- yyyymmddT | |
year: load-number copy/part s 4 | |
month: load-number copy/part skip s 4 2 | |
day: load-number copy/part skip s 6 2 | |
date: make date! reduce [day month year] | |
) :ee | |
| day-year-rule sep: date-sep (sep: sep/1) [ | |
s: 1 2 digit e: (month: load-number copy/part s e no) | |
| some alpha e: ( | |
fail?: either all [parse/all copy/part s e [month-rule | mon-rule] m][month: m none][[end skip]] | |
) fail? | |
] | |
sep day-year-rule ( | |
fail?: either all [day month year][ | |
date: make date! reduce [day month year] | |
none | |
][[end skip]] | |
) fail? | |
| s: 4 digit #"-" ( | |
year: load-number copy/part s 4 | |
date: make date! reduce [1 1 year] | |
)[ | |
"W" s: 2 digit (ee: none) opt [#"-" ee: non-zero] ( ;-- yyyy-Www | |
date: to-iso-week date load-number copy/part s 2 | |
if ee [date: to-weekday date to integer! s/4 - #"0"] ;-- yyyy-Www-d | |
) | |
| s: 3 digit (date: date + (load-number copy/part s 3) - 1) ;-- yyyy-ddd | |
] (month: -1) | |
]( | |
type: date! | |
if all [ | |
month <> -1 any [date/year <> year date/month <> month date/day <> day] | |
][throw-error] | |
day: month: year: none | |
) opt [ | |
time-sep (ee: no) [ | |
s: 6 digit opt [#"." 1 9 digit ee:] ( ;-- Thhmmss[.sss] | |
hour: load-number copy/part s 2 | |
mn: load-number copy/part skip s 2 2 | |
sec: load-number either ee [copy/part skip s 4 ee][copy/part skip s 4 2] | |
date/time: as-time hour mn sec no | |
) | |
| 4 digit ( ;-- Thhmm | |
hour: load-number copy/part s 2 | |
mn: load-number copy/part skip s 2 2 | |
date/time: as-time hour mn 0 no | |
) | |
| s: positive-integer-rule (value: load-number copy/part s e) | |
#":" [(neg?: no) time-rule (date/time: value) | (throw-error)] | |
] | |
opt [ | |
#"Z" | [#"-" (neg?: yes) | #"+" (neg?: no)][ | |
s: 4 digit ( ;-- +/-hhmm | |
hour: load-number copy/part s e: skip s 2 | |
mn: load-number copy/part e e: skip e 2 | |
) | |
| 1 2 digit e: (hour: load-number copy/part s e mn: none) ;-- +/-h, +/-hh | |
opt [#":" s: 2 digit e: (mn: load-number copy/part s e)] | |
]( | |
either all [mn find [15 45] mn: round/floor/to mn 15][ | |
date: reduce [#!date! date as-time hour mn 0 neg?] ;-- special encoding for 15/45 | |
][ | |
date/zone: as-time hour any [mn 0] 0 neg? | |
] | |
) | |
] | |
] sticky-word-rule (value: date) | |
] | |
positive-integer-rule: [digit any digit e: (type: integer!)] | |
integer-number-rule: [ | |
opt [#"-" (neg?: yes) | #"+" (neg?: no)] digit any [digit | #"'" digit] e: | |
(type: integer!) | |
] | |
integer-rule: [ | |
pos: decimal-special e: ;-- escape path for NaN, INFs | |
(type: issue! value: load-number copy/part s e) | |
| (neg?: no) integer-number-rule | |
opt [decimal-number-rule | decimal-exp-rule e: (type: decimal!)] | |
opt [#"%" e: (type: issue!)] | |
sticky-word-rule | |
(value: load-number copy/part s e) | |
opt [ | |
[#"x" | #"X"] ( | |
type: pair! | |
value2: to pair! reduce [value 0] | |
) | |
[s: integer-number-rule | (type: pair! throw-error)] | |
mark: [pair-end | ws-no-count | end | (type: pair! throw-error)] :mark | |
(value2/2: load-number copy/part s e value: value2) | |
] | |
opt [#":" [time-rule | (throw-error)]] | |
] | |
decimal-special: [ | |
s: "-0.0" | (neg?: no) opt [#"-" (neg?: yes)] "1.#" s: [ | |
[[#"N" | #"n"] [#"a" | #"A"] [#"N" | #"n"]] | |
| [[#"I" | #"i"] [#"N" | #"n"] [#"F" | #"f"]] | |
] | |
] | |
decimal-exp-rule: [ | |
[[#"e" | #"E"] opt [#"-" | #"+"] 1 3 digit] | |
] | |
decimal-number-rule: [ | |
[dot | comma] digit any [digit | #"'" digit] | |
opt decimal-exp-rule e: (type: decimal!) | |
] | |
decimal-rule: [ | |
decimal-number-rule opt [#"%" e: (type: issue!)] | |
sticky-word-rule | |
] | |
block-rule: [#"[" (stack/allocate block! 10) any-value #"]" (value: stack/pop block!)] | |
paren-rule: [#"(" (stack/allocate paren! 10) any-value #")" (value: stack/pop paren!)] | |
escaped-char: [ | |
"^^(" [ | |
[ ;-- special case first | |
"null" (value: #"^(00)") | |
| "back" (value: #"^(08)") | |
| "tab" (value: #"^(09)") | |
| "line" (value: #"^(0A)") | |
| "page" (value: #"^(0C)") | |
| "esc" (value: #"^(1B)") | |
| "del" (value: #"^~") | |
] | |
| pos: [2 6 hexa-char] e: ( ;-- Unicode values allowed up to 10FFFFh | |
either rs? [ | |
value: to-char to-integer debase/base copy/part pos e 16 | |
][value: encode-UTF8-char pos e] | |
) | |
] #")" | |
| #"^^" [ | |
[ | |
#"/" (value: #"^/") | |
| #"-" (value: #"^-") | |
| #"~" (value: #"^(del)") | |
| #"^^" (value: #"^^") ;-- caret escaping case | |
| #"{" (value: #"{") | |
| #"}" (value: #"}") | |
| #"^"" (value: #"^"") | |
] | |
| pos: caret-char (value: pos/1 - 64) | |
] | |
] | |
char-rule: [ | |
{#"} (type: char!) [ | |
s: escaped-char | |
| copy value UTF8-printable (value: as-binary value) | |
| #"^-" (value: s/1) | |
] {"} | |
] | |
line-string: [ | |
{"} s: (type: string! stop: [not-str-char | newline-char]) | |
any [{^^"} | escaped-char | UTF8-filtered-char] | |
e: {"} | |
] | |
nested-curly-braces: [ | |
(cnt: 1 fail?: none) | |
any [[ | |
counted-newline | |
| "^^^^" | |
| "^^{" | |
| "^^}" | |
| #"{" (cnt: cnt + 1) | |
| e: #"}" (if zero? cnt: cnt - 1 [fail?: [end skip]]) | |
| UTF8-char | |
] fail? | |
] | |
#"}" (old-line: line) | |
] | |
multiline-string: [#"{" s: (type: string!) nested-curly-braces] | |
string-rule: [line-string | multiline-string] | |
tag-rule: [ | |
#"<" s: not-tag-1st (type: tag!) | |
any [#"^"" thru #"^"" | #"'" thru #"'" | not-tag-char] e: #">" | |
] | |
email-rule: [ | |
(stop: [not-email-char]) | |
s: opt [some UTF8-filtered-char] #"@" (type: email!) | |
any UTF8-filtered-char e: (value: dehex copy/part s e) | |
] | |
base-2-rule: [ | |
"2#{" (type: binary!) [ | |
s: any [counted-newline | 8 [#"0" | #"1" ] | ws-no-count | comment-rule] | |
e: #"}" (base: 2) | |
| (pos: skip s -3 throw-error) | |
] | |
] | |
base-16-rule: [ | |
opt "16" "#{" (type: binary!) [ | |
s: any [counted-newline | 2 hexa-char | ws-no-count | comment-rule] | |
e: #"}" (base: 16) | |
| (pos: skip s -2 throw-error) | |
] | |
] | |
base-64-rule: [ | |
"64#{" (type: binary!) [ | |
s: any [counted-newline | base64-char | ws-no-count | comment-rule] | |
e: #"}" ( | |
cnt: offset? s e | |
if all [0 < cnt cnt < 4][pos: skip s -4 throw-error] | |
base: 64 | |
) | |
| (pos: skip s -4 throw-error) | |
] | |
] | |
binary-rule: [[base-16-rule | base-64-rule | base-2-rule] (old-line: line)] | |
file-rule: [ | |
pos: #"%" (type: file! stop: [not-file-char | ws-no-count]) [ | |
#"{" (throw-error) | |
| line-string e: (value: encode-file s e) | |
| s: any UTF8-filtered-char e: (value: copy/part s e) | |
] | |
] | |
url-rule: [ | |
#":" (type: url! stop: [not-url-char | ws-no-count]) | |
some UTF8-filtered-char e: (value: dehex copy/part s e) | |
] | |
escaped-rule: [ | |
"#[" | |
any-value | |
; any-ws [ | |
; "true" (value: true) | |
; | "false" (value: false) | |
; | s: [ | |
; "none!" | "logic!" | "block!" | "integer!" | "word!" | |
; | "set-word!" | "get-word!" | "lit-word!" | "refinement!" | |
; | "binary!" | "string!" | "char!" | "bitset!" | "path!" | |
; | "set-path!" | "lit-path!" | "native!" | "action!" | |
; | "issue!" | "paren!" | "function!" | |
; ] e: (value: get to word! copy/part s e) | |
; | "none" (value: none) | |
; ] any-ws | |
#"]" | |
] | |
comment-rule: [#";" [to #"^/" | to end]] | |
wrong-end: [( | |
ending: either 1 < length? stack/stk [ | |
value: switch type?/word stack/top [ | |
block! [#"]"] | |
paren! [#")"] | |
] | |
first [(throw-error/with ["missing" mold value "character"])] | |
][none] | |
) | |
ending | |
] | |
literal-value: [ | |
pos: (e: none) s: [ | |
comment-rule | |
| escaped-rule (stack/push value) | |
| tuple-rule (stack/push load-tuple copy/part s e) | |
| hexa-rule (stack/push decode-hexa copy/part s e) | |
| binary-rule (stack/push load-binary s e base) | |
| email-rule (stack/push to email! value) | |
| date-rule (stack/push value) | |
| integer-rule (stack/push value) | |
| decimal-rule (stack/push load-decimal copy/part s e) | |
| tag-rule (stack/push to tag! copy/part s e) | |
| word-rule (stack/push to type value) | |
| lit-word-rule (stack/push to type value) | |
| get-word-rule (stack/push to type value) | |
| refinement-rule (stack/push to refinement! copy/part s e) | |
| slash-rule (stack/push to word! copy/part s e) | |
| file-rule (stack/push load-file value) | |
| char-rule (stack/push decode-UTF8-char value) | |
| block-rule (stack/push value) | |
| paren-rule (stack/push value) | |
| string-rule (stack/push load-string s e) | |
| map-rule (stack/push value) | |
| issue-rule (stack/push to issue! copy/part s e) | |
] | |
] | |
any-value: [pos: any [literal-value | ws]] | |
header: [ | |
pos: ["Red/System" | "Red" | "REBOL"] | |
any-ws block-rule (stack/push value) | |
| (throw-error/with "Invalid Red/REBOL program") end skip | |
] | |
program: [ | |
pos: opt UTF-8-BOM | |
header | |
any-value | |
opt wrong-end | |
] | |
;====== Helper functions ====== | |
stack: context [ | |
stk: [] | |
nl?: no | |
allocate: func [type [datatype!] size [integer!] /local new pos][ | |
pos: insert/only tail stk new: make type size | |
if nl? [new-line back pos yes nl?: no] | |
new | |
] | |
prefix: func [value][insert/only last stk :value] | |
push: func [value][ | |
value: insert/only tail last stk :value | |
if nl? [new-line back value yes nl?: no] | |
value | |
] | |
pop: func [type [datatype!] /local pos][ | |
pos: back tail stk | |
nl?: new-line? pos | |
either any [type = path! type = set-path!][ | |
change/only pos to type pos/1 | |
][ | |
if type <> type? pos/1 [ | |
throw-error/with ["invalid" mold type "closing delimiter"] | |
] | |
] | |
also pos/1 remove pos | |
] | |
top: does [last stk] | |
reset: does [clear stk] | |
] | |
throw-error: func [/with msg [string! block!]][ | |
print rejoin [ | |
"*** Syntax Error: " either with [ | |
uppercase/part reform msg 1 | |
][ | |
reform ["Invalid" mold type "value"] | |
] | |
"^/*** line: " line | |
"^/*** at: " mold copy/part pos 40 | |
"^/*** in: " mold source-file | |
] | |
; either encap? [quit][halt] | |
] | |
pad-head: func [s [string!]][ | |
head insert/dup s #"0" 8 - length? s | |
] | |
encode-UTF8-char: func [s [string!] e [string!] /local c code new][ | |
c: debase/base pad-head copy/part s e 16 | |
while [c/1 = 0][c: next c] ;-- trim heading zeros | |
code: to integer! c | |
case [ | |
code <= 127 [ | |
new: to char! code ;-- c <= 7Fh | |
] | |
code <= 2047 [ ;-- c <= 07FFh | |
new: (shift/left (shift code 6) or #"^(C0)" 8) | |
or (code and #"^(3F)") or #"^(80)" | |
] | |
code <= 65535 [ ;-- c <= FFFFh | |
new: (shift/left (shift code 12) or #"^(E0)" 16) | |
or (shift/left (shift code 6) and #"^(3F)" or #"^(80)" 8) | |
or (code and #"^(3F)") or #"^(80)" | |
] | |
code <= 1114111 [ ;-- c <= 10FFFFh | |
new: (shift/left (shift code 18) or #"^(F0)" 24) | |
or (shift/left (shift code 12) and #"^(3F)" or #"^(80)" 16) | |
or (shift/left (shift code 6) and #"^(3F)" or #"^(80)" 8) | |
or (code and #"^(3F)") or #"^(80)" | |
] | |
'else [ | |
throw-error/with "Codepoints above U+10FFFF are not supported" | |
] | |
] | |
if integer? new [ | |
new: debase/base to-hex new 16 | |
remove-each byte new [byte = #"^(null)"] | |
] | |
new | |
] | |
decode-UTF8-char: func [value /redbin][ | |
if all [not redbin char? value][return encode-char to integer! value] | |
value: switch/default length? value [ | |
1 [value] | |
2 [ | |
value: value and #{1F3F} | |
value: add shift/left value/1 6 value/2 | |
] | |
3 [ | |
value: value and #{0F3F3F} | |
value: add add | |
shift/left value/1 12 | |
shift/left value/2 6 | |
value/3 | |
] | |
4 [ | |
value: value and #{073F3F3F} | |
value: add add add | |
shift/left value/1 18 | |
shift/left value/2 12 | |
shift/left value/3 6 | |
value/4 | |
] | |
][ | |
throw-error/with "Unsupported or invalid UTF-8 encoding" | |
] | |
either redbin [value][ | |
encode-char to integer! value ;-- special encoding for Unicode char! | |
] | |
] | |
decode-UTF8-string: func [str [string!] /local new s e][ | |
new: make string! length? str | |
parse/all str [ | |
some [ | |
s: UTF8-char e: ( | |
append new debase/base skip decode-UTF8-char as-binary copy/part s e 7 16 | |
) | |
] | |
] | |
head change/part str new tail str | |
] | |
encode-char: func [value [integer!]][ | |
head insert to-hex value #"'" | |
] | |
decode-hexa: func [s [string!]][ | |
to integer! debase/base s 16 | |
] | |
as-time: func [h [integer!] m [integer!] s [integer! decimal!] neg? [logic!] /local t][ | |
if any [all [h <> 0 m < 0] all [s s < 0]][type: time! throw-error] | |
t: to time! reduce [abs h abs m abs s] | |
either neg? [negate t][t] | |
] | |
to-weekday: func [d [date!] wd [integer!]][ | |
if negative? wd: wd - d/weekday [wd: 7 + wd] | |
d + wd | |
] | |
to-iso-week: func [d [date!] w [integer!] /local wd d1][ | |
d1: make date! reduce [1 1 d/year] | |
wd: d1/weekday | |
d1 + (w - 1 * 7 + (either wd < 5 [1][8]) - wd) | |
] | |
load-tuple: func [s [string!] /local new byte p e][ | |
new: join make issue! 1 + length? s #"~" | |
byte: [p: 1 3 digit e: (append new skip to-hex load copy/part p e 6)] | |
unless parse s [byte 2 11 [dot byte]][throw-error] | |
new | |
] | |
load-number: func [s [string!]][ | |
switch/default type [ | |
#[datatype! decimal!][s: load-decimal s] | |
#[datatype! issue! ][ | |
if s = "-0.0" [s: "0-"] ;-- re-encoded for consistency | |
s: to issue! either #"%" = last s [s][join "." s] | |
if neg? [append s #"-"] | |
] | |
][ | |
unless find [integer! decimal!] type?/word s: to integer! s [throw-error] | |
] | |
s | |
] | |
load-decimal: func [s [string!]][ | |
unless attempt [s: to decimal! s][throw-error] | |
s | |
] | |
load-string: func [s [string!] e [string!] /local new filter][ | |
new: make string! offset? s e ;-- allocated size close to final size | |
filter: get pick [UTF8-char UTF8-filtered-char] s/-1 = #"{" | |
parse/all/case copy/part s e [ | |
any [ | |
escaped-char (insert tail new value) | |
| s: filter e: (insert/part tail new s e) | |
] ;-- exit on matching " or } | |
] | |
new | |
] | |
load-binary: func [s [string!] e [string!] base [integer!] /local new str][ | |
new: make string! offset? s e ;-- allocated size above final size | |
parse/all/case s [ | |
some [ | |
copy str some base64-char (insert tail new str) | |
| ws | comment-rule | |
| #"}" end skip | |
] | |
] | |
new: debase/base new base | |
if none? new [throw-error] | |
new | |
] | |
load-file: func [s [string!]][ | |
parse s [any [#"%" [2 hexa | (pos: skip pos negate 1 + length? s throw-error)] | skip]] | |
to file! replace/all dehex s #"\" #"/" | |
] | |
encode-file: func [s [string!] e [string!]][ | |
replace/all copy/part s back e "%" "%25" | |
] | |
process: func [src [string! binary!] /local blk][ | |
old-line: line: 1 | |
count?: yes | |
blk: stack/allocate block! 100 ;-- root block | |
unless parse/all/case src program [throw-error] | |
stack/reset | |
blk | |
] | |
] |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment