Last active
December 6, 2021 04:57
-
-
Save actboy168/9e06b1214858a5e8973f064fe1184141 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Cache the standard-library functions used by this module in locals so the
-- conversion loops below do not go through global/table lookups each time.
local error = error
local strchar = string.char
local strbyte = string.byte
local strmatch = string.match
local utf8char = utf8.char
local tconcat = table.concat
-- Serialise one 16-bit code unit as two bytes, most significant byte first
-- (big-endian). Only the low 16 bits of `code` are emitted.
local function be_tochar(code)
    return strchar((code >> 8) & 0xFF, code & 0xFF)
end
-- Read the big-endian 16-bit code unit stored at byte offsets i and i+1 of s.
-- Both bytes must exist: if s ends at offset i, the low byte is nil and the
-- bitwise expression raises, so callers have to check the length first.
local function be_tobyte(s, i)
    local h, l = strbyte(s, i, i + 1)
    return (h << 8) | l
end
-- Serialise one 16-bit code unit as two bytes, least significant byte first
-- (little-endian). Only the low 16 bits of `code` are emitted.
local function le_tochar(code)
    return strchar(code & 0xFF, (code >> 8) & 0xFF)
end
-- Read the little-endian 16-bit code unit stored at byte offsets i and i+1
-- of s. Both bytes must exist: if s ends at offset i, the high byte is nil
-- and the bitwise expression raises, so callers have to check the length
-- first.
local function le_tobyte(s, i)
    local l, h = strbyte(s, i, i + 1)
    return (h << 8) | l
end
-- Encode one code point as UTF-16 bytes.
--   tochar: function(code_unit) -> 2-byte string; selects the byte order.
--   code:   integer code point.
-- Code points below 0x10000 become a single code unit; anything else is
-- split into a high (0xD800-based) and low (0xDC00-based) surrogate pair.
-- No range or surrogate validation is performed on `code`.
local function utf16char(tochar, code)
    if code < 0x10000 then
        return tochar(code)
    end
    local offset = code - 0x10000
    local high = 0xD800 + (offset >> 10)   -- top 10 bits
    local low = 0xDC00 + (offset & 0x3FF)  -- bottom 10 bits
    return tochar(high) .. tochar(low)
end
-- Step function for iterating UTF-16 text.
--   s:      byte string holding UTF-16 data.
--   n:      1-based byte offset of the code unit to decode.
--   tobyte: function(s, i) -> 16-bit code unit at offsets i, i+1.
-- Returns nothing once n is past the end of s. Otherwise returns the offset
-- to continue from, followed by the decoded code point; the code point is
-- omitted (nil) when the data at n is invalid.
local function utf16next(s, n, tobyte)
    local len = #s
    if n > len then
        return
    end
    if n + 1 > len then
        -- A single trailing byte is not a complete code unit. Report it as
        -- invalid instead of letting tobyte fail on a missing byte.
        return n + 2 -- invalid
    end
    local code1 = tobyte(s, n)
    if code1 < 0xD800 or code1 >= 0xE000 then
        -- Ordinary BMP code unit.
        return n + 2, code1
    end
    if code1 >= 0xDC00 then
        -- Low surrogate with no preceding high surrogate.
        return n + 2 -- invalid
    end
    -- code1 is a high surrogate: a complete low surrogate must follow.
    local m = n + 2
    if m + 1 > len then
        -- Input ends (or leaves only one dangling byte) after the high
        -- surrogate. Resume at m so a dangling byte is reported separately.
        return m -- invalid
    end
    local code2 = tobyte(s, m)
    if code2 < 0xDC00 or code2 >= 0xE000 then
        -- Not a low surrogate: resume at m so that unit is decoded by itself.
        return m -- invalid
    end
    local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
    return m + 2, code
end
-- Build a generic-for iterator over the UTF-16 data in s.
--   tobyte: function(s, i) -> 16-bit code unit (selects the byte order).
-- Each step yields (next_offset, code_point); code_point is nil when the
-- data at the current offset is invalid. Iteration starts at offset 1.
local function utf16codes(s, tobyte)
    -- The step closes over s and tobyte, so the state argument is ignored.
    local function step(_, n)
        return utf16next(s, n, tobyte)
    end
    return step, s, 1
end
-- Decode the code point whose first byte is at byte offset n of s.
-- The caller must already have checked that a well-formed sequence starts
-- at n. utf8.codepoint inspects only that one sequence; the utf8.codes
-- iterator is not used because it also raises when the byte *after* the
-- decoded character is a stray continuation byte (e.g. "A\x80").
local function utf8byte(s, n)
    return utf8.codepoint(s, n)
end

--[[
Well-formed UTF-8 byte sequences:
    U+0000..  U+007F     00..7F
    U+0080..  U+07FF     C2..DF 80..BF
    U+0800..  U+0FFF     E0     A0..BF 80..BF
    U+1000..  U+CFFF     E1..EC 80..BF 80..BF
    U+D000..  U+D7FF     ED     80..9F 80..BF
    U+E000..  U+FFFF     EE..EF 80..BF 80..BF
    U+10000.. U+3FFFF    F0     90..BF 80..BF 80..BF
    U+40000.. U+FFFFF    F1..F3 80..BF 80..BF 80..BF
    U+100000..U+10FFFF   F4     80..8F 80..BF 80..BF
]]
-- One entry per row of the table above: { anchored pattern, byte length }.
local utf8forms = {
    { "^[\0-\x7F]", 1 },
    { "^[\xC2-\xDF][\x80-\xBF]", 2 },
    { "^[\xE0][\xA0-\xBF][\x80-\xBF]", 3 },
    { "^[\xE1-\xEC][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xED][\x80-\x9F][\x80-\xBF]", 3 },
    { "^[\xEE-\xEF][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]", 4 },
}

-- Step function for iterating UTF-8 text.
--   s: byte string holding UTF-8 data.
--   n: 1-based byte offset of the sequence to decode.
-- Returns nothing once n is past the end of s. Otherwise returns the offset
-- to continue from, followed by the decoded code point. When no well-formed
-- sequence starts at n, only n+1 is returned (code point nil), so a single
-- offending byte is skipped.
local function utf8next(s, n)
    if n > #s then
        return
    end
    for i = 1, #utf8forms do
        local form = utf8forms[i]
        if strmatch(s, form[1], n) then
            return n + form[2], utf8byte(s, n)
        end
    end
    return n + 1 -- invalid
end
-- Build a generic-for iterator over the UTF-8 data in s, starting at byte
-- offset 1. Each step is utf8next(s, offset): it yields (next_offset,
-- code_point), with code_point nil where the input is not well-formed.
local function utf8codes(s)
    return utf8next, s, 1
end
-- Module value: a factory that builds a UTF-8 <-> UTF-16 converter.
--   what:    "be" selects big-endian UTF-16; any other value selects
--            little-endian.
--   replace: optional code point substituted for every invalid sequence.
--            When it is nil/false, invalid input raises an error instead.
-- Returns a table with two functions:
--   toutf8(s):   UTF-16 byte string -> UTF-8 string.
--   fromutf8(s): UTF-8 string -> UTF-16 byte string (no BOM is written).
return function (what, replace)
    local tobyte, tochar
    if what == "be" then
        tobyte, tochar = be_tobyte, be_tochar
    else
        tobyte, tochar = le_tobyte, le_tochar
    end

    -- Encode the replacement once, in both target encodings.
    local utf8replace = replace and utf8char(replace)
    local utf16replace = replace and utf16char(tochar, replace)

    local function toutf8(s)
        local r = {}
        for _, code in utf16codes(s, tobyte) do
            if code ~= nil then
                r[#r + 1] = utf8char(code)
            elseif replace then
                r[#r + 1] = utf8replace
            else
                error "invalid UTF-16 code"
            end
        end
        return tconcat(r)
    end

    local function fromutf8(s)
        local r = {}
        for _, code in utf8codes(s) do
            if code ~= nil then
                r[#r + 1] = utf16char(tochar, code)
            elseif replace then
                r[#r + 1] = utf16replace
            else
                error "invalid UTF-8 code"
            end
        end
        return tconcat(r)
    end

    return {
        toutf8 = toutf8,
        fromutf8 = fromutf8,
    }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment