actboy168 · December 6, 2021 04:57
diff --git a/utf16.lua b/utf16.lua
 local error = error
 local strchar = string.char
 local strbyte = string.byte
 local strmatch = string.match
 local utf8char = utf8.char
 local tconcat = table.concat

 -- Encode the low 16 bits of `code` as a two-byte string, most significant
 -- byte first (big-endian).
 local function be_tochar(code)
    local high = (code >> 8) & 0xFF
    local low = code & 0xFF
    return strchar(high, low)
 end

 -- Read the big-endian 16-bit code unit stored at bytes i and i+1 of s.
 -- Both bytes must exist; the caller is expected to check bounds.
 local function be_tobyte(s, i)
    local high, low = strbyte(s, i, i + 1)
    local shifted = high << 8
    return shifted | low
 end

 -- Encode the low 16 bits of `code` as a two-byte string, least significant
 -- byte first (little-endian).
 local function le_tochar(code)
    local low = code & 0xFF
    local high = (code >> 8) & 0xFF
    return strchar(low, high)
 end

 -- Read the little-endian 16-bit code unit stored at bytes i and i+1 of s.
 -- Both bytes must exist; the caller is expected to check bounds.
 local function le_tobyte(s, i)
    local low, high = strbyte(s, i, i + 1)
    local shifted = high << 8
    return shifted | low
 end

 -- Encode code point `code` as UTF-16, using tochar(unit) to serialise each
 -- 16-bit code unit. Values below 0x10000 become a single unit; anything
 -- else is split into a high/low surrogate pair.
 local function utf16char(tochar, code)
    if code < 0x10000 then
        return tochar(code)
    end
    local offset = code - 0x10000
    local high = tochar(0xD800 + (offset >> 10))
    local low = tochar(0xDC00 + (offset & 0x3FF))
    return high..low
 end

 -- Decode the UTF-16 code point that starts at byte offset n of s.
 -- tobyte(s, i) must return the 16-bit code unit stored at bytes i and i+1.
 -- Returns:
 --   nothing            when n is past the end of s (iteration finished)
 --   next_offset, code  for a well-formed code point
 --   next_offset        (code is nil) for an ill-formed sequence
 local function utf16next(s, n, tobyte)
    local len = #s
    if n > len then
        return
    end
    if n == len then
        -- A single trailing byte cannot form a code unit; calling tobyte here
        -- would read a missing byte and fail instead of reporting bad input.
        return n+1 --invalid
    end
    local code1 = tobyte(s, n)
    if code1 < 0xD800 or code1 >= 0xE000 then
        return n+2, code1
    elseif code1 < 0xDC00 then
        -- High surrogate: a complete low surrogate must follow.
        n = n + 2
        if n >= len then
            -- No complete code unit is left. Resume at n so that a dangling
            -- byte, if there is one, is reported by the next step.
            return n --invalid
        end
        local code2 = tobyte(s, n)
        if code2 < 0xDC00 or code2 >= 0xE000 then
            -- Not a low surrogate: resume at this unit so it is decoded alone.
            return n --invalid
        end
        local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
        return n+2, code
    else
        -- Low surrogate that is not preceded by a high surrogate.
        return n+2 --invalid
    end
 end

 -- Build a generic-for iterator triple over the UTF-16 string s.
 -- The control variable is a byte offset starting at 1; each step yields
 -- whatever utf16next returns for that offset, using tobyte to read units.
 local function utf16codes(s, tobyte)
    local function step(_, offset)
        return utf16next(s, offset, tobyte)
    end
    return step, s, 1
 end

 local utf8codepoint = utf8.codepoint
 -- Return the code point of the UTF-8 sequence that starts at byte n of s.
 -- The caller must already have checked that a well-formed sequence starts
 -- there; utf8.codepoint raises an error otherwise.
 -- utf8.codepoint is used instead of driving the iterator returned by
 -- utf8.codes by hand: the iterator's control-value contract is not
 -- documented, and it may also inspect the bytes that follow the character.
 local function utf8byte(s, n)
    -- Parentheses truncate to exactly one result.
    return (utf8codepoint(s, n))
 end

 --[[
  U+0000..  U+007F 00..7F
  U+0080..  U+07FF C2..DF 80..BF
  U+0800..  U+0FFF E0     A0..BF 80..BF
  U+1000..  U+CFFF E1..EC 80..BF 80..BF
  U+D000..  U+D7FF ED     80..9F 80..BF
  U+E000..  U+FFFF EE..EF 80..BF 80..BF
 U+10000.. U+3FFFF F0     90..BF 80..BF 80..BF
 U+40000.. U+FFFFF F1..F3 80..BF 80..BF 80..BF
 U+100000..U+10FFFF F4     80..8F 80..BF 80..BF
 ]]
 -- Well-formed UTF-8 byte sequences: { anchored pattern, length in bytes }.
 local utf8forms = {
    { "^[\0-\x7F]", 1 },
    { "^[\xC2-\xDF][\x80-\xBF]", 2 },
    { "^[\xE0][\xA0-\xBF][\x80-\xBF]", 3 },
    { "^[\xE1-\xEC][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xED][\x80-\x9F][\x80-\xBF]", 3 },
    { "^[\xEE-\xEF][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]", 4 },
 }

 -- Decode the UTF-8 sequence that starts at byte offset n of s.
 -- Returns nothing past the end of s; next_offset, code for a well-formed
 -- sequence; and next_offset alone (n+1) when no well-formed sequence
 -- starts at n.
 local function utf8next(s, n)
    if n > #s then
        return
    end
    for i = 1, #utf8forms do
        local form = utf8forms[i]
        if strmatch(s, form[1], n) then
            return n + form[2], utf8byte(s, n)
        end
    end
    return n+1 --invalid
 end

 -- Generic-for iterator triple over the UTF-8 string s: utf8next is the
 -- step function, s the state, and the byte offset starts at 1.
 local function utf8codes(s)
    local first = 1
    return utf8next, s, first
 end

 -- Module value: a factory for a UTF-16 <-> UTF-8 converter.
 --   what    "be" selects big-endian code units; any other value selects
 --           little-endian.
 --   replace optional code point substituted for every ill-formed sequence;
 --           when it is nil or false, ill-formed input raises an error.
 -- Returns a table with the functions toutf8(s) and fromutf8(s).
 return function (what, replace)
    local bigendian = what == "be"
    local tobyte = bigendian and be_tobyte or le_tobyte
    local tochar = bigendian and be_tochar or le_tochar
    -- The replacement is encoded once, up front, in both target encodings.
    local utf8replace  = replace and utf8char(replace)
    local utf16replace = replace and utf16char(tochar, replace)

    -- Convert a UTF-16 string to UTF-8.
    local function toutf8(s)
        local out, count = {}, 0
        for _, code in utf16codes(s, tobyte) do
            local piece
            if code ~= nil then
                piece = utf8char(code)
            elseif replace then
                piece = utf8replace
            else
                error "invalid UTF-16 code"
            end
            count = count + 1
            out[count] = piece
        end
        return tconcat(out)
    end

    -- Convert a UTF-8 string to UTF-16.
    local function fromutf8(s)
        local out, count = {}, 0
        for _, code in utf8codes(s) do
            local piece
            if code ~= nil then
                piece = utf16char(tochar, code)
            elseif replace then
                piece = utf16replace
            else
                error "invalid UTF-8 code"
            end
            count = count + 1
            out[count] = piece
        end
        return tconcat(out)
    end

    return {
        toutf8 = toutf8,
        fromutf8 = fromutf8,
    }
 end
	local error = error
	local strchar = string.char
	local strbyte = string.byte
	local strmatch = string.match
	local utf8char = utf8.char
	local tconcat = table.concat

	-- Encode the low 16 bits of `code` as a two-byte string, most significant
	-- byte first (big-endian).
	local function be_tochar(code)
		local high = (code >> 8) & 0xFF
		local low = code & 0xFF
		return strchar(high, low)
	end

	-- Read the big-endian 16-bit code unit stored at bytes i and i+1 of s.
	-- Both bytes must exist; the caller is expected to check bounds.
	local function be_tobyte(s, i)
		local h, l = strbyte(s, i, i+1)
		-- `|` is Lua's bitwise OR; the backslash-escaped form is a syntax error.
		return (h << 8) | l
	end

	-- Encode the low 16 bits of `code` as a two-byte string, least significant
	-- byte first (little-endian).
	local function le_tochar(code)
		local low = code & 0xFF
		local high = (code >> 8) & 0xFF
		return strchar(low, high)
	end

	-- Read the little-endian 16-bit code unit stored at bytes i and i+1 of s.
	-- Both bytes must exist; the caller is expected to check bounds.
	local function le_tobyte(s, i)
		local l, h = strbyte(s, i, i+1)
		-- `|` is Lua's bitwise OR; the backslash-escaped form is a syntax error.
		return (h << 8) | l
	end

	-- Encode code point `code` as UTF-16, using tochar(unit) to serialise each
	-- 16-bit code unit. Values below 0x10000 become a single unit; anything
	-- else is split into a high/low surrogate pair.
	local function utf16char(tochar, code)
		if code < 0x10000 then
			return tochar(code)
		end
		local offset = code - 0x10000
		local high = tochar(0xD800 + (offset >> 10))
		local low = tochar(0xDC00 + (offset & 0x3FF))
		return high..low
	end

	-- Decode the UTF-16 code point that starts at byte offset n of s.
	-- tobyte(s, i) must return the 16-bit code unit stored at bytes i and i+1.
	-- Returns:
	--   nothing            when n is past the end of s (iteration finished)
	--   next_offset, code  for a well-formed code point
	--   next_offset        (code is nil) for an ill-formed sequence
	local function utf16next(s, n, tobyte)
		local len = #s
		if n > len then
			return
		end
		if n == len then
			-- A single trailing byte cannot form a code unit; calling tobyte here
			-- would read a missing byte and fail instead of reporting bad input.
			return n+1 --invalid
		end
		local code1 = tobyte(s, n)
		if code1 < 0xD800 or code1 >= 0xE000 then
			return n+2, code1
		elseif code1 < 0xDC00 then
			-- High surrogate: a complete low surrogate must follow.
			n = n + 2
			if n >= len then
				-- No complete code unit is left. Resume at n so that a dangling
				-- byte, if there is one, is reported by the next step.
				return n --invalid
			end
			local code2 = tobyte(s, n)
			if code2 < 0xDC00 or code2 >= 0xE000 then
				-- Not a low surrogate: resume at this unit so it is decoded alone.
				return n --invalid
			end
			local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
			return n+2, code
		else
			-- Low surrogate that is not preceded by a high surrogate.
			return n+2 --invalid
		end
	end

	-- Build a generic-for iterator triple over the UTF-16 string s.
	-- The control variable is a byte offset starting at 1; each step yields
	-- whatever utf16next returns for that offset, using tobyte to read units.
	local function utf16codes(s, tobyte)
		local function step(_, offset)
			return utf16next(s, offset, tobyte)
		end
		return step, s, 1
	end

	local utf8codepoint = utf8.codepoint
	-- Return the code point of the UTF-8 sequence that starts at byte n of s.
	-- The caller must already have checked that a well-formed sequence starts
	-- there; utf8.codepoint raises an error otherwise.
	-- utf8.codepoint is used instead of driving the iterator returned by
	-- utf8.codes by hand: the iterator's control-value contract is not
	-- documented, and it may also inspect the bytes that follow the character.
	local function utf8byte(s, n)
		-- Parentheses truncate to exactly one result.
		return (utf8codepoint(s, n))
	end

	--[[
	U+0000.. U+007F 00..7F
	U+0080.. U+07FF C2..DF 80..BF
	U+0800.. U+0FFF E0 A0..BF 80..BF
	U+1000.. U+CFFF E1..EC 80..BF 80..BF
	U+D000.. U+D7FF ED 80..9F 80..BF
	U+E000.. U+FFFF EE..EF 80..BF 80..BF
	U+10000.. U+3FFFF F0 90..BF 80..BF 80..BF
	U+40000.. U+FFFFF F1..F3 80..BF 80..BF 80..BF
	U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
	]]
	-- Well-formed UTF-8 byte sequences: { anchored pattern, length in bytes }.
	local utf8forms = {
		{ "^[\0-\x7F]", 1 },
		{ "^[\xC2-\xDF][\x80-\xBF]", 2 },
		{ "^[\xE0][\xA0-\xBF][\x80-\xBF]", 3 },
		{ "^[\xE1-\xEC][\x80-\xBF][\x80-\xBF]", 3 },
		{ "^[\xED][\x80-\x9F][\x80-\xBF]", 3 },
		{ "^[\xEE-\xEF][\x80-\xBF][\x80-\xBF]", 3 },
		{ "^[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
		{ "^[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
		{ "^[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]", 4 },
	}

	-- Decode the UTF-8 sequence that starts at byte offset n of s.
	-- Returns nothing past the end of s; next_offset, code for a well-formed
	-- sequence; and next_offset alone (n+1) when no well-formed sequence
	-- starts at n.
	local function utf8next(s, n)
		if n > #s then
			return
		end
		for i = 1, #utf8forms do
			local form = utf8forms[i]
			if strmatch(s, form[1], n) then
				return n + form[2], utf8byte(s, n)
			end
		end
		return n+1 --invalid
	end

	-- Generic-for iterator triple over the UTF-8 string s: utf8next is the
	-- step function, s the state, and the byte offset starts at 1.
	local function utf8codes(s)
		local first = 1
		return utf8next, s, first
	end

	-- Module value: a factory for a UTF-16 <-> UTF-8 converter.
	--   what    "be" selects big-endian code units; any other value selects
	--           little-endian.
	--   replace optional code point substituted for every ill-formed sequence;
	--           when it is nil or false, ill-formed input raises an error.
	-- Returns a table with the functions toutf8(s) and fromutf8(s).
	return function (what, replace)
		local bigendian = what == "be"
		local tobyte = bigendian and be_tobyte or le_tobyte
		local tochar = bigendian and be_tochar or le_tochar
		-- The replacement is encoded once, up front, in both target encodings.
		local utf8replace = replace and utf8char(replace)
		local utf16replace = replace and utf16char(tochar, replace)

		-- Convert a UTF-16 string to UTF-8.
		local function toutf8(s)
			local out, count = {}, 0
			for _, code in utf16codes(s, tobyte) do
				local piece
				if code ~= nil then
					piece = utf8char(code)
				elseif replace then
					piece = utf8replace
				else
					error "invalid UTF-16 code"
				end
				count = count + 1
				out[count] = piece
			end
			return tconcat(out)
		end

		-- Convert a UTF-8 string to UTF-16.
		local function fromutf8(s)
			local out, count = {}, 0
			for _, code in utf8codes(s) do
				local piece
				if code ~= nil then
					piece = utf16char(tochar, code)
				elseif replace then
					piece = utf16replace
				else
					error "invalid UTF-8 code"
				end
				count = count + 1
				out[count] = piece
			end
			return tconcat(out)
		end

		return {
			toutf8 = toutf8,
			fromutf8 = fromutf8,
		}
	end