PhilipRoman · April 28, 2026 07:14
diff --git a/detect.lua b/detect.lua
 --[[
  detect.lua — distinguish UTF-8 from Shift-JIS

  Strategy (three layers):

  1. Grammar validation
     Parse the byte stream under each encoding's formal rules.
     A single invalid byte or sequence is decisive: the string
     cannot belong to that encoding.

  2. Discriminating byte patterns
     Even when both validations pass, certain byte values are
     structurally exclusive:
       · 0x81–0x9F  Shift-JIS lead bytes; impossible as UTF-8
                    start bytes (that range is continuation-only).
       · 0xA1–0xDF  Shift-JIS half-width katakana (single-byte);
                    in valid UTF-8, 0xA1–0xBF occur only as
                    continuation bytes and 0xC2–0xDF only as 2-byte
                    lead bytes — never as standalone characters.
       · EF BB BF   UTF-8 BOM.

  3. Evidence scoring
     Walk each interpretation independently and sum weighted
     evidence tokens.  Normalize to a confidence in [0, 1].

     UTF-8  weights:
       +8  3-byte sequence decoding to U+3000–U+9FFF (lead E3–E9:
           hiragana, katakana, most CJK unified ideographs — the
           bulk of Japanese) or to U+FF00–U+FFEF (halfwidth and
           fullwidth forms)
       +2  other 3-byte sequence
       +1  2-byte or 4-byte sequence

     Shift-JIS weights:
       +5  double-byte char, lead 0x81–0x9F (never a UTF-8 start)
       +3  half-width katakana 0xA1–0xDF (standalone single byte)
       +2  double-byte char, lead 0xE0–0xFC (weaker: overlaps
           with UTF-8 3/4-byte lead range)

  ML note: a byte-bigram Naïve Bayes trained on labelled Japanese
  text would refine the ambiguous tail further, but requires a
  frequency table derived from training data.  For the specific
  two-encoding problem the structural evidence above is nearly as
  discriminating and needs no corpus.
 --]]

 local M = {}

 -- ── grammar validators ──────────────────────────────────────────────────────

 -- Report whether the byte string s is well-formed UTF-8.
 -- Rejects stray continuation bytes, the overlong leads C0/C1, leads above
 -- F4, truncated sequences, overlong 3- and 4-byte forms, UTF-16 surrogates
 -- (ED A0..BF ..) and code points beyond U+10FFFF.
 -- Returns true or false; the empty string is valid.
 local function is_valid_utf8(s)
    local total = #s
    local pos = 1
    while pos <= total do
        local lead = s:byte(pos)

        -- Sequence width is fixed by the lead byte.
        local width
        if lead < 0x80 then
            width = 1
        elseif lead < 0xC2 then
            return false            -- 0x80-0xBF continuation, 0xC0-0xC1 overlong
        elseif lead < 0xE0 then
            width = 2
        elseif lead < 0xF0 then
            width = 3
        elseif lead < 0xF5 then
            width = 4
        else
            return false            -- 0xF5-0xFF never appear in UTF-8
        end

        local last = pos + width - 1
        if last > total then
            return false            -- sequence runs off the end of the input
        end

        -- Every byte after the lead must be a continuation byte (10xxxxxx).
        for j = pos + 1, last do
            local cont = s:byte(j)
            if not (cont >= 0x80 and cont <= 0xBF) then
                return false
            end
        end

        -- The second byte carries the extra range restrictions.
        if width == 3 then
            local second = s:byte(pos + 1)
            if (lead == 0xE0 and second < 0xA0)         -- overlong
               or (lead == 0xED and second >= 0xA0) then -- surrogates
                return false
            end
        elseif width == 4 then
            local second = s:byte(pos + 1)
            if (lead == 0xF0 and second < 0x90)         -- overlong
               or (lead == 0xF4 and second > 0x8F) then  -- > U+10FFFF
                return false
            end
        end

        pos = last + 1
    end
    return true
 end

 -- Report whether the byte string s parses as Shift-JIS.
 -- Single bytes: 0x00-0x7F and half-width katakana 0xA1-0xDF.
 -- Double bytes: lead 0x81-0x9F or 0xE0-0xFC followed by a trail byte in
 -- 0x40-0x7E or 0x80-0xFC.  Anything else (0x80, 0xA0, 0xFD-0xFF, a lead
 -- with no trail, a bad trail) makes the input invalid.
 -- Returns true or false; the empty string is valid.
 local function is_valid_sjis(s)
    local total = #s
    local pos = 1
    while pos <= total do
        local lead = s:byte(pos)
        local single = lead < 0x80 or (lead >= 0xA1 and lead <= 0xDF)
        if single then
            pos = pos + 1
        else
            local is_lead = (lead >= 0x81 and lead <= 0x9F)
                         or (lead >= 0xE0 and lead <= 0xFC)
            if not is_lead then
                return false        -- 0x80, 0xA0, 0xFD-0xFF are undefined
            end
            if pos == total then
                return false        -- lead byte with nothing after it
            end
            -- Valid trail bytes are 0x40-0xFC with the single hole 0x7F.
            local trail = s:byte(pos + 1)
            if trail < 0x40 or trail > 0xFC or trail == 0x7F then
                return false
            end
            pos = pos + 2
        end
    end
    return true
 end

 -- ── evidence scorers ─────────────────────────────────────────────────────────
 -- Each function walks s under its own encoding rules and sums evidence weights.

 -- Sum the UTF-8 evidence weights found while walking s as UTF-8.
 -- Weights:
 --   +8  3-byte sequence decoding to U+3000-U+9FFF (CJK symbols, hiragana,
 --       katakana, CJK unified ideographs) or U+FF00-U+FFEF (halfwidth and
 --       fullwidth forms)
 --   +2  any other 3-byte sequence
 --   +1  2-byte sequence, or a byte >= 0xF0 (treated as a 4-byte lead)
 -- No validation is performed here: continuation bytes are not checked, and
 -- a 3-byte lead with fewer than two bytes after it scores nothing.
 -- Returns the accumulated score (a non-negative integer).
 local function utf8_evidence(s)
    local score, i, n = 0, 1, #s
    while i <= n do
        local b = s:byte(i)
        if b <= 0x7F then
            i = i + 1
        elseif b >= 0xC2 and b <= 0xDF then
            score = score + 1
            i = i + 2
        elseif b >= 0xE0 and b <= 0xEF then
            if i + 2 <= n then
                local b2, b3 = s:byte(i + 1, i + 2)
                -- Decode the code point with arithmetic instead of bit
                -- operators: low 4 bits of the lead, low 6 bits of each
                -- following byte.
                local cp = (b % 16) * 4096 + (b2 % 64) * 64 + (b3 % 64)
                -- A lead in E3-E9 always decodes into U+3000-U+9FFF, so no
                -- separate lead-byte tier is needed after this test.
                if (cp >= 0x3000 and cp <= 0x9FFF)
                   or (cp >= 0xFF00 and cp <= 0xFFEF) then
                    score = score + 8
                else
                    score = score + 2
                end
            end
            i = i + 3
        elseif b >= 0xF0 then
            score = score + 1
            i = i + 4
        else
            -- 0x80-0xC1: not a lead byte; step over it without scoring.
            i = i + 1
        end
    end
    return score
 end

 -- Sum the Shift-JIS evidence weights found while walking s as Shift-JIS.
 -- Weights:
 --   +5  double-byte character with lead 0x81-0x9F
 --   +3  half-width katakana 0xA1-0xDF (single byte)
 --   +2  double-byte character with lead 0xE0-0xFC
 -- Trail bytes are skipped without being checked; every other byte is
 -- stepped over with no score.  Returns the accumulated score.
 local function sjis_evidence(s)
    local total, pos, evidence = #s, 1, 0
    while pos <= total do
        local lead = s:byte(pos)
        -- Default: a byte that contributes nothing and occupies one position.
        local gain, step = 0, 1
        if lead >= 0x81 and lead <= 0x9F then
            gain, step = 5, 2
        elseif lead >= 0xA1 and lead <= 0xDF then
            gain = 3
        elseif lead >= 0xE0 and lead <= 0xFC then
            gain, step = 2, 2
        end
        evidence = evidence + gain
        pos = pos + step
    end
    return evidence
 end

 -- ── public API ───────────────────────────────────────────────────────────────

 -- detect(s) → encoding, confidence
 --
 -- encoding  : "utf-8" | "shift-jis" | "ascii" | "unknown" | "ambiguous"
 -- confidence: 0.0–1.0
 --
 -- "ascii"     — no bytes above 0x7F; both encodings are identical here.
 -- "unknown"   — neither encoding can parse the input without errors.
 -- "ambiguous" — both parse cleanly with equal evidence (very rare in practice).
 function M.detect(s)
    if #s == 0 then return "ascii", 1.0 end

    -- An explicit UTF-8 BOM (EF BB BF) is treated as decisive; the rest of
    -- the input is not validated in that case.
    local b1, b2, b3 = s:byte(1, 3)
    if b1 == 0xEF and b2 == 0xBB and b3 == 0xBF then
        return "utf-8", 1.0
    end

    -- Pure 7-bit input is valid under both encodings, so it can be answered
    -- before running either validator.  The class is written with decimal
    -- escapes because Lua 5.1 does not understand "\xNN" (it would read the
    -- pattern as the literal characters x, 8, 0, ...).
    if not s:find("[\128-\255]") then return "ascii", 1.0 end

    local ok8  = is_valid_utf8(s)
    local okSJ = is_valid_sjis(s)

    -- Exactly one grammar accepts the input: that grammar wins.
    if ok8  and not okSJ then return "utf-8",     0.99 end
    if okSJ and not ok8  then return "shift-jis", 0.99 end
    if not ok8 and not okSJ then return "unknown", 0.0 end

    -- Both grammars accept the input: fall back to weighted evidence.  The
    -- confidence is the winner's share of the combined score.
    local s8  = utf8_evidence(s)
    local sSJ = sjis_evidence(s)
    local total = s8 + sSJ

    if total == 0 or s8 == sSJ then return "ambiguous", 0.5 end

    if s8 > sSJ then
        return "utf-8", s8 / total
    end
    return "shift-jis", sSJ / total
 end

 -- detect_file(path) → encoding, confidence   (or nil, errmsg)
 function M.detect_file(path)
    -- Binary mode so no newline translation alters the bytes being classified.
    local f, open_err = io.open(path, "rb")
    if not f then return nil, open_err end

    -- Reading can fail even after a successful open (for example when path
    -- names a directory); report that as nil, errmsg instead of passing nil
    -- on to M.detect.
    local data, read_err = f:read("*a")
    f:close()
    if data == nil then
        return nil, read_err or ("cannot read " .. tostring(path))
    end

    return M.detect(data)
 end

 return M
	--[[
	detect.lua — distinguish UTF-8 from Shift-JIS

	Strategy (three layers):

	1. Grammar validation
	Parse the byte stream under each encoding's formal rules.
	A single invalid byte or sequence is decisive: the string
	cannot belong to that encoding.

	2. Discriminating byte patterns
	Even when both validations pass, certain byte values are
	structurally exclusive:
	· 0x81–0x9F Shift-JIS lead bytes; impossible as UTF-8
	start bytes (that range is continuation-only).
	· 0xA1–0xDF Shift-JIS half-width katakana (single-byte);
	in valid UTF-8, 0xA1–0xBF occur only as
	continuation bytes and 0xC2–0xDF only as 2-byte
	lead bytes — never as standalone characters.
	· EF BB BF UTF-8 BOM.

	3. Evidence scoring
	Walk each interpretation independently and sum weighted
	evidence tokens. Normalize to a confidence in [0, 1].

	UTF-8 weights:
	+8 3-byte sequence decoding to U+3000–U+9FFF (lead E3–E9:
	hiragana, katakana, most CJK unified ideographs — the
	bulk of Japanese) or to U+FF00–U+FFEF (halfwidth and
	fullwidth forms)
	+2 other 3-byte sequence
	+1 2-byte or 4-byte sequence

	Shift-JIS weights:
	+5 double-byte char, lead 0x81–0x9F (never a UTF-8 start)
	+3 half-width katakana 0xA1–0xDF (standalone single byte)
	+2 double-byte char, lead 0xE0–0xFC (weaker: overlaps
	with UTF-8 3/4-byte lead range)

	ML note: a byte-bigram Naïve Bayes trained on labelled Japanese
	text would refine the ambiguous tail further, but requires a
	frequency table derived from training data. For the specific
	two-encoding problem the structural evidence above is nearly as
	discriminating and needs no corpus.
	--]]

	local M = {}

	-- ── grammar validators ──────────────────────────────────────────────────────

	-- Report whether the byte string s is well-formed UTF-8.
	-- Rejects stray continuation bytes, the overlong leads C0/C1, leads above
	-- F4, truncated sequences, overlong 3- and 4-byte forms, UTF-16 surrogates
	-- (ED A0..BF ..) and code points beyond U+10FFFF.
	-- Returns true or false; the empty string is valid.
	local function is_valid_utf8(s)
		local total = #s
		local pos = 1
		while pos <= total do
			local lead = s:byte(pos)

			-- Sequence width is fixed by the lead byte.
			local width
			if lead < 0x80 then
				width = 1
			elseif lead < 0xC2 then
				return false -- 0x80-0xBF continuation, 0xC0-0xC1 overlong
			elseif lead < 0xE0 then
				width = 2
			elseif lead < 0xF0 then
				width = 3
			elseif lead < 0xF5 then
				width = 4
			else
				return false -- 0xF5-0xFF never appear in UTF-8
			end

			local last = pos + width - 1
			if last > total then
				return false -- sequence runs off the end of the input
			end

			-- Every byte after the lead must be a continuation byte (10xxxxxx).
			for j = pos + 1, last do
				local cont = s:byte(j)
				if not (cont >= 0x80 and cont <= 0xBF) then
					return false
				end
			end

			-- The second byte carries the extra range restrictions.
			if width == 3 then
				local second = s:byte(pos + 1)
				if (lead == 0xE0 and second < 0xA0) -- overlong
					or (lead == 0xED and second >= 0xA0) then -- surrogates
					return false
				end
			elseif width == 4 then
				local second = s:byte(pos + 1)
				if (lead == 0xF0 and second < 0x90) -- overlong
					or (lead == 0xF4 and second > 0x8F) then -- > U+10FFFF
					return false
				end
			end

			pos = last + 1
		end
		return true
	end

	-- Report whether the byte string s parses as Shift-JIS.
	-- Single bytes: 0x00-0x7F and half-width katakana 0xA1-0xDF.
	-- Double bytes: lead 0x81-0x9F or 0xE0-0xFC followed by a trail byte in
	-- 0x40-0x7E or 0x80-0xFC.  Anything else (0x80, 0xA0, 0xFD-0xFF, a lead
	-- with no trail, a bad trail) makes the input invalid.
	-- Returns true or false; the empty string is valid.
	local function is_valid_sjis(s)
		local total = #s
		local pos = 1
		while pos <= total do
			local lead = s:byte(pos)
			local single = lead < 0x80 or (lead >= 0xA1 and lead <= 0xDF)
			if single then
				pos = pos + 1
			else
				local is_lead = (lead >= 0x81 and lead <= 0x9F)
					or (lead >= 0xE0 and lead <= 0xFC)
				if not is_lead then
					return false -- 0x80, 0xA0, 0xFD-0xFF are undefined
				end
				if pos == total then
					return false -- lead byte with nothing after it
				end
				-- Valid trail bytes are 0x40-0xFC with the single hole 0x7F.
				local trail = s:byte(pos + 1)
				if trail < 0x40 or trail > 0xFC or trail == 0x7F then
					return false
				end
				pos = pos + 2
			end
		end
		return true
	end

	-- ── evidence scorers ─────────────────────────────────────────────────────────
	-- Each function walks s under its own encoding rules and sums evidence weights.

	-- Sum the UTF-8 evidence weights found while walking s as UTF-8.
	-- Weights:
	--   +8  3-byte sequence decoding to U+3000-U+9FFF (CJK symbols, hiragana,
	--       katakana, CJK unified ideographs) or U+FF00-U+FFEF (halfwidth and
	--       fullwidth forms)
	--   +2  any other 3-byte sequence
	--   +1  2-byte sequence, or a byte >= 0xF0 (treated as a 4-byte lead)
	-- No validation is performed here: continuation bytes are not checked, and
	-- a 3-byte lead with fewer than two bytes after it scores nothing.
	-- Returns the accumulated score (a non-negative integer).
	local function utf8_evidence(s)
		local score, i, n = 0, 1, #s
		while i <= n do
			local b = s:byte(i)
			if b <= 0x7F then
				i = i + 1
			elseif b >= 0xC2 and b <= 0xDF then
				score = score + 1
				i = i + 2
			elseif b >= 0xE0 and b <= 0xEF then
				if i + 2 <= n then
					local b2, b3 = s:byte(i + 1, i + 2)
					-- Decode the code point with arithmetic instead of bit
					-- operators: low 4 bits of the lead, low 6 bits of each
					-- following byte.
					local cp = (b % 16) * 4096 + (b2 % 64) * 64 + (b3 % 64)
					-- A lead in E3-E9 always decodes into U+3000-U+9FFF, so no
					-- separate lead-byte tier is needed after this test.
					if (cp >= 0x3000 and cp <= 0x9FFF)
						or (cp >= 0xFF00 and cp <= 0xFFEF) then
						score = score + 8
					else
						score = score + 2
					end
				end
				i = i + 3
			elseif b >= 0xF0 then
				score = score + 1
				i = i + 4
			else
				-- 0x80-0xC1: not a lead byte; step over it without scoring.
				i = i + 1
			end
		end
		return score
	end

	-- Sum the Shift-JIS evidence weights found while walking s as Shift-JIS.
	-- Weights:
	--   +5  double-byte character with lead 0x81-0x9F
	--   +3  half-width katakana 0xA1-0xDF (single byte)
	--   +2  double-byte character with lead 0xE0-0xFC
	-- Trail bytes are skipped without being checked; every other byte is
	-- stepped over with no score.  Returns the accumulated score.
	local function sjis_evidence(s)
		local total, pos, evidence = #s, 1, 0
		while pos <= total do
			local lead = s:byte(pos)
			-- Default: a byte that contributes nothing and occupies one position.
			local gain, step = 0, 1
			if lead >= 0x81 and lead <= 0x9F then
				gain, step = 5, 2
			elseif lead >= 0xA1 and lead <= 0xDF then
				gain = 3
			elseif lead >= 0xE0 and lead <= 0xFC then
				gain, step = 2, 2
			end
			evidence = evidence + gain
			pos = pos + step
		end
		return evidence
	end

	-- ── public API ───────────────────────────────────────────────────────────────

	-- detect(s) → encoding, confidence
	--
	-- encoding : "utf-8" | "shift-jis" | "ascii" | "unknown" | "ambiguous"
	-- confidence: 0.0–1.0
	--
	-- "ascii" — no bytes above 0x7F; both encodings are identical here.
	-- "unknown" — neither encoding can parse the input without errors.
	-- "ambiguous" — both parse cleanly with equal evidence (very rare in practice).
	function M.detect(s)
		if #s == 0 then return "ascii", 1.0 end

		-- An explicit UTF-8 BOM (EF BB BF) is treated as decisive; the rest of
		-- the input is not validated in that case.
		local b1, b2, b3 = s:byte(1, 3)
		if b1 == 0xEF and b2 == 0xBB and b3 == 0xBF then
			return "utf-8", 1.0
		end

		-- Pure 7-bit input is valid under both encodings, so it can be answered
		-- before running either validator.  The class is written with decimal
		-- escapes because Lua 5.1 does not understand "\xNN" (it would read the
		-- pattern as the literal characters x, 8, 0, ...).
		if not s:find("[\128-\255]") then return "ascii", 1.0 end

		local ok8 = is_valid_utf8(s)
		local okSJ = is_valid_sjis(s)

		-- Exactly one grammar accepts the input: that grammar wins.
		if ok8 and not okSJ then return "utf-8", 0.99 end
		if okSJ and not ok8 then return "shift-jis", 0.99 end
		if not ok8 and not okSJ then return "unknown", 0.0 end

		-- Both grammars accept the input: fall back to weighted evidence.  The
		-- confidence is the winner's share of the combined score.
		local s8 = utf8_evidence(s)
		local sSJ = sjis_evidence(s)
		local total = s8 + sSJ

		if total == 0 or s8 == sSJ then return "ambiguous", 0.5 end

		if s8 > sSJ then
			return "utf-8", s8 / total
		end
		return "shift-jis", sSJ / total
	end

	-- detect_file(path) → encoding, confidence (or nil, errmsg)
	function M.detect_file(path)
		-- Binary mode so no newline translation alters the bytes being classified.
		local f, open_err = io.open(path, "rb")
		if not f then return nil, open_err end

		-- Reading can fail even after a successful open (for example when path
		-- names a directory); report that as nil, errmsg instead of passing nil
		-- on to M.detect.
		local data, read_err = f:read("*a")
		f:close()
		if data == nil then
			return nil, read_err or ("cannot read " .. tostring(path))
		end

		return M.detect(data)
	end

	return M
No results found