Skip to content

Instantly share code, notes, and snippets.

@PhilipRoman
Created April 28, 2026 07:14
Show Gist options
  • Select an option

  • Save PhilipRoman/e59495f542995e8166cfe6c24506a2ae to your computer and use it in GitHub Desktop.

Select an option

Save PhilipRoman/e59495f542995e8166cfe6c24506a2ae to your computer and use it in GitHub Desktop.
--[[
detect.lua — distinguish UTF-8 from Shift-JIS
Strategy (three layers):
1. Grammar validation
Parse the byte stream under each encoding's formal rules.
A single invalid byte or sequence is decisive: the string
cannot belong to that encoding.
2. Discriminating byte patterns
Even when both validations pass, certain byte values are
structurally exclusive:
· 0x81–0x9F Shift-JIS lead bytes; impossible as UTF-8
start bytes (that range is continuation-only).
· 0xA1–0xDF Shift-JIS half-width katakana (single-byte);
in valid UTF-8, 0xA1–0xBF appear only as continuation
bytes and 0xC2–0xDF only as lead bytes of 2-byte
sequences — never as isolated characters.
· EF BB BF UTF-8 BOM.
3. Evidence scoring
Walk each interpretation independently and sum weighted
evidence tokens. Normalize to a confidence in [0, 1].
UTF-8 weights:
+8 3-byte sequence decoding to U+3000–U+9FFF (CJK symbols,
hiragana, katakana, CJK unified ideographs — the bulk of
Japanese) or U+FF00–U+FFEF (halfwidth/fullwidth forms)
+2 other 3-byte sequence
+1 2-byte or 4-byte sequence
Shift-JIS weights:
+5 double-byte char, lead 0x81–0x9F (never a UTF-8 start)
+3 half-width katakana 0xA1–0xDF (standalone single byte)
+2 double-byte char, lead 0xE0–0xFC (weaker: overlaps
with UTF-8 3/4-byte lead range)
ML note: a byte-bigram Naïve Bayes trained on labelled Japanese
text would refine the ambiguous tail further, but requires a
frequency table derived from training data. For the specific
two-encoding problem the structural evidence above is nearly as
discriminating and needs no corpus.
--]]
local M = {}
-- ── grammar validators ──────────────────────────────────────────────────────
-- is_valid_utf8(s) → boolean
--
-- Returns true when every byte of s belongs to a well-formed UTF-8 sequence.
-- Rejected: stray continuation bytes, leads 0xC0/0xC1 and 0xF5–0xFF,
-- truncated sequences, overlong 3- and 4-byte forms, UTF-16 surrogates
-- (ED A0..BF xx) and code points above U+10FFFF (F4 90.. and up).
local function is_valid_utf8(s)
  local pos, size = 1, #s
  while pos <= size do
    local lead = s:byte(pos)
    if lead < 0x80 then
      -- plain ASCII
      pos = pos + 1
    else
      -- `extra` is the number of continuation bytes the lead announces;
      -- [lo, hi] is the range allowed for the FIRST continuation byte,
      -- narrowed for the leads where the generic 80–BF range would admit
      -- overlong forms, surrogates or out-of-range code points.
      local extra
      local lo, hi = 0x80, 0xBF
      if lead >= 0xC2 and lead <= 0xDF then
        extra = 1
      elseif lead >= 0xE0 and lead <= 0xEF then
        extra = 2
        if lead == 0xE0 then
          lo = 0xA0 -- below A0 would be overlong
        elseif lead == 0xED then
          hi = 0x9F -- A0 and above would be a surrogate
        end
      elseif lead >= 0xF0 and lead <= 0xF4 then
        extra = 3
        if lead == 0xF0 then
          lo = 0x90 -- below 90 would be overlong
        elseif lead == 0xF4 then
          hi = 0x8F -- above 8F would exceed U+10FFFF
        end
      else
        -- 0x80–0xBF (continuation-only), 0xC0–0xC1, 0xF5–0xFF
        return false
      end

      -- the whole sequence must fit inside the string
      if pos + extra > size then return false end

      local second = s:byte(pos + 1)
      if second < lo or second > hi then return false end

      -- remaining continuation bytes only need the generic 80–BF check
      for offset = 2, extra do
        local cont = s:byte(pos + offset)
        if cont < 0x80 or cont > 0xBF then return false end
      end

      pos = pos + extra + 1
    end
  end
  return true
end
-- is_valid_sjis(s) → boolean
--
-- Returns true when s parses cleanly under the Shift-JIS byte grammar used
-- here: ASCII (00–7F) and half-width katakana (A1–DF) are single bytes;
-- leads 81–9F and E0–FC must be followed by a trail byte in 40–7E or 80–FC.
-- Bytes 80, A0 and FD–FF are treated as undefined and make the string invalid.
local function is_valid_sjis(s)
  local pos, size = 1, #s
  while pos <= size do
    local lead = s:byte(pos)
    local is_single = lead < 0x80 or (lead > 0xA0 and lead < 0xE0)
    if is_single then
      pos = pos + 1
    else
      local is_double_lead = (lead > 0x80 and lead < 0xA0)
        or (lead > 0xDF and lead < 0xFD)
      -- what is left is 0x80, 0xA0 or 0xFD–0xFF: undefined in Shift-JIS
      if not is_double_lead then return false end

      -- s:byte past the end yields nil, i.e. the lead byte was truncated
      local trail = s:byte(pos + 1)
      if trail == nil or trail < 0x40 or trail == 0x7F or trail > 0xFC then
        return false
      end
      pos = pos + 2
    end
  end
  return true
end
-- ── evidence scorers ─────────────────────────────────────────────────────────
-- Each function walks s under its own encoding rules and sums evidence weights.
-- utf8_evidence(s) → score
--
-- Walks s as if it were UTF-8 and sums weighted evidence that it is Japanese
-- text in that encoding:
--   +8  complete 3-byte sequence decoding to U+3000–U+9FFF (CJK symbols,
--       hiragana, katakana, CJK unified ideographs) or U+FF00–U+FFEF
--       (halfwidth/fullwidth forms)
--   +2  any other complete 3-byte sequence
--   +1  2-byte lead (C2–DF) or 4-byte lead (F0 and above)
-- Continuation bytes are not validated here: the scanner simply steps over
-- the length implied by each lead byte.
local function utf8_evidence(s)
  local score, i, n = 0, 1, #s
  while i <= n do
    local b = s:byte(i)
    if b <= 0x7F then
      i = i + 1
    elseif b >= 0xC2 and b <= 0xDF then
      score = score + 1
      i = i + 2
    elseif b >= 0xE0 and b <= 0xEF then
      -- a 3-byte lead cut off by the end of the string scores nothing
      if i + 2 <= n then
        local b2, b3 = s:byte(i + 1), s:byte(i + 2)
        -- Decode the code point with plain arithmetic (no bit operators):
        -- low 4 bits of the lead, then 6 bits from each following byte.
        local cp = (b % 16) * 4096 + (b2 % 64) * 64 + (b3 % 64)
        -- The two trailing terms add at most 4095, so every lead E3–E9
        -- decodes into U+3000–U+9FFF; no separate tier for those leads
        -- is needed.
        if (cp >= 0x3000 and cp <= 0x9FFF)
          or (cp >= 0xFF00 and cp <= 0xFFEF) then
          score = score + 8
        else
          score = score + 2
        end
      end
      i = i + 3
    elseif b >= 0xF0 then
      score = score + 1
      i = i + 4
    else
      -- 0x80–0xC1: not a lead byte; step over it without scoring
      i = i + 1
    end
  end
  return score
end
-- sjis_evidence(s) → score
--
-- Walks s as if it were Shift-JIS and sums weighted evidence:
--   +5  double-byte lead 0x81–0x9F (never a UTF-8 start byte)
--   +3  single byte 0xA1–0xDF (half-width katakana)
--   +2  double-byte lead 0xE0–0xFC (overlaps the UTF-8 lead range)
-- Trail bytes are skipped, not validated. Every other byte scores 0.
local function sjis_evidence(s)
  local total, pos, size = 0, 1, #s
  while pos <= size do
    local lead = s:byte(pos)
    -- default: a byte that contributes nothing and occupies one position
    local weight, width = 0, 1
    if lead >= 0x80 then
      if lead >= 0xA1 and lead <= 0xDF then
        weight = 3
      elseif lead >= 0x81 and lead <= 0x9F then
        weight, width = 5, 2
      elseif lead >= 0xE0 and lead <= 0xFC then
        weight, width = 2, 2
      end
    end
    total = total + weight
    pos = pos + width
  end
  return total
end
-- ── public API ───────────────────────────────────────────────────────────────
-- detect(s) → encoding, confidence
--
-- s         : string of raw bytes to classify
-- encoding  : "utf-8" | "shift-jis" | "ascii" | "unknown" | "ambiguous"
-- confidence: 0.0–1.0
--
-- "ascii" — no bytes above 0x7F; both encodings are identical here.
-- "unknown" — neither encoding can parse the input without errors.
-- "ambiguous" — both parse cleanly with equal evidence (very rare in practice).
--
-- Confidence values:
--   1.0   empty or ASCII-only input, or input starting with the UTF-8 BOM
--         (the bytes after the BOM are not validated)
--   0.99  exactly one of the two grammars accepts the input
--   0.0   neither grammar accepts the input
--   0.5   both accept and the evidence scores are tied
--   else  the winner's share of the summed evidence scores
function M.detect(s)
  if #s == 0 then return "ascii", 1.0 end

  -- Explicit UTF-8 BOM (EF BB BF) is unambiguous
  if s:sub(1, 3) == "\239\187\191" then
    return "utf-8", 1.0
  end

  -- Pure ASCII is valid under both grammars, so it can be answered before
  -- running either validator. Decimal escapes are used instead of \xHH
  -- because Lua 5.1 has no hex escapes in string literals.
  if not s:find("[\128-\255]") then return "ascii", 1.0 end

  local ok8 = is_valid_utf8(s)
  local okSJ = is_valid_sjis(s)
  if ok8 and not okSJ then return "utf-8", 0.99 end
  if okSJ and not ok8 then return "shift-jis", 0.99 end
  if not ok8 and not okSJ then return "unknown", 0.0 end

  -- Both grammars accept the bytes: fall back to weighted evidence
  local s8 = utf8_evidence(s)
  local sSJ = sjis_evidence(s)
  local total = s8 + sSJ
  if total == 0 then return "ambiguous", 0.5 end -- also guards the division
  if s8 > sSJ then
    return "utf-8", s8 / total
  elseif sSJ > s8 then
    return "shift-jis", sSJ / total
  else
    return "ambiguous", 0.5
  end
end
-- detect_file(path) → encoding, confidence (or nil, errmsg)
--
-- Reads the whole file at `path` in binary mode and classifies its bytes
-- with M.detect. Returns nil plus an error message when the file cannot be
-- opened or its contents cannot be read.
function M.detect_file(path)
  local f, open_err = io.open(path, "rb")
  if not f then return nil, open_err end
  -- A read can still fail after a successful open (on some platforms, for
  -- example, when path names a directory); read then yields nil plus a
  -- message rather than a string, which must not be passed on to M.detect.
  local data, read_err = f:read("*a")
  f:close()
  if not data then return nil, read_err end
  return M.detect(data)
end
return M
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment