Created
April 28, 2026 07:14
-
-
Save PhilipRoman/e59495f542995e8166cfe6c24506a2ae to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --[[ | |
| detect.lua — distinguish UTF-8 from Shift-JIS | |
| Strategy (three layers): | |
| 1. Grammar validation | |
| Parse the byte stream under each encoding's formal rules. | |
| A single invalid byte or sequence is decisive: the string | |
| cannot belong to that encoding. | |
| 2. Discriminating byte patterns | |
| Even when both validations pass, certain byte values are | |
| structurally exclusive: | |
| · 0x81–0x9F Shift-JIS lead bytes; impossible as UTF-8 | |
| start bytes (that range is continuation-only). | |
| · 0xA1–0xDF Shift-JIS half-width katakana (single-byte); | |
| these bytes appear in valid UTF-8 only as | |
| continuation bytes buried inside multi-byte | |
| sequences — never as isolated characters. | |
| · EF BB BF UTF-8 BOM. | |
| 3. Evidence scoring | |
| Walk each interpretation independently and sum weighted | |
| evidence tokens. Normalize to a confidence in [0, 1]. | |
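| UTF-8 weights: | |
| +8 3-byte sequence decoding to U+3000–U+9FFF or U+FF00–U+FFEF | |
| (CJK symbols, hiragana, katakana, CJK unified ideographs, | |
| half-/full-width forms — the bulk of Japanese; this covers | |
| every sequence whose lead byte is in E3–E9) | |
| +2 other 3-byte sequence | |
| +1 2-byte or 4-byte sequence | |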
| Shift-JIS weights: | |
| +5 double-byte char, lead 0x81–0x9F (never a UTF-8 start) | |
| +3 half-width katakana 0xA1–0xDF (standalone single byte) | |
| +2 double-byte char, lead 0xE0–0xFC (weaker: overlaps | |
| with UTF-8 3/4-byte lead range) | |
| ML note: a byte-bigram Naïve Bayes trained on labelled Japanese | |
| text would refine the ambiguous tail further, but requires a | |
| frequency table derived from training data. For the specific | |
| two-encoding problem the structural evidence above is nearly as | |
| discriminating and needs no corpus. | |
| --]] | |
-- Module table: the public functions (detect, detect_file) are attached to it
-- below and it is returned at the end of the file.
| local M = {} | |
| -- ── grammar validators ────────────────────────────────────────────────────── | |
--- Check whether a byte string is well-formed UTF-8.
-- Rejects stray continuation bytes, the always-overlong leads 0xC0/0xC1,
-- leads above 0xF4, truncated sequences, overlong 3- and 4-byte forms,
-- UTF-16 surrogates (U+D800-U+DFFF) and code points above U+10FFFF.
-- @param s string of raw bytes
-- @return true if every byte belongs to a valid sequence, false otherwise
local function is_valid_utf8(s)
  local total = #s
  local pos = 1

  while pos <= total do
    local lead = s:byte(pos)

    -- Sequence width is fully determined by the lead byte.
    local width
    if lead < 0x80 then
      width = 1
    elseif lead >= 0xC2 and lead <= 0xDF then
      width = 2
    elseif lead >= 0xE0 and lead <= 0xEF then
      width = 3
    elseif lead >= 0xF0 and lead <= 0xF4 then
      width = 4
    else
      -- 0x80-0xBF (continuation-only), 0xC0-0xC1 (overlong), 0xF5-0xFF
      return false
    end

    local last = pos + width - 1
    if last > total then
      return false -- sequence runs past the end of the string
    end

    -- Every byte after the lead must be a continuation byte (10xxxxxx).
    for j = pos + 1, last do
      local cont = s:byte(j)
      if not (cont >= 0x80 and cont <= 0xBF) then
        return false
      end
    end

    -- The second byte is further restricted for four specific leads.
    -- (E0/ED only occur with width 3, F0/F4 only with width 4.)
    if width >= 3 then
      local second = s:byte(pos + 1)
      if lead == 0xE0 and second < 0xA0 then return false end -- overlong
      if lead == 0xED and second > 0x9F then return false end -- surrogates
      if lead == 0xF0 and second < 0x90 then return false end -- overlong
      if lead == 0xF4 and second > 0x8F then return false end -- > U+10FFFF
    end

    pos = last + 1
  end

  return true
end
--- Check whether a byte string parses cleanly as Shift-JIS.
-- Single-byte characters are 0x00-0x7F and half-width katakana 0xA1-0xDF.
-- Double-byte characters have a lead in 0x81-0x9F or 0xE0-0xFC followed by a
-- trail in 0x40-0x7E or 0x80-0xFC. Everything else is rejected.
-- @param s string of raw bytes
-- @return true if the whole string parses, false otherwise
local function is_valid_sjis(s)
  local total = #s
  local pos = 1

  while pos <= total do
    local lead = s:byte(pos)
    local single = lead <= 0x7F or (lead >= 0xA1 and lead <= 0xDF)

    if single then
      pos = pos + 1
    else
      local double_lead = (lead >= 0x81 and lead <= 0x9F)
        or (lead >= 0xE0 and lead <= 0xFC)
      if not double_lead then
        -- 0x80, 0xA0 and 0xFD-0xFF are undefined in Shift-JIS
        return false
      end

      if pos == total then
        return false -- lead byte with nothing after it
      end

      -- Valid trails are 0x40-0x7E and 0x80-0xFC, i.e. anything in
      -- 0x40-0xFC except 0x7F.
      local trail = s:byte(pos + 1)
      if trail < 0x40 or trail == 0x7F or trail > 0xFC then
        return false
      end

      pos = pos + 2
    end
  end

  return true
end
| -- ── evidence scorers ───────────────────────────────────────────────────────── | |
| -- Each function walks s under its own encoding rules and sums evidence weights. | |
--- Sum weighted evidence that `s` is Japanese text encoded as UTF-8.
-- The string is walked by lead byte only; continuation bytes are not
-- validated here, so the result is only meaningful for input that has already
-- passed UTF-8 validation.
--
-- Weights:
--   +8  3-byte sequence decoding to U+3000-U+9FFF (CJK symbols, hiragana,
--       katakana, CJK unified ideographs) or U+FF00-U+FFEF (half-/full-width
--       forms). Every sequence with a lead in E3-E9 lands in the first range.
--   +2  any other 3-byte sequence
--   +1  2-byte or 4-byte sequence
-- A sequence that is cut off by the end of the string contributes nothing.
-- @param s string of raw bytes
-- @return non-negative integer score
local function utf8_evidence(s)
  local score, i, n = 0, 1, #s
  while i <= n do
    local b = s:byte(i)
    if b <= 0x7F then
      i = i + 1
    elseif b >= 0xC2 and b <= 0xDF then
      -- Only count the sequence if its continuation byte is present.
      if i + 1 <= n then score = score + 1 end
      i = i + 2
    elseif b >= 0xE0 and b <= 0xEF then
      if i + 2 <= n then
        local b2, b3 = s:byte(i + 1), s:byte(i + 2)
        -- Decode the code point: (b & 0x0F)<<12 | (b2 & 0x3F)<<6 | (b3 & 0x3F),
        -- written with arithmetic so no bit library is needed.
        local cp = (b % 16) * 4096 + (b2 % 64) * 64 + (b3 % 64)
        if (cp >= 0x3000 and cp <= 0x9FFF)
          or (cp >= 0xFF00 and cp <= 0xFFEF) then
          score = score + 8
        else
          score = score + 2
        end
      end
      i = i + 3
    elseif b >= 0xF0 and b <= 0xF4 then
      -- 0xF5-0xFF are never valid leads, so they are excluded here.
      if i + 3 <= n then score = score + 1 end
      i = i + 4
    else
      -- Stray continuation byte or invalid lead: skip it without scoring.
      i = i + 1
    end
  end
  return score
end
--- Sum weighted evidence that `s` is Shift-JIS text.
-- The string is walked by lead byte only; trail bytes are skipped without
-- being inspected.
--
-- Weights:
--   +3  half-width katakana 0xA1-0xDF (single byte)
--   +5  double-byte character with lead 0x81-0x9F (never a UTF-8 start byte)
--   +2  double-byte character with lead 0xE0-0xFC (weaker: overlaps the
--       UTF-8 3/4-byte lead range)
-- @param s string of raw bytes
-- @return non-negative integer score
local function sjis_evidence(s)
  local total, pos, len = 0, 1, #s

  while pos <= len do
    local code = s:byte(pos)

    -- Default: ASCII or an undefined byte — no evidence, advance one byte.
    local gain, step = 0, 1
    if code >= 0xA1 and code <= 0xDF then
      gain = 3
    elseif code >= 0x81 and code <= 0x9F then
      gain, step = 5, 2
    elseif code >= 0xE0 and code <= 0xFC then
      gain, step = 2, 2
    end

    total = total + gain
    pos = pos + step
  end

  return total
end
| -- ── public API ─────────────────────────────────────────────────────────────── | |
--- Guess whether a byte string is UTF-8 or Shift-JIS.
--
-- detect(s) → encoding, confidence
--
-- encoding  : "utf-8" | "shift-jis" | "ascii" | "unknown" | "ambiguous"
-- confidence: 0.0–1.0
--
-- "ascii"     — no bytes above 0x7F (including the empty string); both
--               encodings are identical here.
-- "unknown"   — neither encoding can parse the input without errors.
-- "ambiguous" — both parse cleanly with equal evidence (very rare in practice).
--
-- When only one encoding parses, it is returned with confidence 0.99. When
-- both parse, the encoding with the larger evidence score wins and the
-- confidence is its share of the combined score.
-- @param s string of raw bytes
-- @return encoding name (string), confidence (number)
function M.detect(s)
  -- Pure ASCII (or empty) input is identical under both encodings. Decimal
  -- escapes are used because "\x.." string escapes do not exist in Lua 5.1,
  -- where the pattern would silently turn into a set of ASCII characters.
  if not s:find("[\128-\255]") then return "ascii", 1.0 end

  -- An explicit UTF-8 BOM (EF BB BF) is taken as decisive.
  if s:sub(1, 3) == "\239\187\191" then
    return "utf-8", 1.0
  end

  -- Layer 1: grammar validation. A single invalid sequence rules an
  -- encoding out.
  local ok8 = is_valid_utf8(s)
  local okSJ = is_valid_sjis(s)
  if ok8 and not okSJ then return "utf-8", 0.99 end
  if okSJ and not ok8 then return "shift-jis", 0.99 end
  if not ok8 and not okSJ then return "unknown", 0.0 end

  -- Both parse: compare the weighted evidence of each interpretation.
  local s8 = utf8_evidence(s)
  local sSJ = sjis_evidence(s)
  local total = s8 + sSJ
  if total == 0 then return "ambiguous", 0.5 end

  if s8 > sSJ then
    return "utf-8", s8 / total
  elseif sSJ > s8 then
    return "shift-jis", sSJ / total
  else
    return "ambiguous", 0.5
  end
end
--- Detect the encoding of a file's contents.
--
-- detect_file(path) → encoding, confidence (or nil, errmsg)
--
-- The file is opened in binary mode and read in full, then handed to
-- M.detect. Failure to open or to read the file is reported as nil plus an
-- error message rather than raised.
-- @param path file system path to read
-- @return encoding name and confidence, or nil and an error message
function M.detect_file(path)
  local f, open_err = io.open(path, "rb")
  if not f then return nil, open_err end

  -- read can fail after a successful open (e.g. the path is a directory);
  -- in that case it returns nil plus a message instead of a string.
  local data, read_err = f:read("*a")
  f:close()
  if not data then
    return nil, read_err or "read error"
  end

  return M.detect(data)
end
-- Return the module table carrying detect and detect_file.
| return M |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment