Skip to content

Instantly share code, notes, and snippets.

@kjk
Last active March 28, 2026 07:57
Show Gist options
  • Select an option

  • Save kjk/bdbea9d90c3bb0454fbe26353c521bfd to your computer and use it in GitHub Desktop.

Select an option

Save kjk/bdbea9d90c3bb0454fbe26353c521bfd to your computer and use it in GitHub Desktop.
is_CJK benchmark
// --- Idea 1: Pre-compiled regex ---
// One character class covering the same code point ranges that the other
// implementations in this file test. The `u` flag makes the class operate on
// whole code points, so the supplementary-plane ranges (U+20000 and above)
// are written directly as \u{...} escapes; hand-expanding them into
// surrogate-pair alternations is error-prone and easy to get out of sync
// with the numeric range checks.
const cjkRegex =
  /[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\uf900-\ufaff\uff00-\uffef\u{20000}-\u{2a6df}\u{2a700}-\u{2b73f}\u{2b740}-\u{2b81f}\u{2b820}-\u{2ceaf}\u{2ceb0}-\u{2ebef}\u{2f800}-\u{2fa1f}\u{30000}-\u{3134f}]/u;

/**
 * Reports whether a string contains at least one CJK character, using a
 * pre-compiled regular expression.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} `true` if any code point of `s` falls in one of the
 *   CJK ranges listed in `cjkRegex`, otherwise `false`.
 */
export function isCJKRegex(s) {
  // The regex has no `g`/`y` flag, so `test` keeps no state between calls.
  return cjkRegex.test(s);
}
// --- Idea 2: Early exit if char < 0x3000 ---
/**
 * Reports whether a string contains at least one CJK character, skipping
 * quickly over code points below U+3000 (none of the tested ranges start
 * lower than that).
 *
 * @param {string} s - Text to scan; iterated code point by code point.
 * @returns {boolean} `true` as soon as a code point in one of the ranges
 *   below is found, otherwise `false`.
 */
export function isCJKEarly(s) {
  for (const symbol of s) {
    const cp = symbol.codePointAt(0);

    // Fast path: Latin and most other scripts live below U+3000.
    if (cp >= 0x3000) {
      const inBmpRange =
        (cp >= 0x3000 && cp <= 0x303f) ||
        (cp >= 0x3040 && cp <= 0x309f) ||
        (cp >= 0x30a0 && cp <= 0x30ff) ||
        (cp >= 0x3400 && cp <= 0x4dbf) ||
        (cp >= 0x4e00 && cp <= 0x9fff) ||
        (cp >= 0xac00 && cp <= 0xd7af) ||
        (cp >= 0xf900 && cp <= 0xfaff) ||
        (cp >= 0xff00 && cp <= 0xffef);
      if (inBmpRange) {
        return true;
      }

      const inSupplementaryRange =
        (cp >= 0x20000 && cp <= 0x2a6df) ||
        (cp >= 0x2a700 && cp <= 0x2b73f) ||
        (cp >= 0x2b740 && cp <= 0x2b81f) ||
        (cp >= 0x2b820 && cp <= 0x2ceaf) ||
        (cp >= 0x2ceb0 && cp <= 0x2ebef) ||
        (cp >= 0x2f800 && cp <= 0x2fa1f) ||
        (cp >= 0x30000 && cp <= 0x3134f);
      if (inSupplementaryRange) {
        return true;
      }
    }
  }
  return false;
}
// --- Idea 2b: Early exit + charCodeAt index loop (no for...of) ---
/**
 * Reports whether a string contains at least one CJK character, walking the
 * string by UTF-16 code unit with `charCodeAt` instead of `for...of`.
 *
 * Code units below U+3000 are skipped immediately. High surrogates
 * (0xD800-0xDBFF) are all >= 0x3000, so the surrogate-pair handling must run
 * after that fast path, not inside it; otherwise supplementary-plane
 * characters are never decoded.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} `true` if any code point of `s` falls in one of the
 *   CJK ranges below, otherwise `false`. Unpaired surrogates never match.
 */
export function isCJKEarlyFast(s) {
  const len = s.length;
  for (let i = 0; i < len; i++) {
    const unit = s.charCodeAt(i);

    // Fast path: nothing of interest lives below U+3000.
    if (unit < 0x3000) continue;

    // High surrogate: try to combine it with the following low surrogate.
    if (unit >= 0xd800 && unit <= 0xdbff) {
      if (i + 1 < len) {
        const lo = s.charCodeAt(i + 1);
        if (lo >= 0xdc00 && lo <= 0xdfff) {
          const cp = ((unit - 0xd800) << 10) + (lo - 0xdc00) + 0x10000;
          if (
            (cp >= 0x20000 && cp <= 0x2a6df) ||
            (cp >= 0x2a700 && cp <= 0x2b73f) ||
            (cp >= 0x2b740 && cp <= 0x2b81f) ||
            (cp >= 0x2b820 && cp <= 0x2ceaf) ||
            (cp >= 0x2ceb0 && cp <= 0x2ebef) ||
            (cp >= 0x30000 && cp <= 0x3134f) ||
            (cp >= 0x2f800 && cp <= 0x2fa1f)
          ) {
            return true;
          }
          // The low surrogate belongs to this pair; do not revisit it.
          i++;
        }
      }
      // Either a non-CJK supplementary character or an unpaired high
      // surrogate; neither can match a BMP range.
      continue;
    }

    if (
      (unit >= 0x4e00 && unit <= 0x9fff) ||
      (unit >= 0x3400 && unit <= 0x4dbf) ||
      (unit >= 0x3000 && unit <= 0x303f) ||
      (unit >= 0x3040 && unit <= 0x309f) ||
      (unit >= 0x30a0 && unit <= 0x30ff) ||
      (unit >= 0xac00 && unit <= 0xd7af) ||
      (unit >= 0xf900 && unit <= 0xfaff) ||
      (unit >= 0xff00 && unit <= 0xffef)
    ) {
      return true;
    }
  }
  return false;
}
// --- Idea 3: Binary search over sorted ranges ---
// Inclusive [first, last] code point pairs stored back to back in one flat
// typed array, ordered by ascending `first`. The ranges do not overlap.
const cjkRanges = Uint32Array.of(
  0x3000, 0x303f,
  0x3040, 0x309f,
  0x30a0, 0x30ff,
  0x3400, 0x4dbf,
  0x4e00, 0x9fff,
  0xac00, 0xd7af,
  0xf900, 0xfaff,
  0xff00, 0xffef,
  0x20000, 0x2a6df,
  0x2a700, 0x2b73f,
  0x2b740, 0x2b81f,
  0x2b820, 0x2ceaf,
  0x2ceb0, 0x2ebef,
  0x2f800, 0x2fa1f,
  0x30000, 0x3134f,
);

// Number of [first, last] pairs in the table (two array slots per pair).
const cjkRangeCount = cjkRanges.length >>> 1;

/**
 * Binary-searches `cjkRanges` for a range containing the given code point.
 *
 * @param {number} c - Code point to look up.
 * @returns {boolean} `true` if `c` lies inside one of the table's inclusive
 *   ranges, otherwise `false`.
 */
function inCJKRange(c) {
  let left = 0;
  let right = cjkRangeCount - 1;

  while (left <= right) {
    const pair = (left + right) >>> 1;
    const slot = pair * 2;

    if (c > cjkRanges[slot + 1]) {
      // Past the end of this range: keep searching the upper half.
      left = pair + 1;
      continue;
    }
    if (c < cjkRanges[slot]) {
      // Before the start of this range: keep searching the lower half.
      right = pair - 1;
      continue;
    }
    return true;
  }

  return false;
}
/**
 * Reports whether a string contains at least one CJK character by looking
 * each code point up with `inCJKRange`.
 *
 * @param {string} s - Text to scan; iterated code point by code point.
 * @returns {boolean} `true` as soon as `inCJKRange` accepts a code point,
 *   otherwise `false`.
 */
export function isCJKBisect(s) {
  for (const symbol of s) {
    const cp = symbol.codePointAt(0);
    if (inCJKRange(cp)) {
      return true;
    }
  }
  return false;
}
// --- Idea 4: Unrolled binary search (no array, pure if/else) ---
// The 15 ranges, sorted by start, laid out as the binary search tree that
// the branches below follow (number = index of the range in sorted order):
//
//   7: ff00-ffef
//   ├─ 3: 3400-4dbf
//   │  ├─ 1: 3040-309f
//   │  │  ├─ 0: 3000-303f
//   │  │  └─ 2: 30a0-30ff
//   │  └─ 5: ac00-d7af
//   │     ├─ 4: 4e00-9fff
//   │     └─ 6: f900-faff
//   └─ 11: 2b820-2ceaf
//      ├─ 9: 2a700-2b73f
//      │  ├─ 8: 20000-2a6df
//      │  └─ 10: 2b740-2b81f
//      └─ 13: 2f800-2fa1f
//         ├─ 12: 2ceb0-2ebef
//         └─ 14: 30000-3134f
/**
 * Tests a code point against the CJK ranges using a hand-unrolled binary
 * search: each node returns `true` when the value is neither below its
 * range's start nor above its range's end.
 *
 * @param {number} c - Code point to classify.
 * @returns {boolean} `true` if `c` lies inside one of the 15 inclusive
 *   ranges in the tree above, otherwise `false`.
 */
function inCJKUnrolled(c) {
  // Nothing in the tree starts below U+3000.
  if (c < 0x3000) return false;

  // Left subtree of the root (ranges 0-6).
  if (c < 0xff00) {
    if (c < 0x3400) {
      if (c < 0x3040) return c <= 0x303f;
      if (c > 0x309f) return c >= 0x30a0 && c <= 0x30ff;
      return true; // 3040-309f
    }
    if (c > 0x4dbf) {
      if (c < 0xac00) return c >= 0x4e00 && c <= 0x9fff;
      if (c > 0xd7af) return c >= 0xf900 && c <= 0xfaff;
      return true; // ac00-d7af
    }
    return true; // 3400-4dbf
  }

  // Right subtree of the root (ranges 8-14).
  if (c > 0xffef) {
    if (c < 0x2b820) {
      if (c < 0x2a700) return c >= 0x20000 && c <= 0x2a6df;
      if (c > 0x2b73f) return c >= 0x2b740 && c <= 0x2b81f;
      return true; // 2a700-2b73f
    }
    if (c > 0x2ceaf) {
      if (c < 0x2f800) return c >= 0x2ceb0 && c <= 0x2ebef;
      if (c > 0x2fa1f) return c >= 0x30000 && c <= 0x3134f;
      return true; // 2f800-2fa1f
    }
    return true; // 2b820-2ceaf
  }

  return true; // ff00-ffef
}
/**
 * Reports whether a string contains at least one CJK character by passing
 * each code point to `inCJKUnrolled`.
 *
 * @param {string} s - Text to scan; iterated code point by code point.
 * @returns {boolean} `true` as soon as `inCJKUnrolled` accepts a code point,
 *   otherwise `false`.
 */
export function isCJKUnrolled(s) {
  for (const symbol of s) {
    const cp = symbol.codePointAt(0);
    if (inCJKUnrolled(cp)) {
      return true;
    }
  }
  return false;
}
// --- Original with for loop instead of for...of ---
/**
 * Reports whether a string contains at least one CJK character, walking the
 * string by index with `charCodeAt` and decoding surrogate pairs by hand.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} `true` if any decoded code point falls in one of the
 *   ranges below, otherwise `false`. An unpaired surrogate is tested as its
 *   raw code unit, which matches none of the ranges.
 */
export function isCJKForLoop(s) {
  const total = s.length;
  let pos = 0;

  while (pos < total) {
    let cp = s.charCodeAt(pos);
    pos += 1;

    // A high surrogate followed by a low surrogate encodes one code point
    // above U+FFFF; combine the two units and step past the second one.
    if (cp >= 0xd800 && cp <= 0xdbff && pos < total) {
      const trail = s.charCodeAt(pos);
      if (trail >= 0xdc00 && trail <= 0xdfff) {
        cp = ((cp - 0xd800) << 10) + (trail - 0xdc00) + 0x10000;
        pos += 1;
      }
    }

    const matched =
      (cp >= 0x4e00 && cp <= 0x9fff) ||
      (cp >= 0x3400 && cp <= 0x4dbf) ||
      (cp >= 0x20000 && cp <= 0x2a6df) ||
      (cp >= 0x2a700 && cp <= 0x2b73f) ||
      (cp >= 0x2b740 && cp <= 0x2b81f) ||
      (cp >= 0x2b820 && cp <= 0x2ceaf) ||
      (cp >= 0x2ceb0 && cp <= 0x2ebef) ||
      (cp >= 0x30000 && cp <= 0x3134f) ||
      (cp >= 0xf900 && cp <= 0xfaff) ||
      (cp >= 0x2f800 && cp <= 0x2fa1f) ||
      (cp >= 0x3000 && cp <= 0x303f) ||
      (cp >= 0x3040 && cp <= 0x309f) ||
      (cp >= 0x30a0 && cp <= 0x30ff) ||
      (cp >= 0xac00 && cp <= 0xd7af) ||
      (cp >= 0xff00 && cp <= 0xffef);
    if (matched) {
      return true;
    }
  }

  return false;
}
// --- Original ---
/**
 * Baseline implementation: reports whether a string contains at least one
 * CJK character by testing every code point against each range in turn.
 *
 * @param {string} s - Text to scan; iterated code point by code point.
 * @returns {boolean} `true` as soon as a code point in one of the ranges
 *   below is found, otherwise `false`.
 */
export function isCJK(s) {
  for (const glyph of s) {
    const cp = glyph.codePointAt(0);
    const matched =
      (0x3000 <= cp && cp <= 0x303f) || // CJK Symbols and Punctuation
      (0x3040 <= cp && cp <= 0x309f) || // Hiragana
      (0x30a0 <= cp && cp <= 0x30ff) || // Katakana
      (0x3400 <= cp && cp <= 0x4dbf) || // CJK Unified Ideographs Extension A
      (0x4e00 <= cp && cp <= 0x9fff) || // CJK Unified Ideographs
      (0xac00 <= cp && cp <= 0xd7af) || // Hangul Syllables
      (0xf900 <= cp && cp <= 0xfaff) || // CJK Compatibility Ideographs
      (0xff00 <= cp && cp <= 0xffef) || // Halfwidth and Fullwidth Forms
      (0x20000 <= cp && cp <= 0x2a6df) || // Extension B
      (0x2a700 <= cp && cp <= 0x2b73f) || // Extension C
      (0x2b740 <= cp && cp <= 0x2b81f) || // Extension D
      (0x2b820 <= cp && cp <= 0x2ceaf) || // Extension E
      (0x2ceb0 <= cp && cp <= 0x2ebef) || // Extension F
      (0x2f800 <= cp && cp <= 0x2fa1f) || // Compatibility Ideographs Supplement
      (0x30000 <= cp && cp <= 0x3134f); // Extension G
    if (matched) {
      return true;
    }
  }
  return false;
}
// --- Benchmark ---
/**
 * Times repeated calls to `fn` and logs the result to the console.
 *
 * `fn` is first called 10,000 times untimed as a warmup, then `iterations`
 * times between two `performance.now()` readings.
 *
 * @param {string} name - Label printed at the start of the result line.
 * @param {() => unknown} fn - Function under test; its return value is ignored.
 * @param {number} [iterations=1_000_000] - Number of timed calls.
 * @returns {void}
 */
function bench(name, fn, iterations = 1_000_000) {
  // Untimed warmup calls before measurement starts.
  let warmupCalls = 0;
  while (warmupCalls < 10_000) {
    fn();
    warmupCalls++;
  }

  const startedAt = performance.now();
  let timedCalls = 0;
  while (timedCalls < iterations) {
    fn();
    timedCalls++;
  }
  const elapsed = performance.now() - startedAt;

  // `elapsed` is in milliseconds; scale to calls per second.
  const opsPerSec = ((iterations / elapsed) * 1000).toFixed(0);
  console.log(`${name}: ${elapsed.toFixed(2)} ms (${opsPerSec} ops/sec)`);
}
// Sample inputs: all-CJK, all-Latin, mixed, and single-character strings.
const cjkStr = "这是中文测试";
const latinStr = "Hello World, this is a test string";
const mixedStr = "Hello 世界 World";
const singleCJK = "字";
const singleLatin = "A";

// Each case is benchmarked against every implementation, in this order.
const benchCases = [
  { label: "single CJK char", input: singleCJK },
  { label: "single latin char", input: singleLatin },
  { label: "CJK string", input: cjkStr },
  { label: "latin string", input: latinStr },
  { label: "mixed string", input: mixedStr },
];

benchCases.forEach(({ label, input }) => {
  console.log(`\n--- ${label}: "${input}" ---`);
  // One arrow literal per implementation so each keeps its own call site.
  bench("isCJK (original) ", () => isCJK(input));
  bench("isCJKRegex ", () => isCJKRegex(input));
  bench("isCJKEarly ", () => isCJKEarly(input));
  bench("isCJKBisect ", () => isCJKBisect(input));
  bench("isCJKUnrolled ", () => isCJKUnrolled(input));
  bench("isCJKEarlyFast ", () => isCJKEarlyFast(input));
  bench("isCJKForLoop ", () => isCJKForLoop(input));
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment