Last active
March 28, 2026 07:57
-
-
Save kjk/bdbea9d90c3bb0454fbe26353c521bfd to your computer and use it in GitHub Desktop.
is_CJK benchmark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// --- Idea 1: Pre-compiled regex ---
// The `u` flag lets the character class use code-point escapes (\u{...}), so
// the supplementary-plane ranges are written exactly like the numeric ranges
// used by the other implementations in this file. The previous hand-expanded
// surrogate-pair alternation skipped several high surrogates (for example
// U+23800 and U+2B820 were not matched) and matched whole 0x400-wide
// surrogate rows past the end of some ranges.
const cjkRegex =
  /[\u3000-\u303f\u3040-\u309f\u30a0-\u30ff\u3400-\u4dbf\u4e00-\u9fff\uac00-\ud7af\uf900-\ufaff\uff00-\uffef\u{20000}-\u{2a6df}\u{2a700}-\u{2b73f}\u{2b740}-\u{2b81f}\u{2b820}-\u{2ceaf}\u{2ceb0}-\u{2ebef}\u{2f800}-\u{2fa1f}\u{30000}-\u{3134f}]/u;

/**
 * Reports whether `s` contains at least one code point inside the CJK ranges
 * listed in `cjkRegex`.
 *
 * The regex has no `g`/`y` flag, so `test` keeps no state between calls.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} `true` if any code point of `s` is in a listed range.
 */
export function isCJKRegex(s) {
  return cjkRegex.test(s);
}
// --- Idea 2: Early exit if char < 0x3000 ---
/**
 * Reports whether `s` contains a code point in one of the CJK ranges.
 *
 * Code points below 0x3000 are skipped before any range comparison, because
 * every accepted range starts at 0x3000 or higher.
 *
 * @param {string} s - Text to scan, iterated code point by code point.
 * @returns {boolean} `true` as soon as one matching code point is found.
 */
export function isCJKEarly(s) {
  for (const symbol of s) {
    const cp = symbol.codePointAt(0);

    // Fast path: nothing below 0x3000 can match.
    if (cp < 0x3000) {
      continue;
    }

    // Ranges that fit in a single UTF-16 code unit.
    const inBmpRange =
      (cp >= 0x3000 && cp <= 0x303f) ||
      (cp >= 0x3040 && cp <= 0x309f) ||
      (cp >= 0x30a0 && cp <= 0x30ff) ||
      (cp >= 0x3400 && cp <= 0x4dbf) ||
      (cp >= 0x4e00 && cp <= 0x9fff) ||
      (cp >= 0xac00 && cp <= 0xd7af) ||
      (cp >= 0xf900 && cp <= 0xfaff) ||
      (cp >= 0xff00 && cp <= 0xffef);
    if (inBmpRange) {
      return true;
    }

    // Ranges above 0xffff.
    const inSupplementaryRange =
      (cp >= 0x20000 && cp <= 0x2a6df) ||
      (cp >= 0x2a700 && cp <= 0x2b73f) ||
      (cp >= 0x2b740 && cp <= 0x2b81f) ||
      (cp >= 0x2b820 && cp <= 0x2ceaf) ||
      (cp >= 0x2ceb0 && cp <= 0x2ebef) ||
      (cp >= 0x2f800 && cp <= 0x2fa1f) ||
      (cp >= 0x30000 && cp <= 0x3134f);
    if (inSupplementaryRange) {
      return true;
    }
  }
  return false;
}
// --- Idea 2b: Early exit + charCodeAt index loop (no for...of) ---
/**
 * Reports whether `s` contains a code point in one of the CJK ranges, walking
 * UTF-16 code units by index instead of using the string iterator.
 *
 * Fix: the surrogate-pair handling used to sit inside the `c < 0x3000`
 * branch. High surrogates are 0xd800-0xdbff, so that branch could never see
 * one and supplementary-plane characters (e.g. U+20000) were never detected.
 * Surrogates are now decoded after the fast-path check.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} `true` as soon as one matching code point is found.
 */
export function isCJKEarlyFast(s) {
  const len = s.length;
  for (let i = 0; i < len; i++) {
    const c = s.charCodeAt(i);

    // Fast path: every accepted range starts at 0x3000 or higher.
    if (c < 0x3000) continue;

    if (c >= 0xd800 && c <= 0xdbff) {
      // High surrogate: combine with the following low surrogate, if any.
      if (i + 1 < len) {
        const lo = s.charCodeAt(i + 1);
        if (lo >= 0xdc00 && lo <= 0xdfff) {
          const cp = ((c - 0xd800) << 10) + (lo - 0xdc00) + 0x10000;
          if (
            (cp >= 0x20000 && cp <= 0x2a6df) ||
            (cp >= 0x2a700 && cp <= 0x2b73f) ||
            (cp >= 0x2b740 && cp <= 0x2b81f) ||
            (cp >= 0x2b820 && cp <= 0x2ceaf) ||
            (cp >= 0x2ceb0 && cp <= 0x2ebef) ||
            (cp >= 0x2f800 && cp <= 0x2fa1f) ||
            (cp >= 0x30000 && cp <= 0x3134f)
          ) {
            return true;
          }
          // Skip the low surrogate that was just consumed.
          i++;
        }
      }
      // A surrogate code unit is never inside one of the single-unit ranges.
      continue;
    }

    if (
      (c >= 0x3000 && c <= 0x303f) ||
      (c >= 0x3040 && c <= 0x309f) ||
      (c >= 0x30a0 && c <= 0x30ff) ||
      (c >= 0x3400 && c <= 0x4dbf) ||
      (c >= 0x4e00 && c <= 0x9fff) ||
      (c >= 0xac00 && c <= 0xd7af) ||
      (c >= 0xf900 && c <= 0xfaff) ||
      (c >= 0xff00 && c <= 0xffef)
    ) {
      return true;
    }
  }
  return false;
}
// --- Idea 3: Binary search over sorted ranges ---
// Inclusive [start, end] pairs stored back to back in one flat typed array,
// ordered by ascending start: element 2*k is the start of range k and element
// 2*k + 1 is its end.
const cjkRanges = new Uint32Array([
  // Single-code-unit ranges (<= 0xffff).
  0x3000, 0x303f,
  0x3040, 0x309f,
  0x30a0, 0x30ff,
  0x3400, 0x4dbf,
  0x4e00, 0x9fff,
  0xac00, 0xd7af,
  0xf900, 0xfaff,
  0xff00, 0xffef,
  // Ranges above 0xffff.
  0x20000, 0x2a6df,
  0x2a700, 0x2b73f,
  0x2b740, 0x2b81f,
  0x2b820, 0x2ceaf,
  0x2ceb0, 0x2ebef,
  0x2f800, 0x2fa1f,
  0x30000, 0x3134f,
]);

// Number of [start, end] pairs held in `cjkRanges`.
const cjkRangeCount = cjkRanges.length >>> 1;

/**
 * Binary-searches `cjkRanges` for a range containing `c`.
 *
 * @param {number} c - Code point to look up.
 * @returns {boolean} `true` if `c` lies inside one of the inclusive ranges.
 */
function inCJKRange(c) {
  // Search window over range indices, half-open: [first, pastLast).
  let first = 0;
  let pastLast = cjkRangeCount;

  while (first < pastLast) {
    const probe = (first + pastLast) >>> 1;
    const start = cjkRanges[probe * 2];
    const end = cjkRanges[probe * 2 + 1];

    if (c < start) {
      pastLast = probe;
    } else if (c > end) {
      first = probe + 1;
    } else {
      return true;
    }
  }
  return false;
}
/**
 * Reports whether `s` contains a code point accepted by `inCJKRange`.
 *
 * @param {string} s - Text to scan, iterated code point by code point.
 * @returns {boolean} `true` as soon as one matching code point is found.
 */
export function isCJKBisect(s) {
  for (const symbol of s) {
    const cp = symbol.codePointAt(0);
    if (inCJKRange(cp)) {
      return true;
    }
  }
  return false;
}
// --- Idea 4: Unrolled binary search (no array, pure if/else) ---
// The 15 inclusive ranges, sorted by start, form a balanced search tree that
// is written out as nested comparisons. Index = position in sorted order:
//
//   root 7: ff00-ffef
//     left 3: 3400-4dbf
//       left 1: 3040-309f
//         left  0: 3000-303f
//         right 2: 30a0-30ff
//       right 5: ac00-d7af
//         left  4: 4e00-9fff
//         right 6: f900-faff
//     right 11: 2b820-2ceaf
//       left 9: 2a700-2b73f
//         left   8: 20000-2a6df
//         right 10: 2b740-2b81f
//       right 13: 2f800-2fa1f
//         left  12: 2ceb0-2ebef
//         right 14: 30000-3134f
/**
 * Tests a single code point against the CJK ranges using a hand-unrolled
 * binary search.
 *
 * @param {number} c - Code point to test.
 * @returns {boolean} `true` if `c` lies inside one of the ranges.
 */
function inCJKUnrolled(c) {
  // Below the lowest range start.
  if (c < 0x3000) return false;

  // Left of the root (ranges 0-6).
  if (c < 0xff00) {
    // Left of node 3 (ranges 0-2).
    if (c < 0x3400) {
      if (c < 0x3040) return c <= 0x303f;
      if (c > 0x309f) return c >= 0x30a0 && c <= 0x30ff;
      return true; // 3040-309f
    }
    // Right of node 3 (ranges 4-6).
    if (c > 0x4dbf) {
      if (c < 0xac00) return c >= 0x4e00 && c <= 0x9fff;
      if (c > 0xd7af) return c >= 0xf900 && c <= 0xfaff;
      return true; // ac00-d7af
    }
    return true; // 3400-4dbf
  }

  // Right of the root (ranges 8-14).
  if (c > 0xffef) {
    // Left of node 11 (ranges 8-10).
    if (c < 0x2b820) {
      if (c < 0x2a700) return c >= 0x20000 && c <= 0x2a6df;
      if (c > 0x2b73f) return c >= 0x2b740 && c <= 0x2b81f;
      return true; // 2a700-2b73f
    }
    // Right of node 11 (ranges 12-14).
    if (c > 0x2ceaf) {
      if (c < 0x2f800) return c >= 0x2ceb0 && c <= 0x2ebef;
      if (c > 0x2fa1f) return c >= 0x30000 && c <= 0x3134f;
      return true; // 2f800-2fa1f
    }
    return true; // 2b820-2ceaf
  }

  return true; // ff00-ffef
}
/**
 * Reports whether `s` contains a code point accepted by `inCJKUnrolled`.
 *
 * @param {string} s - Text to scan, iterated code point by code point.
 * @returns {boolean} `true` as soon as one matching code point is found.
 */
export function isCJKUnrolled(s) {
  for (const symbol of s) {
    const cp = symbol.codePointAt(0);
    if (inCJKUnrolled(cp)) {
      return true;
    }
  }
  return false;
}
// --- Original with for loop instead of for...of ---
/**
 * Reports whether `s` contains a code point in one of the CJK ranges, walking
 * UTF-16 code units by index and combining surrogate pairs by hand.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} `true` as soon as one matching code point is found.
 */
export function isCJKForLoop(s) {
  let pos = 0;
  while (pos < s.length) {
    let cp = s.charCodeAt(pos);
    pos += 1;

    // A high surrogate followed by a low surrogate encodes one code point
    // above 0xffff; merge the two units and step over the second one.
    const isHighSurrogate = cp >= 0xd800 && cp <= 0xdbff;
    if (isHighSurrogate && pos < s.length) {
      const trail = s.charCodeAt(pos);
      if (trail >= 0xdc00 && trail <= 0xdfff) {
        cp = ((cp - 0xd800) << 10) + (trail - 0xdc00) + 0x10000;
        pos += 1;
      }
    }

    const matches =
      (cp >= 0x3000 && cp <= 0x303f) ||
      (cp >= 0x3040 && cp <= 0x309f) ||
      (cp >= 0x30a0 && cp <= 0x30ff) ||
      (cp >= 0x3400 && cp <= 0x4dbf) ||
      (cp >= 0x4e00 && cp <= 0x9fff) ||
      (cp >= 0xac00 && cp <= 0xd7af) ||
      (cp >= 0xf900 && cp <= 0xfaff) ||
      (cp >= 0xff00 && cp <= 0xffef) ||
      (cp >= 0x20000 && cp <= 0x2a6df) ||
      (cp >= 0x2a700 && cp <= 0x2b73f) ||
      (cp >= 0x2b740 && cp <= 0x2b81f) ||
      (cp >= 0x2b820 && cp <= 0x2ceaf) ||
      (cp >= 0x2ceb0 && cp <= 0x2ebef) ||
      (cp >= 0x2f800 && cp <= 0x2fa1f) ||
      (cp >= 0x30000 && cp <= 0x3134f);
    if (matches) {
      return true;
    }
  }
  return false;
}
// --- Original ---
/**
 * Baseline implementation: reports whether `s` contains a code point in one
 * of the CJK ranges by comparing every code point against every range.
 *
 * @param {string} s - Text to scan, iterated code point by code point.
 * @returns {boolean} `true` as soon as one matching code point is found.
 */
export function isCJK(s) {
  for (const symbol of s) {
    const cp = symbol.codePointAt(0);

    // Ranges listed in ascending order; all bounds are inclusive.
    const matches =
      (cp >= 0x3000 && cp <= 0x303f) ||
      (cp >= 0x3040 && cp <= 0x309f) ||
      (cp >= 0x30a0 && cp <= 0x30ff) ||
      (cp >= 0x3400 && cp <= 0x4dbf) ||
      (cp >= 0x4e00 && cp <= 0x9fff) ||
      (cp >= 0xac00 && cp <= 0xd7af) ||
      (cp >= 0xf900 && cp <= 0xfaff) ||
      (cp >= 0xff00 && cp <= 0xffef) ||
      (cp >= 0x20000 && cp <= 0x2a6df) ||
      (cp >= 0x2a700 && cp <= 0x2b73f) ||
      (cp >= 0x2b740 && cp <= 0x2b81f) ||
      (cp >= 0x2b820 && cp <= 0x2ceaf) ||
      (cp >= 0x2ceb0 && cp <= 0x2ebef) ||
      (cp >= 0x2f800 && cp <= 0x2fa1f) ||
      (cp >= 0x30000 && cp <= 0x3134f);

    if (matches) {
      return true;
    }
  }
  return false;
}
// --- Benchmark ---
/**
 * Times repeated calls of `fn` and logs one summary line.
 *
 * `fn` is first called 10,000 times untimed as a warmup, then `iterations`
 * times between two `performance.now()` readings. The logged line has the
 * form `<name>: <elapsed> ms (<ops> ops/sec)`.
 *
 * @param {string} name - Label printed at the start of the summary line.
 * @param {() => unknown} fn - Function under test; called with no arguments.
 * @param {number} [iterations=1_000_000] - Number of timed calls.
 * @returns {void}
 */
function bench(name, fn, iterations = 1_000_000) {
  // Warmup calls are not included in the measurement.
  let warmupCalls = 0;
  while (warmupCalls < 10_000) {
    fn();
    warmupCalls += 1;
  }

  const startedAt = performance.now();
  let timedCalls = 0;
  while (timedCalls < iterations) {
    fn();
    timedCalls += 1;
  }
  const elapsed = performance.now() - startedAt;

  const elapsedText = elapsed.toFixed(2);
  const opsPerSec = ((iterations / elapsed) * 1000).toFixed(0);
  console.log(`${name}: ${elapsedText} ms (${opsPerSec} ops/sec)`);
}
// Sample inputs for the benchmark run.
const cjkStr = "这是中文测试";
const latinStr = "Hello World, this is a test string";
const mixedStr = "Hello 世界 World";
const singleCJK = "字";
const singleLatin = "A";

// Run every implementation against every sample input. Each implementation
// gets its own arrow function so that the call sites stay separate.
for (const { label, input } of [
  { label: "single CJK char", input: singleCJK },
  { label: "single latin char", input: singleLatin },
  { label: "CJK string", input: cjkStr },
  { label: "latin string", input: latinStr },
  { label: "mixed string", input: mixedStr },
]) {
  const heading = `\n--- ${label}: "${input}" ---`;
  console.log(heading);

  bench("isCJK (original) ", () => isCJK(input));
  bench("isCJKRegex ", () => isCJKRegex(input));
  bench("isCJKEarly ", () => isCJKEarly(input));
  bench("isCJKBisect ", () => isCJKBisect(input));
  bench("isCJKUnrolled ", () => isCJKUnrolled(input));
  bench("isCJKEarlyFast ", () => isCJKEarlyFast(input));
  bench("isCJKForLoop ", () => isCJKForLoop(input));
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment