Created
May 11, 2025 15:49
-
-
Save tinkerer-red/5604ad9bcf1a56dae1d93fc9bdf6effd to your computer and use it in GitHub Desktop.
A Latin-based implementation for counting the grapheme clusters (user-perceived characters) in a string.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// @func string_grapheme_length(_unicode)
/// @desc Counts the grapheme clusters (user-perceived characters) in a string by walking its
///       UTF-8 code points. This is a Latin/emoji oriented approximation of UAX #29 rather than a
///       full implementation: only the U+0300..U+036F combining marks, VS16, emoji skin-tone
///       modifiers, ZWJ joins and regional-indicator (flag) pairs are treated as cluster
///       continuations. Marks from other scripts (Hebrew, Devanagari, Thai, ...) are counted as
///       separate clusters. Results are memoised per input string.
/// @param {string} _unicode - A string potentially containing UTF-8 encoded content
/// @return {int} The number of visible grapheme clusters
function string_grapheme_length(_unicode) {
	static _cache = {};

	// An empty string has no clusters; skip hashing and buffer work entirely.
	if (_unicode == "") {
		return 0;
	}

	// If this has been parsed before, return the cached count.
	var _hash = variable_get_hash(_unicode);
	if (struct_exists_from_hash(_cache, _hash)) {
		return struct_get_from_hash(_cache, _hash);
	}

	static ZWJ = 0x200D;            // zero width joiner
	static VS16 = 0xFE0F;           // variation selector 16 (emoji presentation)
	static SKIN_MIN = 0x1F3FB;      // emoji skin-tone modifiers
	static SKIN_MAX = 0x1F3FF;
	static REGIONAL_MIN = 0x1F1E6;  // regional indicator symbols (flag halves)
	static REGIONAL_MAX = 0x1F1FF;
	static COMBINING_MIN = 0x0300;  // Combining Diacritical Marks block
	static COMBINING_MAX = 0x036F;

	// Copy the string into a buffer so it can be decoded one code point at a time.
	var _size = string_byte_length(_unicode);
	var _buff = buffer_create(_size + 1, buffer_fixed, 1);
	buffer_write(_buff, buffer_text, _unicode);
	buffer_seek(_buff, buffer_seek_start, 0);

	var _count = 0;
	var _last_cp = 0;
	var _at_start = true;      // the first code point always opens a cluster
	var _ri_pending = false;   // previous code point was a regional indicator still awaiting its partner

	// try/finally guarantees the buffer is released even if the decoder throws.
	try {
		while (buffer_tell(_buff) < _size) {
			var _cp = __uc_buffer_read_codepoint(_buff);

			var _is_extend =
				(_cp == VS16)
				|| (_cp >= COMBINING_MIN && _cp <= COMBINING_MAX)
				|| (_cp >= SKIN_MIN && _cp <= SKIN_MAX);

			// Regional indicators combine strictly in twos: the second of a pair joins the
			// first, and a third one starts a new flag instead of extending the previous one.
			var _is_regional = (_cp >= REGIONAL_MIN && _cp <= REGIONAL_MAX);
			var _completes_flag = _is_regional && _ri_pending;
			_ri_pending = _is_regional && !_ri_pending;

			// A ZWJ attaches to what precedes it and glues on whatever follows it.
			var _is_zwj_join = (!_at_start && _last_cp == ZWJ) || (_cp == ZWJ);

			// Open a new cluster unless this code point continues the current one.
			if (_at_start || (!_is_extend && !_completes_flag && !_is_zwj_join)) {
				_count += 1;
			}

			_last_cp = _cp;
			_at_start = false;
		}
	} finally {
		buffer_delete(_buff);
	}

	// Cache the result for subsequent calls with the same string.
	struct_set_from_hash(_cache, _hash, _count);
	return _count;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// @func test_string_length_utf8()
/// @desc Runs a table of labelled strings through string_grapheme_length() and prints a
///       pass/fail line per case to the debug output. Many cases rely on invisible code points
///       (ZWJ U+200D, VS16 U+FE0F, combining marks), so edit this table in an editor that
///       reveals hidden Unicode characters.
function test_string_length_utf8() {
	// Each entry is [label, input, expected grapheme count].
	var tests = [
		["simple text", "hello", 5],
		["basic emoji", "😀", 1],
		["emoji sequence", "👨‍👩‍👧‍👦", 1], // joined with U+200D
		["mixed text", "Hi 😀!", 5],
		["emoji + skin tone", "👍🏽", 1],
		["heart with ZWJ", "❤️", 1],
		["complex emoji", "🧑🏽‍🚒", 1], // joined with U+200D
		["flag", "🇺🇸", 1],
		// Multilingual and edge cases
		["Japanese hiragana", "こんにちは", 5],
		["Korean hangul", "안녕하세요", 5],
		["Hindi (Devanagari)", "नमस्ते", 3],
		["Arabic", "السلام", 6],
		["Thai", "สวัสดี", 4],
		["Combining marks (é)", "é", 1], // e + U+0301
		["Chinese (CJK)", "你好", 2],
		["Hebrew with niqqud", "שָׁלוֹם", 4],
		["Grapheme with emoji ZWJs", "👩‍❤️‍👩", 1], // joined with U+200D
		["Zalgo text", "Z͑͛ͤ̈́̓ͦ́͝a̡̓͑͛̑̈́l̈́͑ͤͩg̷͊̈́͛̿͗̇o̷͐̑̎̒ͦ͊", 5],
	];

	show_debug_message("---- UTF-8 Grapheme Count Tests ----");

	for (var i = 0; i < array_length(tests); i++) {
		var _label = tests[i][0];
		var _string = tests[i][1];
		var _expect = tests[i][2];

		// Exercise the grapheme counter defined alongside this test.
		var _actual = string_grapheme_length(_string);

		var _result = (_actual == _expect) ? "✅" : "❌";
		show_debug_message(_result + " " + _label + " | Got: " + string(_actual) + " | Expected: " + string(_expect));
	}
}

test_string_length_utf8();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment