Skip to content

Instantly share code, notes, and snippets.

@tinkerer-red
Created May 11, 2025 15:49
Show Gist options
  • Save tinkerer-red/5604ad9bcf1a56dae1d93fc9bdf6effd to your computer and use it in GitHub Desktop.
Save tinkerer-red/5604ad9bcf1a56dae1d93fc9bdf6effd to your computer and use it in GitHub Desktop.
A Latin-based implementation for counting the number of grapheme clusters (visible characters) in a string.
/// @func string_grapheme_length(_unicode)
/// @desc Counts the number of grapheme clusters (visible characters) in a UTF-8 encoded string.
///       This is a simplified, Latin/emoji-oriented approximation of Unicode text segmentation:
///       only U+0300..U+036F combining marks, VS16, emoji skin-tone modifiers, ZWJ sequences and
///       regional-indicator (flag) pairs are merged into the preceding cluster. Combining marks
///       of other scripts are counted as separate characters.
///       Results are memoised per input string in a static struct that is never pruned.
/// @param {string} _unicode - A string potentially containing UTF-8 encoded content
/// @return {int} The number of visible grapheme clusters
function string_grapheme_length(_unicode) {
    // Statics are declared ahead of the cached early-return so they are always initialised.
    static _cache = {};
    static ZWJ = 0x200D;            // zero width joiner
    static VS16 = 0xFE0F;           // variation selector-16 (emoji presentation)
    static SKIN_MIN = 0x1F3FB;      // emoji skin-tone modifiers
    static SKIN_MAX = 0x1F3FF;
    static REGIONAL_MIN = 0x1F1E6;  // regional indicator symbols (flag halves)
    static REGIONAL_MAX = 0x1F1FF;
    static COMBINING_MIN = 0x0300;  // combining diacritical marks
    static COMBINING_MAX = 0x036F;

    // If this has been parsed before, return the data
    var _hash = variable_get_hash(_unicode);
    if (struct_exists_from_hash(_cache, _hash)) {
        return struct_get_from_hash(_cache, _hash);
    }

    // Buffer set up: copy the raw bytes so code points can be decoded sequentially
    var size = string_byte_length(_unicode);
    var _buff = buffer_create(size + 1, buffer_fixed, 1);
    buffer_write(_buff, buffer_text, _unicode);
    buffer_seek(_buff, buffer_seek_start, 0);

    var count = 0;
    // -1 is not a valid code point, so every "previous code point" test is false on the first pass
    // without comparing against undefined.
    var last_cp = -1;
    // True while a single regional indicator is waiting for its partner to form a flag.
    var _flag_half_open = false;

    while (buffer_tell(_buff) < size) {
        // NOTE(review): __uc_buffer_read_codepoint is defined outside this file; presumably it
        // decodes one UTF-8 code point and advances the buffer position - confirm.
        var cp = __uc_buffer_read_codepoint(_buff);

        var is_extend =
            (cp == VS16)
            || (cp >= COMBINING_MIN && cp <= COMBINING_MAX)
            || (cp >= SKIN_MIN && cp <= SKIN_MAX);

        // Regional indicators pair up two at a time; a third consecutive indicator starts a new
        // flag rather than being glued onto the previous one.
        var is_regional = (cp >= REGIONAL_MIN && cp <= REGIONAL_MAX);
        var is_regional_pair = is_regional && _flag_half_open;
        _flag_half_open = is_regional && !is_regional_pair;

        // Simplification: anything directly after a ZWJ is treated as joined to the cluster.
        var is_zwj_sequence = (last_cp == ZWJ) || (cp == ZWJ);

        // The first code point always opens a cluster (even an orphaned combining mark);
        // afterwards only break the cluster if not extend or zwj or regional pair.
        if (count == 0 || (!is_extend && !is_regional_pair && !is_zwj_sequence)) {
            count += 1;
        }

        last_cp = cp;
    }

    buffer_delete(_buff);

    // Cache the info
    struct_set_from_hash(_cache, _hash, count);
    return count;
}
/// @func test_string_length_utf8()
/// @desc Runs string_grapheme_length() against a table of [label, input, expected count] cases
///       and prints one pass/fail line per case to the debug output. Returns nothing.
///       NOTE(review): string_grapheme_length only merges combining marks in U+0300..U+036F, so
///       the Devanagari, Thai and Hebrew cases (whose marks lie outside that range) are expected
///       to report a mismatch until that function handles them.
function test_string_length_utf8() {
    // Each entry: [label, input string, expected grapheme cluster count]
    var tests = [
        ["simple text", "hello", 5],
        ["basic emoji", "😀", 1],
        ["emoji sequence", "👨‍👩‍👧‍👦", 1],
        ["mixed text", "Hi 😀!", 5],
        ["emoji + skin tone", "👍🏽", 1],
        ["heart with ZWJ", "❤️", 1],
        ["complex emoji", "🧑🏽‍🚒", 1],
        ["flag", "🇺🇸", 1],
        // Multilingual and edge cases
        ["Japanese hiragana", "こんにちは", 5],
        ["Korean hangul", "안녕하세요", 5],
        ["Hindi (Devanagari)", "नमस्ते", 3],
        ["Arabic", "السلام", 6],
        ["Thai", "สวัสดี", 4],
        ["Combining marks (é)", "é", 1], // e + U+0301
        ["Chinese (CJK)", "你好", 2],
        ["Hebrew with niqqud", "שָׁלוֹם", 4],
        ["Grapheme with emoji ZWJs", "👩‍❤️‍👩", 1],
        ["Zalgo text", "Z͑͛ͤ̈́̓ͦ́͝a̡̓͑͛̑̈́l̈́͑ͤͩg̷͊̈́͛̿͗̇o̷͐̑̎̒ͦ͊", 5],
    ];

    show_debug_message("---- UTF-8 Grapheme Count Tests ----");

    for (var i = 0; i < array_length(tests); i++) {
        var _label = tests[i][0];
        var _string = tests[i][1];
        var _expect = tests[i][2];
        // The function under test is string_grapheme_length; the previously called
        // string_length_utf8 is not defined in this file.
        var _actual = string_grapheme_length(_string);
        var _result = (_actual == _expect) ? "✅" : "❌";
        show_debug_message(_result + " " + _label + " | Got: " + string(_actual) + " | Expected: " + string(_expect));
    }
}

// Run the test table immediately when this script is executed.
test_string_length_utf8();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment