tinkerer-red · May 11, 2025 15:49
diff --git a/string_grapheme_length.gml b/string_grapheme_length.gml
 /// @func string_grapheme_length(_unicode)
 /// @desc Counts the number of grapheme clusters (visible characters) in a UTF-8 encoded string.
 ///       This is a simplified approximation of Unicode grapheme segmentation. A codepoint is merged
 ///       into the previous cluster when it is:
 ///         - VS16 (U+FE0F), a combining mark in U+0300..U+036F, or a skin tone modifier (U+1F3FB..U+1F3FF)
 ///         - a ZWJ (U+200D), or the codepoint directly following a ZWJ
 ///         - the second regional indicator of a pair (flags are two indicators each)
 ///       Marks outside U+0300..U+036F are NOT recognised, so scripts whose combining marks live in
 ///       other blocks are counted per codepoint.
 ///       Results are memoised per input string in a static struct that is never cleared.
 /// @param {string} _unicode - A string potentially containing UTF-8 encoded content
 /// @return {int} The number of visible grapheme clusters (0 for an empty string)
 function string_grapheme_length(_unicode) {
 	static _cache = {};
 	static ZWJ = 0x200D;
 	static VS16 = 0xFE0F;
 	static SKIN_MIN = 0x1F3FB;
 	static SKIN_MAX = 0x1F3FF;
 	static REGIONAL_MIN = 0x1F1E6;
 	static REGIONAL_MAX = 0x1F1FF;
 	static COMBINING_MIN = 0x0300;
 	static COMBINING_MAX = 0x036F;

 	// If this has been parsed before, return the data.
 	// The string itself is hashed and used as the key into the cache struct.
 	var _hash = variable_get_hash(_unicode);
 	if (struct_exists_from_hash(_cache, _hash)) {
 		return struct_get_from_hash(_cache, _hash);
 	}

 	// Buffer set up: copy the raw bytes so codepoints can be decoded sequentially
 	var size = string_byte_length(_unicode);
 	var _buff = buffer_create(size + 1, buffer_fixed, 1);
 	buffer_write(_buff, buffer_text, _unicode);
 	buffer_seek(_buff, buffer_seek_start, 0);

 	var count = 0;
 	var last_cp = -1; // numeric sentinel: never equals ZWJ, and avoids comparing undefined with numbers
 	var ri_run = 0;   // length of the current run of consecutive regional indicators
 	var cp;

 	// NOTE(review): __uc_buffer_read_codepoint is defined elsewhere; it is expected to decode one
 	// codepoint and advance the buffer position, otherwise this loop would not terminate.
 	while (buffer_tell(_buff) < size) {
 		cp = __uc_buffer_read_codepoint(_buff);

 		var is_regional = (cp >= REGIONAL_MIN && cp <= REGIONAL_MAX);
 		ri_run = is_regional ? ri_run + 1 : 0;

 		var is_extend =
 			(cp == VS16)
 			|| (cp >= COMBINING_MIN && cp <= COMBINING_MAX)
 			|| (cp >= SKIN_MIN && cp <= SKIN_MAX);

 		// Only every SECOND indicator in a run joins the previous one, so two adjacent
 		// flags (four indicators) count as two clusters instead of collapsing into one.
 		var is_regional_pair = is_regional && ((ri_run % 2) == 0);

 		var is_zwj_sequence =
 			(last_cp == ZWJ) || (cp == ZWJ);

 		// The very first codepoint always starts a cluster (even a stray combining mark);
 		// afterwards only break the cluster if not extend or zwj or regional pair.
 		if (count == 0 || (!is_extend && !is_regional_pair && !is_zwj_sequence)) {
 			count += 1;
 		}

 		last_cp = cp;
 	}

 	buffer_delete(_buff);

 	// Cache the info
 	struct_set_from_hash(_cache, _hash, count);

 	return count;
 }
diff --git a/unitTest.gml b/unitTest.gml
 /// @func test_string_length_utf8()
 /// @desc Runs a table of labelled inputs through string_grapheme_length() and prints one
 ///       pass/fail line per case to the debug output. Each row is [label, input, expected count].
 function test_string_length_utf8() {
     var tests = [
         ["simple text",                  "hello",                    5],
         ["basic emoji",                  "😀",                      1],
         ["emoji sequence",              "👨‍👩‍👧‍👦",                 1],
         ["mixed text",                  "Hi 😀!",                   5],
         ["emoji + skin tone",           "👍🏽",                      1],
         ["heart with ZWJ",              "❤️",                      1],
         ["complex emoji",               "🧑🏽‍🚒",                   1],
         ["flag",                        "🇺🇸",                      1],

         // Multilingual and edge cases
         ["Japanese hiragana",           "こんにちは",               5],
         ["Korean hangul",               "안녕하세요",               5],
         ["Hindi (Devanagari)",          "नमस्ते",                  3],
         ["Arabic",                      "السلام",                  6],
         ["Thai",                        "สวัสดี",                  4],
         ["Combining marks (é)",        "é",                       1], // e + U+0301
         ["Chinese (CJK)",               "你好",                    2],
         ["Hebrew with niqqud",          "שָׁלוֹם",                4],
         ["Grapheme with emoji ZWJs",    "👩‍❤️‍👩",                 1],
         ["Zalgo text",                  "Z͑͛ͤ̈́̓ͦ́͝a̡̓͑͛̑̈́l̈́͑ͤͩg̷͊̈́͛̿͗̇o̷͐̑̎̒ͦ͊", 5],
     ];

     show_debug_message("---- UTF-8 Grapheme Count Tests ----");
     for (var i = 0; i < array_length(tests); i++) {
         var _label   = tests[i][0];
         var _string  = tests[i][1];
         var _expect  = tests[i][2];
         // Call the function this file actually defines (was string_length_utf8).
         var _actual  = string_grapheme_length(_string);
         var _result  = (_actual == _expect) ? "✅" : "❌";
         show_debug_message(_result + " " + _label + " | Got: " + string(_actual) + " | Expected: " + string(_expect));
     }
 }

 // Run the tests as soon as this script is loaded.
 test_string_length_utf8();
	/// @func string_grapheme_length(_unicode)
	/// @desc Counts the number of grapheme clusters (visible characters) in a UTF-8 encoded string.
	///       This is a simplified approximation of Unicode grapheme segmentation. A codepoint is merged
	///       into the previous cluster when it is:
	///         - VS16 (U+FE0F), a combining mark in U+0300..U+036F, or a skin tone modifier (U+1F3FB..U+1F3FF)
	///         - a ZWJ (U+200D), or the codepoint directly following a ZWJ
	///         - the second regional indicator of a pair (flags are two indicators each)
	///       Marks outside U+0300..U+036F are NOT recognised, so scripts whose combining marks live in
	///       other blocks are counted per codepoint.
	///       Results are memoised per input string in a static struct that is never cleared.
	/// @param {string} _unicode - A string potentially containing UTF-8 encoded content
	/// @return {int} The number of visible grapheme clusters (0 for an empty string)
	function string_grapheme_length(_unicode) {
		static _cache = {};
		static ZWJ = 0x200D;
		static VS16 = 0xFE0F;
		static SKIN_MIN = 0x1F3FB;
		static SKIN_MAX = 0x1F3FF;
		static REGIONAL_MIN = 0x1F1E6;
		static REGIONAL_MAX = 0x1F1FF;
		static COMBINING_MIN = 0x0300;
		static COMBINING_MAX = 0x036F;

		// If this has been parsed before, return the data.
		// The string itself is hashed and used as the key into the cache struct.
		var _hash = variable_get_hash(_unicode);
		if (struct_exists_from_hash(_cache, _hash)) {
			return struct_get_from_hash(_cache, _hash);
		}

		// Buffer set up: copy the raw bytes so codepoints can be decoded sequentially
		var size = string_byte_length(_unicode);
		var _buff = buffer_create(size + 1, buffer_fixed, 1);
		buffer_write(_buff, buffer_text, _unicode);
		buffer_seek(_buff, buffer_seek_start, 0);

		var count = 0;
		var last_cp = -1; // numeric sentinel: never equals ZWJ, and avoids comparing undefined with numbers
		var ri_run = 0;   // length of the current run of consecutive regional indicators
		var cp;

		// NOTE(review): __uc_buffer_read_codepoint is defined elsewhere; it is expected to decode one
		// codepoint and advance the buffer position, otherwise this loop would not terminate.
		while (buffer_tell(_buff) < size) {
			cp = __uc_buffer_read_codepoint(_buff);

			var is_regional = (cp >= REGIONAL_MIN && cp <= REGIONAL_MAX);
			ri_run = is_regional ? ri_run + 1 : 0;

			var is_extend =
				(cp == VS16)
				|| (cp >= COMBINING_MIN && cp <= COMBINING_MAX)
				|| (cp >= SKIN_MIN && cp <= SKIN_MAX);

			// Only every SECOND indicator in a run joins the previous one, so two adjacent
			// flags (four indicators) count as two clusters instead of collapsing into one.
			var is_regional_pair = is_regional && ((ri_run % 2) == 0);

			var is_zwj_sequence =
				(last_cp == ZWJ) || (cp == ZWJ);

			// The very first codepoint always starts a cluster (even a stray combining mark);
			// afterwards only break the cluster if not extend or zwj or regional pair.
			if (count == 0 || (!is_extend && !is_regional_pair && !is_zwj_sequence)) {
				count += 1;
			}

			last_cp = cp;
		}

		buffer_delete(_buff);

		// Cache the info
		struct_set_from_hash(_cache, _hash, count);

		return count;
	}
	/// @func test_string_length_utf8()
	/// @desc Runs a table of labelled inputs through string_grapheme_length() and prints one
	///       pass/fail line per case to the debug output. Each row is [label, input, expected count].
	function test_string_length_utf8() {
		var tests = [
			["simple text", "hello", 5],
			["basic emoji", "😀", 1],
			["emoji sequence", "👨‍👩‍👧‍👦", 1],
			["mixed text", "Hi 😀!", 5],
			["emoji + skin tone", "👍🏽", 1],
			["heart with ZWJ", "❤️", 1],
			["complex emoji", "🧑🏽‍🚒", 1],
			["flag", "🇺🇸", 1],

			// Multilingual and edge cases
			["Japanese hiragana", "こんにちは", 5],
			["Korean hangul", "안녕하세요", 5],
			["Hindi (Devanagari)", "नमस्ते", 3],
			["Arabic", "السلام", 6],
			["Thai", "สวัสดี", 4],
			["Combining marks (é)", "é", 1], // e + U+0301
			["Chinese (CJK)", "你好", 2],
			["Hebrew with niqqud", "שָׁלוֹם", 4],
			["Grapheme with emoji ZWJs", "👩‍❤️‍👩", 1],
			["Zalgo text", "Z͑͛ͤ̈́̓ͦ́͝a̡̓͑͛̑̈́l̈́͑ͤͩg̷͊̈́͛̿͗̇o̷͐̑̎̒ͦ͊", 5],
		];

		show_debug_message("---- UTF-8 Grapheme Count Tests ----");
		for (var i = 0; i < array_length(tests); i++) {
			var _label = tests[i][0];
			var _string = tests[i][1];
			var _expect = tests[i][2];
			// Call the function this file actually defines (was string_length_utf8).
			var _actual = string_grapheme_length(_string);
			var _result = (_actual == _expect) ? "✅" : "❌";
			show_debug_message(_result + " " + _label + " | Got: " + string(_actual) + " | Expected: " + string(_expect));
		}
	}

	// Run the tests as soon as this script is loaded.
	test_string_length_utf8();