@romualdo97
Last active April 18, 2025 17:52
Basic C++ Utf8String class
#include "Utf8String.h"
namespace
{
enum class EUtf8SequenceSize : uint8_t
{
One, Two, Three, Four, Invalid
};
[[nodiscard]] EUtf8SequenceSize SizeOfUtf8Sequence(const unsigned char Utf8Char)
{
    // https://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
    if (Utf8Char <= 0x7f) // 0b0111'1111
    {
        return EUtf8SequenceSize::One;
    }
    else if (Utf8Char <= 0xbf) // 0b1011'1111
    {
        return EUtf8SequenceSize::Invalid; // Not a leading UTF-8 byte: either a stray continuation byte or a malformed previous sequence
    }
    else if (Utf8Char <= 0xdf) // 0b1101'1111
    {
        return EUtf8SequenceSize::Two;
    }
    else if (Utf8Char <= 0xef) // 0b1110'1111
    {
        return EUtf8SequenceSize::Three;
    }
    else if (Utf8Char <= 0xf7) // 0b1111'0111
    {
        return EUtf8SequenceSize::Four;
    }

    // Unicode 3.1 ruled out five- and six-octet UTF-8 sequences as illegal, although
    // earlier specifications such as Unicode 3.0 and RFC 2279 allowed them.
    // Therefore, leading bytes above 0xf7 are rejected here.
    return EUtf8SequenceSize::Invalid;
}
/**
 * Decodes the next Unicode codepoint and advances the pointer to the start of the
 * following sequence. Returns 0 at the terminating null or on an invalid leading byte
 * (in which case the pointer is NOT advanced). Continuation bytes are masked but not
 * validated, so the input is assumed to be well-formed UTF-8.
 */
[[nodiscard]] char32_t NextCodepointFromUtf8Sequence(const unsigned char*& Utf8Sequence)
{
    if (*Utf8Sequence == 0)
    {
        return 0;
    }

    const EUtf8SequenceSize NumOfBytes = SizeOfUtf8Sequence(*Utf8Sequence);
    if (NumOfBytes == EUtf8SequenceSize::Invalid)
    {
        return 0; // End processing
    }

    const unsigned char FirstByte = *Utf8Sequence;
    if (NumOfBytes == EUtf8SequenceSize::One)
    {
        ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
        return FirstByte;
    }

    const unsigned char SecondByte = *(++Utf8Sequence);
    if (SecondByte == 0)
    {
        return 0; // Truncated sequence
    }
    if (NumOfBytes == EUtf8SequenceSize::Two)
    {
        ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
        return
            ((FirstByte & 0b0001'1111) << 6) |
            (SecondByte & 0b0011'1111);
    }

    const unsigned char ThirdByte = *(++Utf8Sequence);
    if (ThirdByte == 0)
    {
        return 0; // Truncated sequence
    }
    if (NumOfBytes == EUtf8SequenceSize::Three)
    {
        ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
        return
            ((FirstByte & 0b0000'1111) << 12) |
            ((SecondByte & 0b0011'1111) << 6) |
            (ThirdByte & 0b0011'1111);
    }

    const unsigned char FourthByte = *(++Utf8Sequence);
    if (FourthByte == 0)
    {
        return 0; // Truncated sequence
    }

    ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
    return
        ((FirstByte & 0b0000'0111) << 18) |
        ((SecondByte & 0b0011'1111) << 12) |
        ((ThirdByte & 0b0011'1111) << 6) |
        (FourthByte & 0b0011'1111);
}
} // namespace

Utf8String::Utf8String(const char* Str)
    : Data(Str)
{}

int32_t Utf8String::Len() const
{
    return static_cast<int32_t>(Data.size());
}
int32_t Utf8String::CodePointsLen() const
{
    if (Len() == 0)
    {
        return 0;
    }

    int32_t TotalCodePoints = 0;
    const unsigned char* Utf8Str = GetRawData();
    while (NextCodepointFromUtf8Sequence(Utf8Str))
    {
        ++TotalCodePoints;
    }
    return TotalCodePoints;
}
bool Utf8String::IsMultiByte() const
{
    const unsigned char* Utf8Str = GetRawData();
    while (*Utf8Str != 0)
    {
        const char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str);
        if (UnicodeCodePoint == 0)
        {
            break; // Invalid sequence: the pointer no longer advances, so stop to avoid an infinite loop
        }
        if (UnicodeCodePoint > 0x7f) // Anything beyond ASCII needs more than one UTF-8 byte
        {
            return true;
        }
    }
    return false;
}
const char* Utf8String::operator*() const
{
    return Data.c_str();
}
std::u32string Utf8String::ToUtf32() const
{
    std::u32string Utf32Output;
    if (Len() == 0)
    {
        return Utf32Output;
    }

    const unsigned char* Utf8Str = GetRawData();
    while (*Utf8Str != 0)
    {
        const char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str);
        if (UnicodeCodePoint == 0)
        {
            break; // Invalid sequence: the pointer no longer advances, so stop to avoid an infinite loop
        }
        Utf32Output.push_back(UnicodeCodePoint);
    }
    return Utf32Output;
}
std::u16string Utf8String::ToUtf16() const
{
    // UTF8: https://en.wikipedia.org/wiki/UTF-8
    // UTF16: https://en.wikipedia.org/wiki/UTF-16
    std::u16string Utf16Output;
    if (Len() == 0)
    {
        return Utf16Output;
    }

    // https://stackoverflow.com/questions/73758747/looking-for-the-description-of-the-algorithm-to-convert-utf8-to-utf16
    const unsigned char* Utf8Str = GetRawData();
    while (*Utf8Str != 0)
    {
        char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str);
        if (UnicodeCodePoint == 0)
        {
            break; // Invalid sequence: the pointer no longer advances, so stop to avoid an infinite loop
        }
        if (UnicodeCodePoint < 0x1'0000) // 0b0001'0000'0000'0000'0000
        {
            // Codepoints in the Basic Multilingual Plane map to a single UTF-16 code unit
            Utf16Output.push_back(static_cast<char16_t>(UnicodeCodePoint));
        }
        else
        {
            // Codepoints above the BMP become a surrogate pair,
            // e.g. U+1F600 - 0x1'0000 = 0xF600 -> high 0xD83D, low 0xDE00
            UnicodeCodePoint -= 0x1'0000;
            const char16_t HighSurrogate = 0xd800 + ((UnicodeCodePoint >> 10) & 0x3FF); // 0x3FF == 0b0011'1111'1111
            const char16_t LowSurrogate = 0xdc00 + (UnicodeCodePoint & 0x3FF);
            Utf16Output.push_back(HighSurrogate);
            Utf16Output.push_back(LowSurrogate);
        }
    }
    return Utf16Output;
}
std::wstring Utf8String::ToWide() const
{
    static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "Unexpected wchar_t size");

    std::wstring WideOutput;
    if (Len() == 0)
    {
        return WideOutput;
    }

    if constexpr (sizeof(wchar_t) == 4)
    {
        const std::u32string Utf32String = ToUtf32();
        WideOutput.reserve(Utf32String.size());
        for (const char32_t& Char : Utf32String)
        {
            WideOutput.push_back(static_cast<wchar_t>(Char));
        }
        return WideOutput;
    }
    else if constexpr (sizeof(wchar_t) == 2)
    {
        const std::u16string Utf16String = ToUtf16();
        WideOutput.reserve(Utf16String.size());
        for (const char16_t& Char : Utf16String)
        {
            WideOutput.push_back(static_cast<wchar_t>(Char));
        }
        return WideOutput;
    }
    else // sizeof(wchar_t) == 1, enforced by the static_assert above
    {
        WideOutput.reserve(Data.size());
        for (const char& Char : Data)
        {
            WideOutput.push_back(Char);
        }
        return WideOutput;
    }
}
const unsigned char* Utf8String::GetRawData() const
{
    return reinterpret_cast<const unsigned char*>(Data.c_str());
}
// Utf8String.h
#pragma once

#include <cstdint>
#include <string>

/**
 * UTF-8 encoded string
 */
class Utf8String
{
public:
    /** Construct a UTF-8 string */
    explicit Utf8String(const char* Str);

    /** Number of bytes in the string (not Unicode codepoints) */
    [[nodiscard]] int32_t Len() const;

    /** Number of Unicode codepoints */
    [[nodiscard]] int32_t CodePointsLen() const;

    /** True if this string contains codepoints outside the ASCII range [0, 127], which require multiple bytes to encode */
    [[nodiscard]] bool IsMultiByte() const;

    /** Null-terminated UTF-8 string */
    [[nodiscard]] const char* operator*() const;

    /** Converts to a UTF-32 (a.k.a. UCS-4) string, where each element is a single Unicode codepoint */
    [[nodiscard]] std::u32string ToUtf32() const;

    /** Converts to a UTF-16 string */
    [[nodiscard]] std::u16string ToUtf16() const;

    /**
     * Returns the UTF-16 representation of this string if the platform's wchar_t is 2 bytes wide,
     * or the UTF-32 representation if it is 4 bytes wide. This is mostly intended for
     * APIs that require wchar_t, such as Win32, but it is not portable across platforms
     * since the size differs (e.g. 2 bytes on Windows, 4 bytes on Unix), so avoid it
     * unless you know what you're doing.
     */
    [[nodiscard]] std::wstring ToWide() const;

private:
    /** The data as unsigned chars for bitwise manipulation */
    [[nodiscard]] const unsigned char* GetRawData() const;

private:
    std::string Data;
};
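
For reference, a minimal usage sketch. This main.cpp is illustrative and not part of the original gist; it assumes the two files above are compiled together.

// main.cpp (hypothetical usage example, not part of the gist)
#include "Utf8String.h"
#include <cstdio>

int main()
{
    // "héllo 😀" spelled out as explicit UTF-8 bytes to avoid source-encoding issues
    const Utf8String Str("h\xC3\xA9llo \xF0\x9F\x98\x80");
    std::printf("Bytes: %d\n", Str.Len());                 // 11 (é takes 2 bytes, the emoji 4)
    std::printf("Codepoints: %d\n", Str.CodePointsLen());  // 7
    std::printf("Multi-byte: %d\n", Str.IsMultiByte());    // 1 (true)
    const std::u16string Utf16 = Str.ToUtf16();
    std::printf("UTF-16 units: %zu\n", Utf16.size());      // 8 (the emoji becomes a surrogate pair)
    return 0;
}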