@romualdo97
Last active April 18, 2025 17:52
Basic C++ Utf8String class
#include "Utf8String.h"
namespace
{
enum class EUtf8SequenceSize : uint8_t
{
One, Two, Three, Four, Invalid
};
[[nodiscard]] EUtf8SequenceSize SizeOfUtf8Sequence(const unsigned char Utf8Char)
{
    // https://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
    if (Utf8Char <= 0x7f) // 0b0111'1111
    {
        return EUtf8SequenceSize::One;
    }
    else if (Utf8Char <= 0xbf) // 0b1011'1111
    {
        return EUtf8SequenceSize::Invalid; // Not a leading UTF-8 byte: either a stray continuation byte or a malformed previous sequence
    }
    else if (Utf8Char <= 0xdf) // 0b1101'1111
    {
        return EUtf8SequenceSize::Two;
    }
    else if (Utf8Char <= 0xef) // 0b1110'1111
    {
        return EUtf8SequenceSize::Three;
    }
    else if (Utf8Char <= 0xf7) // 0b1111'0111
    {
        return EUtf8SequenceSize::Four;
    }

    // Unicode 3.1 ruled out five- and six-octet UTF-8 sequences as illegal, although
    // earlier specifications such as Unicode 3.0 and RFC 2279 allowed them.
    // Therefore, leading bytes above 0xf7 are rejected here.
    return EUtf8SequenceSize::Invalid;
}
/**
 * Decodes the next Unicode codepoint and advances the pointer to the start of the
 * following sequence. Returns 0 at the terminating null or on an invalid leading byte
 * (in which case the pointer is NOT advanced). Continuation bytes are masked but not
 * validated, so the input is assumed to be well-formed UTF-8.
 */
[[nodiscard]] char32_t NextCodepointFromUtf8Sequence(const unsigned char*& Utf8Sequence)
{
    if (*Utf8Sequence == 0)
    {
        return 0;
    }

    const EUtf8SequenceSize NumOfBytes = SizeOfUtf8Sequence(*Utf8Sequence);
    if (NumOfBytes == EUtf8SequenceSize::Invalid)
    {
        return 0; // End processing
    }

    const unsigned char FirstByte = *Utf8Sequence;
    if (NumOfBytes == EUtf8SequenceSize::One)
    {
        ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
        return FirstByte;
    }

    const unsigned char SecondByte = *(++Utf8Sequence);
    if (SecondByte == 0)
    {
        return 0; // Truncated sequence
    }
    if (NumOfBytes == EUtf8SequenceSize::Two)
    {
        ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
        return
            ((FirstByte & 0b0001'1111) << 6) |
            (SecondByte & 0b0011'1111);
    }

    const unsigned char ThirdByte = *(++Utf8Sequence);
    if (ThirdByte == 0)
    {
        return 0; // Truncated sequence
    }
    if (NumOfBytes == EUtf8SequenceSize::Three)
    {
        ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
        return
            ((FirstByte & 0b0000'1111) << 12) |
            ((SecondByte & 0b0011'1111) << 6) |
            (ThirdByte & 0b0011'1111);
    }

    const unsigned char FourthByte = *(++Utf8Sequence);
    if (FourthByte == 0)
    {
        return 0; // Truncated sequence
    }

    ++Utf8Sequence; // Point to the start of the next UTF-8 sequence
    return
        ((FirstByte & 0b0000'0111) << 18) |
        ((SecondByte & 0b0011'1111) << 12) |
        ((ThirdByte & 0b0011'1111) << 6) |
        (FourthByte & 0b0011'1111);
}
} // namespace

Utf8String::Utf8String(const char* Str)
    : Data(Str)
{}

int32_t Utf8String::Len() const
{
    return static_cast<int32_t>(Data.size());
}
int32_t Utf8String::CodePointsLen() const
{
    if (Len() == 0)
    {
        return 0;
    }

    int32_t TotalCodePoints = 0;
    const unsigned char* Utf8Str = GetRawData();
    while (NextCodepointFromUtf8Sequence(Utf8Str))
    {
        ++TotalCodePoints;
    }
    return TotalCodePoints;
}
bool Utf8String::IsMultiByte() const
{
    const unsigned char* Utf8Str = GetRawData();
    while (*Utf8Str != 0)
    {
        const char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str);
        if (UnicodeCodePoint == 0)
        {
            break; // Invalid sequence: the pointer no longer advances, so stop to avoid an infinite loop
        }
        if (UnicodeCodePoint > 0x7f) // Anything beyond ASCII needs more than one UTF-8 byte
        {
            return true;
        }
    }
    return false;
}
const char* Utf8String::operator*() const
{
    return Data.c_str();
}
std::u32string Utf8String::ToUtf32() const
{
    std::u32string Utf32Output;
    if (Len() == 0)
    {
        return Utf32Output;
    }

    const unsigned char* Utf8Str = GetRawData();
    while (*Utf8Str != 0)
    {
        const char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str);
        if (UnicodeCodePoint == 0)
        {
            break; // Invalid sequence: the pointer no longer advances, so stop to avoid an infinite loop
        }
        Utf32Output.push_back(UnicodeCodePoint);
    }
    return Utf32Output;
}
std::u16string Utf8String::ToUtf16() const
{
    // UTF8: https://en.wikipedia.org/wiki/UTF-8
    // UTF16: https://en.wikipedia.org/wiki/UTF-16
    std::u16string Utf16Output;
    if (Len() == 0)
    {
        return Utf16Output;
    }

    // https://stackoverflow.com/questions/73758747/looking-for-the-description-of-the-algorithm-to-convert-utf8-to-utf16
    const unsigned char* Utf8Str = GetRawData();
    while (*Utf8Str != 0)
    {
        char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str);
        if (UnicodeCodePoint == 0)
        {
            break; // Invalid sequence: the pointer no longer advances, so stop to avoid an infinite loop
        }
        if (UnicodeCodePoint < 0x1'0000) // 0b0001'0000'0000'0000'0000
        {
            // Codepoints in the Basic Multilingual Plane map to a single UTF-16 code unit
            Utf16Output.push_back(static_cast<char16_t>(UnicodeCodePoint));
        }
        else
        {
            // Codepoints above the BMP become a surrogate pair,
            // e.g. U+1F600 - 0x1'0000 = 0xF600 -> high 0xD83D, low 0xDE00
            UnicodeCodePoint -= 0x1'0000;
            const char16_t HighSurrogate = 0xd800 + ((UnicodeCodePoint >> 10) & 0x3FF); // 0x3FF == 0b0011'1111'1111
            const char16_t LowSurrogate = 0xdc00 + (UnicodeCodePoint & 0x3FF);
            Utf16Output.push_back(HighSurrogate);
            Utf16Output.push_back(LowSurrogate);
        }
    }
    return Utf16Output;
}
std::wstring Utf8String::ToWide() const
{
    static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "Unexpected wchar_t size");

    std::wstring WideOutput;
    if (Len() == 0)
    {
        return WideOutput;
    }

    if constexpr (sizeof(wchar_t) == 4)
    {
        const std::u32string Utf32String = ToUtf32();
        WideOutput.reserve(Utf32String.size());
        for (const char32_t& Char : Utf32String)
        {
            WideOutput.push_back(static_cast<wchar_t>(Char));
        }
        return WideOutput;
    }
    else if constexpr (sizeof(wchar_t) == 2)
    {
        const std::u16string Utf16String = ToUtf16();
        WideOutput.reserve(Utf16String.size());
        for (const char16_t& Char : Utf16String)
        {
            WideOutput.push_back(static_cast<wchar_t>(Char));
        }
        return WideOutput;
    }
    else // sizeof(wchar_t) == 1, enforced by the static_assert above
    {
        WideOutput.reserve(Data.size());
        for (const char& Char : Data)
        {
            WideOutput.push_back(Char);
        }
        return WideOutput;
    }
}
const unsigned char* Utf8String::GetRawData() const
{
    return reinterpret_cast<const unsigned char*>(Data.c_str());
}
// Utf8String.h
#pragma once

#include <cstdint>
#include <string>

/**
 * UTF-8 encoded string
 */
class Utf8String
{
public:
    /** Construct a UTF-8 string */
    explicit Utf8String(const char* Str);

    /** Number of bytes in the string (not Unicode codepoints) */
    [[nodiscard]] int32_t Len() const;

    /** Number of Unicode codepoints */
    [[nodiscard]] int32_t CodePointsLen() const;

    /** True if this string contains codepoints outside the ASCII range [0, 127], which require multiple bytes to encode */
    [[nodiscard]] bool IsMultiByte() const;

    /** Null-terminated UTF-8 string */
    [[nodiscard]] const char* operator*() const;

    /** Converts to a UTF-32 (a.k.a. UCS-4) string, where each element is a single Unicode codepoint */
    [[nodiscard]] std::u32string ToUtf32() const;

    /** Converts to a UTF-16 string */
    [[nodiscard]] std::u16string ToUtf16() const;

    /**
     * Returns the UTF-16 representation of this string if the platform's wchar_t is 2 bytes wide,
     * or the UTF-32 representation if it is 4 bytes wide. This is mostly intended for
     * APIs that require wchar_t, such as Win32, but it is not portable across platforms
     * since the size differs (e.g. 2 bytes on Windows, 4 bytes on Unix), so avoid it
     * unless you know what you're doing.
     */
    [[nodiscard]] std::wstring ToWide() const;

private:
    /** The data as unsigned chars for bitwise manipulation */
    [[nodiscard]] const unsigned char* GetRawData() const;

private:
    std::string Data;
};
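
For reference, a minimal usage sketch. This main.cpp is illustrative and not part of the original gist; it assumes the two files above are compiled together.

// main.cpp (hypothetical usage example, not part of the gist)
#include "Utf8String.h"
#include <cstdio>

int main()
{
    // "héllo 😀" spelled out as explicit UTF-8 bytes to avoid source-encoding issues
    const Utf8String Str("h\xC3\xA9llo \xF0\x9F\x98\x80");
    std::printf("Bytes: %d\n", Str.Len());                 // 11 (é takes 2 bytes, the emoji 4)
    std::printf("Codepoints: %d\n", Str.CodePointsLen());  // 7
    std::printf("Multi-byte: %d\n", Str.IsMultiByte());    // 1 (true)
    const std::u16string Utf16 = Str.ToUtf16();
    std::printf("UTF-16 units: %zu\n", Utf16.size());      // 8 (the emoji becomes a surrogate pair)
    return 0;
}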