// Easy to understand UTF-8 string class.
// More details at rawsourcecode.io post
#include "Utf8String.h"
namespace
{
    /** Number of bytes in a UTF-8 sequence, derived from its leading byte. */
    enum class EUtf8SequenceSize : uint8_t
    {
        One, Two, Three, Four, Invalid
    };

    /**
     * Classifies a byte as the leading byte of a 1-4 byte UTF-8 sequence.
     * https://unicode.org/mail-arch/unicode-ml/y2003-m02/att-0467/01-The_Algorithm_to_Valide_an_UTF-8_String
     */
    [[nodiscard]] EUtf8SequenceSize SizeOfUtf8Sequence(const unsigned char& Utf8Char)
    {
        if (Utf8Char <= 0x7f) // 0b0111'1111
        {
            return EUtf8SequenceSize::One;
        }
        else if (Utf8Char <= 0xbf) // 0b1011'1111
        {
            return EUtf8SequenceSize::Invalid; // Not a leading UTF8 byte, possibly something went wrong reading previous sequences
        }
        else if (Utf8Char <= 0xdf) // 0b1101'1111
        {
            return EUtf8SequenceSize::Two;
        }
        else if (Utf8Char <= 0xef) // 0b1110'1111
        {
            return EUtf8SequenceSize::Three;
        }
        else if (Utf8Char <= 0xf7) // 0b1111'0111
        {
            return EUtf8SequenceSize::Four;
        }
        // Unicode 3.1 ruled out the five and six octets UTF-8 sequence as illegal although
        // previous standard / specification such as Unicode 3.0 and RFC 2279 allow the
        // five and six octets UTF-8 sequence. Therefore, we need to make sure those value are not in the UTF-8
        return EUtf8SequenceSize::Invalid;
    }

    /** True when Byte matches the 0b10xx'xxxx UTF-8 continuation-byte pattern. */
    [[nodiscard]] bool IsUtf8ContinuationByte(const unsigned char Byte)
    {
        return (Byte & 0b1100'0000) == 0b1000'0000;
    }

    /**
     * Aborts decoding of a malformed string: advances the cursor to the null
     * terminator so callers that loop "until end of string" always terminate
     * (previously the cursor was left in place on an invalid leading byte,
     * which made such callers spin forever), then returns the 0 sentinel.
     */
    [[nodiscard]] char32_t AbortUtf8Decoding(const unsigned char*& Utf8Sequence)
    {
        while (*Utf8Sequence != 0)
        {
            ++Utf8Sequence;
        }
        return 0;
    }

    /**
     * Decodes the UTF-8 sequence at Utf8Sequence into a Unicode codepoint and
     * advances the pointer to the start of the next sequence.
     * Returns 0 at the end of the string, or when the sequence is malformed
     * (invalid leading byte, truncated sequence, or a byte that is not a
     * continuation byte where one is required) — in the malformed case the
     * cursor is moved to the terminator so processing always ends.
     */
    [[nodiscard]] char32_t NextCodepointFromUtf8Sequence(const unsigned char*& Utf8Sequence)
    {
        if (*Utf8Sequence == 0)
        {
            return 0;
        }
        const unsigned char FirstByte = *Utf8Sequence;
        const EUtf8SequenceSize NumOfBytes = SizeOfUtf8Sequence(FirstByte);
        if (NumOfBytes == EUtf8SequenceSize::Invalid)
        {
            return AbortUtf8Decoding(Utf8Sequence); // End processing
        }
        if (NumOfBytes == EUtf8SequenceSize::One)
        {
            ++Utf8Sequence; // Point to the start of the next UTF8 sequence
            return FirstByte;
        }
        const unsigned char SecondByte = *(++Utf8Sequence);
        if (!IsUtf8ContinuationByte(SecondByte)) // Also catches a premature null terminator
        {
            return AbortUtf8Decoding(Utf8Sequence);
        }
        if (NumOfBytes == EUtf8SequenceSize::Two)
        {
            ++Utf8Sequence; // Point to the start of the next UTF8 sequence
            return
                ((FirstByte & 0b0001'1111) << 6) |
                (SecondByte & 0b0011'1111);
        }
        const unsigned char ThirdByte = *(++Utf8Sequence);
        if (!IsUtf8ContinuationByte(ThirdByte))
        {
            return AbortUtf8Decoding(Utf8Sequence);
        }
        if (NumOfBytes == EUtf8SequenceSize::Three)
        {
            ++Utf8Sequence; // Point to the start of the next UTF8 sequence
            return
                ((FirstByte & 0b0000'1111) << 12) |
                ((SecondByte & 0b0011'1111) << 6) |
                (ThirdByte & 0b0011'1111);
        }
        const unsigned char FourthByte = *(++Utf8Sequence);
        if (!IsUtf8ContinuationByte(FourthByte))
        {
            return AbortUtf8Decoding(Utf8Sequence);
        }
        ++Utf8Sequence; // Point to the start of the next UTF8 sequence
        return
            ((FirstByte & 0b0000'0111) << 18) |
            ((SecondByte & 0b0011'1111) << 12) |
            ((ThirdByte & 0b0011'1111) << 6) |
            (FourthByte & 0b0011'1111);
    }
}
/**
 * Constructs a UTF-8 string from a null-terminated byte sequence.
 * A null pointer is treated as an empty string: passing nullptr to the
 * std::string(const char*) constructor is undefined behavior.
 */
Utf8String::Utf8String(const char* Str)
    : Data(Str != nullptr ? Str : "")
{}
int32_t Utf8String::Len() const | |
{ | |
return static_cast<int32_t>(Data.size()); | |
} | |
int32_t Utf8String::CodePointsLen() const | |
{ | |
if (Len() == 0) | |
{ | |
return 0; | |
} | |
int32_t TotalCodePoints = 0; | |
const unsigned char* Utf8Str = GetRawData(); | |
while (NextCodepointFromUtf8Sequence(Utf8Str)) | |
{ | |
++TotalCodePoints; | |
} | |
return TotalCodePoints; | |
} | |
bool Utf8String::IsMultiByte() const | |
{ | |
const unsigned char* Utf8Str = GetRawData(); | |
while (*Utf8Str != 0) | |
{ | |
char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str); | |
if (UnicodeCodePoint >= 0x1'0000) | |
{ | |
return true; | |
} | |
} | |
return false; | |
} | |
const char* Utf8String::operator*() const | |
{ | |
return Data.c_str(); | |
} | |
std::u32string Utf8String::ToUtf32() const | |
{ | |
std::u32string Utf32Output; | |
if (Len() == 0) | |
{ | |
return Utf32Output; | |
} | |
const unsigned char* Utf8Str = GetRawData(); | |
while (*Utf8Str != 0) | |
{ | |
char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str); | |
Utf32Output.push_back(UnicodeCodePoint); | |
} | |
return Utf32Output; | |
} | |
std::u16string Utf8String::ToUtf16() const | |
{ | |
// UTF8: https://en.wikipedia.org/wiki/UTF-8 | |
// UTF16: https://en.wikipedia.org/wiki/UTF-16 | |
std::u16string Utf16Output; | |
if (Len() == 0) | |
{ | |
return Utf16Output; | |
} | |
// https://stackoverflow.com/questions/73758747/looking-for-the-description-of-the-algorithm-to-convert-utf8-to-utf16 | |
const unsigned char* Utf8Str = GetRawData(); | |
while (*Utf8Str != 0) | |
{ | |
char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str); | |
if (UnicodeCodePoint < 0x1'0000) // 0b0001'0000'0000'0000'0000 | |
{ | |
Utf16Output.push_back(UnicodeCodePoint); | |
} | |
else | |
{ | |
UnicodeCodePoint -= 0x1'0000; | |
char16_t HighSurrogate = 0xd800 + ((UnicodeCodePoint >> 10) & 0x3FF); // 0x3FF == 0b0011'1111'1111 | |
char16_t LowSurrogate = 0xdc00 + (UnicodeCodePoint & 0x3FF); | |
Utf16Output.push_back(HighSurrogate); | |
Utf16Output.push_back(LowSurrogate); | |
} | |
} | |
return Utf16Output; | |
} | |
std::wstring Utf8String::ToWide() const | |
{ | |
std::wstring WideOutput; | |
if (Len() == 0) | |
{ | |
return WideOutput; | |
} | |
if constexpr (sizeof(wchar_t) == 4) | |
{ | |
std::u32string Utf32String = ToUtf32(); | |
WideOutput.reserve(Utf32String.size()); | |
for (const char32_t& Char : Utf32String) | |
{ | |
WideOutput.push_back(Char); | |
} | |
return WideOutput; | |
} | |
else if constexpr (sizeof(wchar_t) == 2) | |
{ | |
std::u16string Utf16String = ToUtf16(); | |
WideOutput.reserve(Utf16String.size()); | |
for (const char16_t& Char : Utf16String) | |
{ | |
WideOutput.push_back(Char); | |
} | |
return WideOutput; | |
} | |
else if constexpr (sizeof(wchar_t) == 1) | |
{ | |
WideOutput.reserve(Data.size()); | |
for (const char& Char : Data) | |
{ | |
WideOutput.push_back(Char); | |
} | |
return WideOutput; | |
} | |
static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "Unexpected wchar_t size"); | |
} | |
const unsigned char* Utf8String::GetRawData() const | |
{ | |
return reinterpret_cast<const unsigned char*>(Data.c_str()); | |
} |
#pragma once

#include <cstdint>
#include <string>
/**
 * UTF-8 encoded string
 */
class Utf8String
{
public:
    /** Construct a UTF-8 string from a null-terminated byte sequence */
    explicit Utf8String(const char* Str); // Fixed: was declared as FUtf8String, mismatching the class name
    /** Num of char units (bytes) in the string (not Unicode codepoints) */
    [[nodiscard]] int32_t Len() const;
    /** Num of Unicode codepoints */
    [[nodiscard]] int32_t CodePointsLen() const;
    /** If true, then this string contains codepoints outside the ASCII range i.e. [0, 127] which require multiple bytes to be encoded */
    [[nodiscard]] bool IsMultiByte() const;
    /** Null terminated UTF8 string */
    [[nodiscard]] const char* operator*() const;
    /** Converts to a string of UTF32 or UCS4, where each element is equivalent to a Unicode codepoint */
    [[nodiscard]] std::u32string ToUtf32() const;
    /** Converts to a string of UTF16 */
    [[nodiscard]] std::u16string ToUtf16() const;
    /**
     * Returns the UTF16 representation of this string if the platform size of wchar_t is 2,
     * Returns the UTF32 representation of this string if the platform size of wchar_t is 4,
     * this is mostly intended for usage in some APIs that require it like Win32,
     * but it's not safe in a cross-platform environment as the size can be different,
     * e.g. 2 bytes in Win, 4 bytes in Unix, so avoid as much as possible unless you know what you're doing.
     */
    [[nodiscard]] std::wstring ToWide() const;

private:
    /** The data as an unsigned char for bitwise manipulation */
    [[nodiscard]] const unsigned char* GetRawData() const;

private:
    std::string Data;
};