Skip to content

Instantly share code, notes, and snippets.

@fdwr
Last active May 23, 2025 01:38
Show Gist options
  • Select an option

  • Save fdwr/2bc0658372178dd2c7fca2924ac23633 to your computer and use it in GitHub Desktop.

Select an option

Save fdwr/2bc0658372178dd2c7fca2924ac23633 to your computer and use it in GitHub Desktop.
String conversion functions
// Miscellaneous helpers for strings that core C++ is missing.
module;
#include "precomp.h"
#include <stdint.h>
#include <string>
#include <string_view>
#include <span>
#include <optional>
#include <type_traits>
export module StringHelpers;
namespace
{
constexpr static char8_t g_utf8bom[] = {char8_t(0xEF), char8_t(0xBB), char8_t(0xBF)};
constexpr auto g_utf8bomView = std::u8string_view(std::data(g_utf8bom), std::size(g_utf8bom));
}
export namespace StringHelpers
{
// Named constants for a handful of Unicode code points plus the code space limits.
// This is an unscoped enum, so the enumerators are usable directly as integer values.
enum UnicodeCodePoint
{
UnicodeSpace = 0x000020, // U+0020
UnicodeNbsp = 0x0000A0, // U+00A0
UnicodeSoftHyphen = 0x0000AD, // U+00AD
UnicodeEnQuadSpace = 0x002000, // U+2000
UnicodeZeroWidthSpace = 0x00200B, // U+200B
UnicodeDottedCircle = 0x0025CC, // U+25CC
UnicodeIdeographicSpace = 0x003000, // U+3000
UnicodeInlineObject = 0x00FFFC, // for embedded objects
UnicodeReplacementCharacter = 0x00FFFD, // for invalid sequences
UnicodeMax = 0x10FFFF, // Highest code point value.
UnicodeTotal = 0x110000, // UnicodeMax + 1, i.e. the number of code point values.
};
// Cast helpers between the byte-sized character types (char, char8_t, std::byte, unsigned char).
// Each overload only changes the static type via static_cast/reinterpret_cast; no data is copied
// or converted, except that the string_view overloads build a new view over the same memory.

// std::byte -> char.
inline char AsChar(std::byte c) { return static_cast<char>(c); }
inline char* AsChar(std::byte* p) { return reinterpret_cast<char*>(p); }
inline char const* AsChar(std::byte const* p) { return reinterpret_cast<char const*>(p); }
// char8_t -> char.
inline char AsChar(char8_t c) { return static_cast<char>(c); }
inline char* AsChar(char8_t* p) { return reinterpret_cast<char*>(p); }
inline char const* AsChar(char8_t const* p) { return reinterpret_cast<char const*>(p); }
inline char* AsChar(char* p) { return p; } // Identity operation for general templates
inline char const* AsChar(char const* p) { return p; } // Identity operation for general templates
// Pointer-to-pointer forms (arrays of char8_t strings viewed as arrays of char strings).
inline char** AsChar(char8_t** p) { return reinterpret_cast<char**>(p); }
inline char* const* AsChar(char8_t* const* const p) { return reinterpret_cast<char* const* const>(p); }
inline char const* const* AsChar(char8_t const* const* const p) { return reinterpret_cast<char const* const* const>(p); }
// Any byte-sized pointer -> unsigned char*.
inline unsigned char* AsUChar(char* p) { return reinterpret_cast<unsigned char*>(p); }
inline unsigned char* AsUChar(char8_t* p) { return reinterpret_cast<unsigned char*>(p); }
inline unsigned char* AsUChar(std::byte* p) { return reinterpret_cast<unsigned char*>(p); }
// char -> char8_t.
inline char8_t* AsUtf8Char(char* p) { return reinterpret_cast<char8_t*>(p); }
inline char8_t const* AsUtf8Char(char const* p) { return reinterpret_cast<char8_t const*>(p); }
// View conversions: a new view over the same characters, with the same length.
inline std::u8string_view AsUtf8Char(std::string_view s) { return std::u8string_view(reinterpret_cast<char8_t const*>(s.data()), s.size()); }
inline std::string_view AsChar(std::u8string_view s) { return std::string_view(reinterpret_cast<char const*>(s.data()), s.size()); }
// Whole-string reinterpretation: returns a reference to the SAME object under a different type.
// NOTE(review): this assumes std::string and std::u8string have identical object layout, which
// the C++ standard does not guarantee - confirm for each targeted standard library.
inline std::u8string& AsUtf8Char(std::string& s) { return reinterpret_cast<std::u8string&>(s); }
inline std::string& AsChar(std::u8string& s) { return reinterpret_cast<std::string&>(s); }
inline std::u8string const& AsUtf8Char(std::string const& s) { return reinterpret_cast<std::u8string const&>(s); }
inline std::string const& AsChar(std::u8string const& s) { return reinterpret_cast<std::string const&>(s); }
// std::span::subspan has no clamping variant: an out-of-range index or count is undefined
// behavior. This wrapper clamps both so the result always stays within the input span.
//
// a     - the span to slice.
// index - requested start offset; clamped to a.size().
// count - requested element count; clamped to what remains after index.
// Returns the (possibly shorter, possibly empty) subspan.
template <typename T>
std::span<T> ClampedSubspan(std::span<T> a, size_t index, size_t count)
{
    size_t const total = a.size();
    size_t const first = (index < total) ? index : total;
    size_t const available = total - first;
    size_t const length = (count < available) ? count : available;
    return a.subspan(first, length);
}
// Returns true if ch is a UTF-16 surrogate code point (U+D800..U+DFFF).
// The full 32-bit value is compared, so supplementary code points whose low 16 bits
// happen to fall in the surrogate range (e.g. U+1D800) are correctly rejected.
inline bool IsSurrogate(char32_t ch) noexcept
{
    return ch >= 0xD800 && ch <= 0xDFFF;
}
// Returns true if ch is a leading (high) surrogate, U+D800..U+DBFF.
// The full 32-bit value is compared, so supplementary code points such as U+1D800
// are not mistaken for surrogates.
inline bool IsLeadingSurrogate(char32_t ch) noexcept
{
    return ch >= 0xD800 && ch <= 0xDBFF;
}
// Returns true if ch is a trailing (low) surrogate, U+DC00..U+DFFF.
// The full 32-bit value is compared, so supplementary code points such as U+1DC00
// are not mistaken for surrogates.
inline bool IsTrailingSurrogate(char32_t ch) noexcept
{
    return ch >= 0xDC00 && ch <= 0xDFFF;
}
// Returns true if ch lies above the Basic Multilingual Plane (i.e. is greater than U+FFFF).
inline bool IsCharacterBeyondBmp(char32_t ch) noexcept
{
    constexpr char32_t lastBmpCodePoint = 0xFFFF;
    return ch > lastBmpCodePoint;
}
// Combines a surrogate pair into a single code point.
// Only the low 10 bits of each argument are used; the arguments are not validated.
inline char32_t MakeUnicodeCodePoint(char32_t high, char32_t low) noexcept
{
    char32_t const upperTenBits = (high & 0x03FF) << 10;
    char32_t const lowerTenBits = low & 0x03FF;
    return (upperTenBits | lowerTenBits) + 0x10000;
}
// Computes the leading (high) surrogate for a supplementary code point.
// Formula from http://unicode.org/faq/utf_bom.html#35
// The input is not validated; the result is truncated to 16 bits.
inline char16_t GetLeadingSurrogate(char32_t ch)
{
    constexpr char32_t leadOffset = 0xD800 - (0x10000 >> 10);
    return char16_t(leadOffset + (ch >> 10));
}
// Computes the trailing (low) surrogate for a code point: 0xDC00 plus its low 10 bits.
// The input is not validated.
inline char16_t GetTrailingSurrogate(char32_t ch)
{
    char32_t const lowTenBits = ch & 0x3FF;
    return char16_t(0xDC00 + lowTenBits);
}
// Returns true for the ASCII hexadecimal digits 0-9, A-F and a-f.
inline bool IsHexDigit(char32_t ch) noexcept
{
    if (ch >= U'0' && ch <= U'9')
        return true;

    // Clearing bit 5 folds ASCII lower case onto upper case.
    char32_t const folded = ch & ~char32_t(32);
    return folded >= U'A' && folded <= U'F';
}
// Forward reader that decodes UTF-16 code units from a [begin, end) range into code points.
// The reader does not own the memory it walks.
struct Utf16CharacterReader
{
    char16_t const* current_ = nullptr; // Next code unit to read.
    char16_t const* end_ = nullptr;     // One past the final code unit.

    Utf16CharacterReader() = default;

    Utf16CharacterReader(char16_t const* begin, char16_t const* end)
        : current_(begin), end_(end)
    {}

    // Accepts any 16-bit character type (e.g. a 16-bit wchar_t) by reinterpreting the pointers.
    template <typename CharacterType>
    Utf16CharacterReader(CharacterType const* begin, CharacterType const* end)
        requires (sizeof(CharacterType) == sizeof(char16_t))
        : current_(reinterpret_cast<char16_t const*>(begin)),
          end_(reinterpret_cast<char16_t const*>(end))
    {}

    // Accepts any container usable with std::data/std::size whose elements are 16 bits wide.
    template <typename Container>
    Utf16CharacterReader(Container const& c)
        requires (sizeof(*std::data(c)) == sizeof(char16_t))
        : Utf16CharacterReader(std::data(c), std::data(c) + std::size(c))
    {}

    // Number of code units not yet consumed.
    size_t size() const noexcept
    {
        return end_ - current_;
    }

    bool IsAtEnd() const noexcept
    {
        return current_ >= end_;
    }

    // Reads one code point. Returns 0 at the end of input, and U+FFFD for any unpaired
    // surrogate. A code unit that follows a leading surrogate but is not a trailing
    // surrogate is left unconsumed.
    char32_t ReadNext() noexcept
    {
        if (IsAtEnd())
            return 0;

        char32_t const first = *current_;
        ++current_;

        if (IsSurrogate(first))
        {
            bool const formsPair = IsLeadingSurrogate(first)
                                && current_ < end_
                                && IsTrailingSurrogate(*current_);
            if (!formsPair)
                return UnicodeReplacementCharacter; // Illegal unpaired surrogate.

            char32_t const second = *current_;
            ++current_;
            return MakeUnicodeCodePoint(first, second);
        }

        return first; // Fits in the basic multilingual plane.
    }

    // Same as ReadNext, except that an unpaired surrogate is returned as-is
    // instead of being mapped to U+FFFD.
    char32_t ReadNextNoReplacement() noexcept
    {
        if (IsAtEnd())
            return 0;

        char32_t const first = *current_;
        ++current_;

        bool const formsPair = IsLeadingSurrogate(first)
                            && current_ < end_
                            && IsTrailingSurrogate(*current_);
        if (!formsPair)
            return first;

        char32_t const second = *current_;
        ++current_;
        return MakeUnicodeCodePoint(first, second);
    }
};
struct Utf16CharacterWriter
{
char16_t* begin_ = nullptr;
char16_t* current_ = nullptr;
char16_t* end_ = nullptr;
Utf16CharacterWriter() = default;
Utf16CharacterWriter(char16_t* begin, char16_t* end) : begin_(begin), current_(begin), end_(end)
{}
template <typename CharacterType>
inline Utf16CharacterWriter(CharacterType* begin, CharacterType* end)
requires (sizeof(CharacterType) == sizeof(char16_t))
: Utf16CharacterWriter(
reinterpret_cast<char16_t*>(begin),
reinterpret_cast<char16_t*>(begin)
)
{}
template <typename Container>
Utf16CharacterWriter(Container& c)
requires (sizeof(*std::data(c)) == sizeof(char16_t))
: Utf16CharacterWriter(
reinterpret_cast<char16_t*>(std::data(c)),
reinterpret_cast<char16_t*>(std::data(c) + std::size(c))
)
{}
size_t size() const noexcept
{
return current_ - begin_;
}
bool IsAtEnd() const noexcept
{
return current_ >= end_;
}
void WriteNext(char32_t ch) noexcept
{
if (current_ >= end_)
return;
if (IsCharacterBeyondBmp(ch) && end_ - current_ >= 2)
{
// Split into leading and trailing surrogatse.
// From http://unicode.org/faq/utf_bom.html#35
current_[0] = char16_t(GetLeadingSurrogate(ch));
current_[1] = char16_t(GetTrailingSurrogate(ch));
current_ += 2;
}
else
{
// A BMP character (or isolated surrogate)
current_[0] = char16_t(ch);
++current_;
}
}
};
// For iterating directly over the characters.
// - Avoid deprecated std::wstring_convert<std::codecvt_utf8_utf16
// - OS-specific MultibyteToWideChar and allocating intermediate buffers.
// TODO: Compare with https://github.com/simdutf/simdutf/blob/master/src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h.
//
// Forward reader that decodes UTF-8 code units from a [begin, end) range into code points.
// The reader does not own the memory it walks.
struct Utf8CharacterReader
{
    char8_t const* current_ = nullptr; // Next code unit to read.
    char8_t const* end_ = nullptr;     // One past the final code unit.

    Utf8CharacterReader() = default;

    Utf8CharacterReader(char8_t const* begin, char8_t const* end) : current_(begin), end_(end)
    {}

    // Accepts any 8-bit character type (char, unsigned char, std::byte...) by reinterpreting the pointers.
    template <typename CharacterType>
    Utf8CharacterReader(CharacterType const* begin, CharacterType const* end)
        requires (sizeof(CharacterType) == sizeof(char8_t))
        : Utf8CharacterReader(
            reinterpret_cast<char8_t const*>(begin),
            reinterpret_cast<char8_t const*>(end)
          )
    {}

    // Accepts any container usable with std::data/std::size whose elements are 8 bits wide.
    template <typename Container>
    Utf8CharacterReader(Container const& c)
        requires (sizeof(*std::data(c)) == sizeof(char8_t))
        : Utf8CharacterReader(
            reinterpret_cast<char8_t const*>(std::data(c)),
            reinterpret_cast<char8_t const*>(std::data(c) + std::size(c))
          )
    {}

    // Number of code units not yet consumed.
    size_t size() const noexcept
    {
        return end_ - current_;
    }

    bool IsAtEnd() const noexcept
    {
        return current_ >= end_;
    }

    // Reads one code point. Returns 0 at the end of input and U+FFFD when:
    // - the lead byte is a stray continuation byte (10xxxxxx),
    // - the lead byte announces more than 3 continuation bytes,
    // - a continuation byte is missing, either because a different byte follows
    //   (that byte is left unconsumed) or because the input ends mid-sequence.
    // Overlong encodings, encoded surrogates and values above U+10FFFF are NOT rejected.
    char32_t ReadNext() noexcept
    {
        // TODO: Test overlong sequences.
        if (current_ >= end_)
            return 0;

        char32_t codeUnit = *current_++;
        if (codeUnit <= 0b0111'1111) // 0xxxxxxx 0-127
            return codeUnit;

        // The following byte values should never occur: 0xC0, 0xC1, 0xF5-0xFF
        if (codeUnit < 0b1100'0000) // Any 10xxxxxx patterns are illegal.
            return UnicodeReplacementCharacter;

        // Count the leading 1 bits after the first one (each announces a continuation byte),
        // clearing them so only the payload bits of the lead byte remain.
        uint32_t continuationMask = 0b0100'0000;
        uint32_t continuationByteCount = 0;
        while (codeUnit & continuationMask)
        {
            codeUnit ^= continuationMask;
            continuationMask >>= 1;
            ++continuationByteCount;
            if (continuationByteCount > 3)
                return UnicodeReplacementCharacter;
        }

        char32_t codePoint = codeUnit & 0b0011'1111;
        for (; continuationByteCount > 0; --continuationByteCount)
        {
            // The input may end in the middle of a sequence; never read past end_.
            if (current_ >= end_)
                return UnicodeReplacementCharacter;

            codeUnit = *current_;
            if ((codeUnit & 0b1100'0000) != 0b1000'0000) // Expect 10xxxxxx pattern.
                return UnicodeReplacementCharacter; // Expected continuation byte.

            ++current_;
            // Combine next code unit lowest 6 bits with existing bits.
            codePoint = (codePoint << 6) | (codeUnit & 0b0011'1111);
        }
        return codePoint;
    }

    // Skip the byte order mark, if present.
    void SkipBom() noexcept
    {
        if (size() >= sizeof(g_utf8bom) && memcmp(current_, g_utf8bom, sizeof(g_utf8bom)) == 0)
        {
            current_ += sizeof(g_utf8bom);
        }
    }
};
// Writes a single code point out to a memory region of char8's.
// Does NOT throw if an invalid character is passed - just writes replacement.
struct Utf8CharacterWriter
{
char8_t* begin_ = nullptr;
char8_t* current_ = nullptr;
char8_t* end_ = nullptr;
Utf8CharacterWriter() = default;
Utf8CharacterWriter(char8_t* begin, char8_t* end) : begin_(begin), current_(begin), end_(end)
{}
template <typename CharacterType>
Utf8CharacterWriter(CharacterType* begin, CharacterType* end)
requires (sizeof(CharacterType) == sizeof(char8_t))
: Utf8CharacterWriter(reinterpret_cast<char8_t*>(begin), reinterpret_cast<char8_t*>(end))
{}
template <typename Container>
Utf8CharacterWriter(Container& c)
requires (sizeof(*std::data(c)) == sizeof(char8_t))
: Utf8CharacterWriter(
reinterpret_cast<char8_t*>(std::data(c)),
reinterpret_cast<char8_t*>(std::data(c) + std::size(c))
)
{}
size_t size() const noexcept
{
return current_ - begin_;
}
bool IsAtEnd() const noexcept
{
return current_ >= end_;
}
void WriteNext(char32_t codePoint) noexcept
{
// Consider Duff's device like approach: https://github.com/Alexhuszagh/UTFPP/blob/bd99a5e4f3fbfb3bc86c1d7af5cf5edf2f00e1a7/utf.hpp#L115
if (current_ >= end_)
return;
if (codePoint < 0x80)
{
*current_++ = static_cast<char8_t>(codePoint);
}
else if (codePoint < 0x0800)
{
*current_++ = static_cast<char8_t>((codePoint >> 6) | 0xC0);
if (current_ < end_) *current_++ = static_cast<char8_t>((codePoint & 0x3F) | 0x80);
}
else if (codePoint < 0x10000)
{
*current_++ = static_cast<char8_t>((codePoint >> 12) | 0xE0);
if (current_ < end_) *current_++ = static_cast<char8_t>(((codePoint >> 6) & 0x3F) | 0x80);
if (current_ < end_) *current_++ = static_cast<char8_t>((codePoint & 0x3F) | 0x80);
}
else
{
*current_++ = static_cast<char8_t>((codePoint >> 18) | 0xF0);
if (current_ < end_) *current_++ = static_cast<char8_t>(((codePoint >> 12) & 0x3F) | 0x80);
if (current_ < end_) *current_++ = static_cast<char8_t>(((codePoint >> 6) & 0x3F) | 0x80);
if (current_ < end_) *current_++ = static_cast<char8_t>((codePoint & 0x3F) | 0x80);
}
}
};
// Forward reader over UTF-32 code units in a [begin, end) range. Each code unit is returned
// unchanged as one code point. The reader does not own the memory it walks.
struct Utf32CharacterReader
{
    char32_t const* current_ = nullptr; // Next code unit to read.
    char32_t const* end_ = nullptr;     // One past the final code unit.

    Utf32CharacterReader() = default;

    Utf32CharacterReader(char32_t const* begin, char32_t const* end)
        : current_(begin), end_(end)
    {}

    // Accepts any 32-bit character type by reinterpreting the pointers.
    template <typename CharacterType>
    Utf32CharacterReader(CharacterType const* begin, CharacterType const* end)
        requires (sizeof(CharacterType) == sizeof(char32_t))
        : Utf32CharacterReader(
            reinterpret_cast<char32_t const*>(begin),
            reinterpret_cast<char32_t const*>(end)
          )
    {}

    // Accepts any container usable with std::data/std::size whose elements are 32 bits wide.
    template <typename Container>
    Utf32CharacterReader(Container const& c)
        requires (sizeof(*std::data(c)) == sizeof(char32_t))
        : Utf32CharacterReader(
            reinterpret_cast<char32_t const*>(std::data(c)),
            reinterpret_cast<char32_t const*>(std::data(c) + std::size(c))
          )
    {}

    // Number of code units not yet consumed.
    size_t size() const noexcept
    {
        return end_ - current_;
    }

    bool IsAtEnd() const noexcept
    {
        return current_ >= end_;
    }

    // Returns the next code unit and advances, or returns 0 when the input is exhausted.
    char32_t ReadNext() noexcept
    {
        if (IsAtEnd())
            return 0;

        char32_t const codePoint = *current_;
        ++current_;
        return codePoint;
    }
};
// Forward writer that stores code points as UTF-32 code units into a caller-supplied
// [begin, end) buffer. The writer does not own the buffer and never writes past end_.
struct Utf32CharacterWriter
{
    char32_t* begin_ = nullptr;   // Start of the output buffer.
    char32_t* current_ = nullptr; // Next slot to write.
    char32_t* end_ = nullptr;     // One past the last writable slot.

    Utf32CharacterWriter() = default;

    Utf32CharacterWriter(char32_t* begin, char32_t* end)
        : begin_(begin), current_(begin), end_(end)
    {}

    // Accepts any 32-bit character type by reinterpreting the pointers.
    template <typename CharacterType>
    Utf32CharacterWriter(CharacterType* begin, CharacterType* end)
        requires (sizeof(CharacterType) == sizeof(char32_t))
        : Utf32CharacterWriter(
            reinterpret_cast<char32_t*>(begin),
            reinterpret_cast<char32_t*>(end)
          )
    {}

    // Accepts any container usable with std::data/std::size whose elements are 32 bits wide.
    template <typename Container>
    Utf32CharacterWriter(Container& c)
        requires (sizeof(*std::data(c)) == sizeof(char32_t))
        : Utf32CharacterWriter(
            reinterpret_cast<char32_t*>(std::data(c)),
            reinterpret_cast<char32_t*>(std::data(c) + std::size(c))
          )
    {}

    // Number of code units written so far.
    size_t size() const noexcept
    {
        return current_ - begin_;
    }

    bool IsAtEnd() const noexcept
    {
        return current_ >= end_;
    }

    // Stores one code point and advances; does nothing if the buffer is already full.
    void WriteNext(char32_t codePoint) noexcept
    {
        if (current_ >= end_)
            return;

        // Store the value; merely advancing would leave the output slot untouched.
        *current_++ = codePoint;
    }
};
// Returns the view with a leading UTF-8 byte order mark removed, or the view unchanged
// when it does not start with one.
std::u8string_view StripUtf8Bom(std::u8string_view s)
{
    return s.starts_with(g_utf8bomView) ? s.substr(g_utf8bomView.size()) : s;
}
// Splits text into lines at CR, LF, or CR LF line breaks and returns one view per line.
// - The line break characters are not included in the returned views.
// - A final line without a trailing line break is returned too.
// - A line break at the very end of the text does not add an extra empty line.
// - Paragraph separator and line separator characters are not treated as breaks.
// The returned views refer to the storage of 'text', which must outlive them.
template<
    typename InputContainer,
    typename OutputView = std::u8string_view, // Could be a std::string or std::span too or any type that accepts two iterators.
    typename OutputContainer = std::vector<OutputView>
>
requires requires(InputContainer i, OutputContainer o, OutputView v) {
    i.begin(); // Must have iterators.
    i.end();
    OutputView(i.begin(), i.end()); // Must be constructible from iterator pair.
    o.push_back(OutputView{}); // Must be push_back'able.
}
auto SplitLines(InputContainer& text) -> OutputContainer
{
    OutputContainer result;
    auto const textEnd = text.end();
    auto lineBegin = text.begin();

    while (lineBegin != textEnd)
    {
        // Find the end of the current line: either a line break or the end of the text.
        auto lineEnd = lineBegin;
        while (lineEnd != textEnd && *lineEnd != '\r' && *lineEnd != '\n')
        {
            ++lineEnd;
        }
        result.push_back(OutputView(lineBegin, lineEnd));

        if (lineEnd == textEnd)
        {
            break; // Final line had no terminator.
        }

        // Step over the line break, treating a CR LF pair as a single break.
        auto next = lineEnd;
        auto const lineBreak = *next;
        ++next;
        if (lineBreak == '\r' && next != textEnd && *next == '\n')
        {
            ++next; // Skip the line feed.
        }
        lineBegin = next; // Next line.
    }
    return result;
}
template <typename StringViewType = std::u8string_view>
class SplitEnumerator
{
StringViewType view_ = 0;
char32_t splitCodeUnit_ = 0; // A single code unit to split upon, like ",".
bool hasMore_ = true;
public:
SplitEnumerator(StringViewType view, char32_t splitCodeUnit) noexcept
: splitCodeUnit_(splitCodeUnit),
view_(view)
{}
bool HasMore() const noexcept { return hasMore_; }
StringViewType Read() noexcept
{
using C = decltype(*StringViewType().data());
auto nextSplit = std::find(view_.begin(), view_.end(), C(splitCodeUnit_));
auto token = StringViewType(view_.begin(), nextSplit);
if (nextSplit == view_.end())
{
hasMore_ = false;
}
else
{
++nextSplit; // Skip the split code unit.
}
view_ = StringViewType(nextSplit, view_.end());
return token;
}
};
// Fills the entire buffer up to fixed size, including leading zeroes.
template <typename CharacterType>
void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<CharacterType> text)
{
minimal_span<CharacterType> currentText(text);
// Convert character to digits.
while (!currentText.empty())
{
CharacterType digit = value & 0xF;
digit += (digit >= 10) ? 'A' - 10 : '0';
currentText.back() = digit;
currentText.pop_back();
value >>= 4;
}
}
void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<char8_t> text) { return WriteZeroPaddedHexNum<char8_t>(value, /*out*/ text); }
void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<char16_t> text) { return WriteZeroPaddedHexNum<char16_t>(value, /*out*/ text); }
void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<char32_t> text) { return WriteZeroPaddedHexNum<char32_t>(value, /*out*/ text); }
// Parses an unsigned number in the given base from the front of 'text'.
// 'text' is updated to the end of all characters read.
// TODO: Consider using std::from_chars instead now that it exists.
//
// Sadly, both wcstoul and std::stoul are unsuitable here because:
// (1) wcstoul doesn't respect any boundaries and tries to parse beyond the code sequence
//     (e.g. \x12345 should be treated as {0x1234, '5'}, not as {0x12345})
// (2) std::stoul throws an exception on parse error, which is overkill for the user
//     interactively typing in a number.
// (3) std::stoul requires a std::string as input, which limits its utility.
// Additionally, some uses such as escapement conversion don't want whitespace skipped.
//
// - Digits are 0-9 followed by A-Z (either case), so bases 2 through 36 are supported.
// - Any character outside the radix stops the read. So 123A4G would stop at 'A' for decimal,
//   but it would continue until 'G' for hexadecimal.
// - An empty string returns 0.
// - No sign or whitespace is accepted, and overflow is not detected (the value wraps).
// - The caller doesn't receive a flag, but it can easily detect missing digits or whether
//   the entire number was read by checking the updated span.
template <typename CharacterType>
uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<CharacterType const>& text, _In_range_(2, 36) uint32_t base)
{
    uint32_t value = 0;
    size_t consumed = 0;

    for (; consumed < text.size(); ++consumed)
    {
        uint32_t const codeUnit = text[consumed];
        uint32_t digit;

        if (codeUnit >= U'0' && codeUnit <= U'9') // Handle 0..9.
        {
            digit = codeUnit - U'0';
        }
        else // Handle A..Z and a..z.
        {
            // Fold to upper case BEFORE range checking. Folding after subtracting '0'
            // corrupts the letters P..Z, which breaks bases above 25.
            uint32_t const upper = codeUnit & ~uint32_t(32);
            if (upper < U'A' || upper > U'Z')
                break;
            digit = upper - U'A' + 10;
        }

        if (digit >= base)
            break;

        value = value * base + digit;
    }

    text = text.subspan(consumed);
    return value;
}
uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<char8_t const>& text, _In_range_(2, 36) uint32_t base) { return ReadUnsignedNumericValue<char8_t>(/*inout*/ text, base); }
uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<char16_t const>& text, _In_range_(2, 36) uint32_t base) { return ReadUnsignedNumericValue<char16_t>(/*inout*/ text, base); }
uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<char32_t const>& text, _In_range_(2, 36) uint32_t base) { return ReadUnsignedNumericValue<char32_t>(/*inout*/ text, base); }
// Expands C++ style backslash escapes in UTF-16 text.
//
// escapedText  - input text that may contain escapes.
// expandedText - cleared, then filled with the expanded text.
//
// Recognized escapes:
// - Single character escapes: \a \b \f \n \r \t \v \\ \' \" \?
// - \xXXXX and \uXXXX (exactly 4 hex digits) and \UXXXXXXXX (exactly 8 hex digits).
//   Values above U+FFFF are appended as a surrogate pair.
// Anything else (unknown escape, too few digits, non-hex digit, octal) keeps the backslash
// and leaves the following characters to be copied as ordinary text.
// A backslash that is the very last code unit is copied through unchanged.
// NOTE(review): \x is treated as fixed 4-digit here, unlike C++'s variable-length \x - confirm intended.
// NOTE(review): the parsed value is not range checked, so \U values above 10FFFF are passed
// to the surrogate helpers as-is - confirm whether that needs handling.
void UnescapeCppUniversalCharacterNames(
std::span<char16_t const> escapedText,
/*out*/ std::u16string& expandedText
)
{
minimal_span<char16_t const> currentEscapedText(escapedText);
expandedText.clear();
// Reserve the input length up front.
expandedText.reserve(currentEscapedText.size());
while (!currentEscapedText.empty())
{
char16_t ch = currentEscapedText.consume_front();
// Check escape codes.
if (ch == '\\' && !currentEscapedText.empty())
{
// Default to emitting the backslash itself; the cases below override this on a successful match.
char32_t replacement = L'\\';
// Peek at the escape code without consuming it; it is only consumed on a match.
char16_t code = currentEscapedText.front();
switch (code)
{
case 'a': replacement = 0x0007; currentEscapedText.pop_front(); break; // Alert (Beep, Bell)
case 'b': replacement = 0x0008; currentEscapedText.pop_front(); break; // Backspace
case 'f': replacement = 0x000C; currentEscapedText.pop_front(); break; // Formfeed
case 'n': replacement = 0x000A; currentEscapedText.pop_front(); break; // Newline (Line Feed)
case 'r': replacement = 0x000D; currentEscapedText.pop_front(); break; // Carriage Return
case 't': replacement = 0x0009; currentEscapedText.pop_front(); break; // Horizontal Tab
case 'v': replacement = 0x000B; currentEscapedText.pop_front(); break; // Vertical Tab
case '\\': replacement = 0x005C; currentEscapedText.pop_front(); break; // Backslash
case '\'': replacement = 0x0027; currentEscapedText.pop_front(); break; // Single quotation mark
case '\"': replacement = 0x0022; currentEscapedText.pop_front(); break; // Double quotation mark
case '?': replacement = 0x003F; currentEscapedText.pop_front(); break; // Question mark
case L'x':
case L'u':
case L'U':
{
// Only 'U' takes 8 digits; 'x' and 'u' take 4.
size_t expectedHexSequenceLength = (code == 'U') ? 8 : 4;
char16_t const* escapeStart = currentEscapedText.data() + 1; // Skip the 'x' 'u' 'U'
// Clamp the digit range to the end of the input.
char16_t const* escapeEnd = std::min(escapeStart + expectedHexSequenceLength, currentEscapedText.data_end());
std::span<char16_t const> digitSpan = {escapeStart, escapeEnd};
// Parse the number.
if (digitSpan.size() >= expectedHexSequenceLength)
{
char32_t hexValue = ReadUnsignedNumericValue(/*inout*/ digitSpan, 16);
if (digitSpan.empty()) // Completely read the sequence.
{
replacement = hexValue;
// Continue after the last digit consumed.
currentEscapedText.reset(digitSpan.data(), currentEscapedText.end());
}
}
// Else parse error. So keep '\' to preserve original text.
}
break;
// Anything else yields a '\', preserving the original text.
// Silly octal is not supported.
}
// Append the replacement, splitting values above U+FFFF into a surrogate pair.
if (IsCharacterBeyondBmp(replacement))
{
expandedText.push_back(GetLeadingSurrogate(replacement));
expandedText.push_back(GetTrailingSurrogate(replacement));
}
else
{
expandedText.push_back(char16_t(replacement));
}
}
else // Just append ordinary code unit.
{
expandedText.push_back(ch);
}
}
}
// Rewrites UTF-16 text so that EVERY code point becomes a C++ universal character name:
// \uXXXX for code points up to U+FFFF and \UXXXXXXXX for code points above that.
//
// text        - input UTF-16 text.
// escapedText - cleared, then filled with the escaped text.
void EscapeCppUniversalCharacterNames(
    std::span<char16_t const> text,
    /*out*/ std::u16string& escapedText
    )
{
    constexpr size_t escapePrefixLength = 2; // \u or \U
    constexpr size_t shortEscapeDigitLength = 4;
    constexpr size_t longEscapeDigitLength = 8;

    // Templates whose digit portions are overwritten in place for each code point.
    char16_t shortEscapedSequence[6] = {'\\','u','0','0','0','0'};
    char16_t longEscapedSequence[10] = {'\\','U','0','0','0','0','0','0','0','0'};
    std::span<char16_t> shortDigitRange(shortEscapedSequence + escapePrefixLength, shortEscapeDigitLength);
    std::span<char16_t> longDigitRange(longEscapedSequence + escapePrefixLength, longEscapeDigitLength);

    escapedText.clear();
    escapedText.reserve(text.size() * std::size(shortEscapedSequence));

    Utf16CharacterReader reader(text);
    while (!reader.IsAtEnd())
    {
        char32_t const codePoint = reader.ReadNext();
        if (IsCharacterBeyondBmp(codePoint))
        {
            // Came from a surrogate pair: needs the 8-digit form.
            WriteZeroPaddedHexNum(codePoint, /*out*/ longDigitRange);
            escapedText.append(longEscapedSequence, std::size(longEscapedSequence));
        }
        else
        {
            // Single UTF-16 code unit: the 4-digit form suffices.
            WriteZeroPaddedHexNum(codePoint, /*out*/ shortDigitRange);
            escapedText.append(shortEscapedSequence, std::size(shortEscapedSequence));
        }
    }
}
// Rewrites UTF-16 text so that EVERY code point becomes an HTML hexadecimal numeric character
// reference: &#xXXXX; for code points up to U+FFFF and &#xXXXXXXXX; for code points above that.
// (Despite the function name, no named references such as &amp; are produced.)
//
// text        - input UTF-16 text.
// escapedText - cleared, then filled with the escaped text.
void EscapeHtmlNamedCharacterReferences(
    std::span<char16_t const> text,
    /*out*/ std::u16string& escapedText
    )
{
    constexpr size_t escapePrefixLength = 3; // '&#x'
    constexpr size_t shortEscapeDigitLength = 4;
    constexpr size_t longEscapeDigitLength = 8;
    constexpr size_t escapeSuffixLength = 1; // ;

    // Templates whose digit portions are overwritten in place for each code point.
    char16_t shortEscapedSequence[8] = {'&','#','x','0','0','0','0',';'};
    char16_t longEscapedSequence[12] = {'&','#','x','0','0','0','0','0','0','0','0',';'};
    std::span<char16_t> shortDigitRange(shortEscapedSequence + escapePrefixLength, shortEscapeDigitLength);
    std::span<char16_t> longDigitRange(longEscapedSequence + escapePrefixLength, longEscapeDigitLength);

    escapedText.clear();
    escapedText.reserve(text.size() * std::size(shortEscapedSequence));

    Utf16CharacterReader reader(text);
    while (!reader.IsAtEnd())
    {
        char32_t const codePoint = reader.ReadNext();
        if (IsCharacterBeyondBmp(codePoint))
        {
            // Came from a surrogate pair: needs the 8-digit form.
            WriteZeroPaddedHexNum(codePoint, /*out*/ longDigitRange);
            escapedText.append(longEscapedSequence, std::size(longEscapedSequence));
        }
        else
        {
            // Single UTF-16 code unit: the 4-digit form suffices.
            WriteZeroPaddedHexNum(codePoint, /*out*/ shortDigitRange);
            escapedText.append(shortEscapedSequence, std::size(shortEscapedSequence));
        }
    }
}
// Expands HTML numeric character references in UTF-16 text.
//
// escapedText  - input text that may contain references.
// expandedText - cleared, then filled with the expanded text.
//
// Only numeric references are supported: decimal (&#1234;) and hexadecimal (&#x1A2B;).
// Named references (&amp;) are NOT expanded, despite the function name.
// A reference is expanded only if it has at least one digit and a terminating semicolon;
// otherwise the '&' is kept and the following characters are copied as ordinary text,
// preserving the original input. Values above U+FFFF are appended as a surrogate pair.
void UnescapeHtmlNamedCharacterReferences(std::span<char16_t const> escapedText, /*out*/ std::u16string& expandedText)
{
    expandedText.clear();
    expandedText.reserve(escapedText.size());

    char16_t const* current = escapedText.data();
    char16_t const* const textEnd = current + escapedText.size();

    while (current < textEnd)
    {
        char16_t const ch = *current++;

        // Ordinary code unit, or an '&' with nothing after it.
        if (ch != '&' || current >= textEnd)
        {
            expandedText.push_back(ch);
            continue;
        }

        // Unless a complete numeric reference follows, emit the '&' unchanged.
        char32_t replacement = U'&';

        if (*current == '#')
        {
            char16_t const* digitsBegin = current + 1;
            uint32_t radix = 10; // Assume decimal, unless 'x' follows.
            if (digitsBegin < textEnd && *digitsBegin == 'x')
            {
                radix = 16; // Hexadecimal.
                ++digitsBegin;
            }

            std::span<char16_t const> digitSpan(digitsBegin, textEnd);
            uint32_t const value = ReadUnsignedNumericValue(/*inout*/ digitSpan, radix);

            // Successful if at least one digit was consumed and a semicolon follows.
            bool const hasDigits = digitSpan.data() > digitsBegin;
            if (hasDigits && !digitSpan.empty() && digitSpan.front() == ';')
            {
                replacement = value;
                current = digitSpan.data() + 1; // After the semicolon.
            }
            // Else parse error: keep '&' and let "#..." be copied through as ordinary text.
        }

        if (IsCharacterBeyondBmp(replacement))
        {
            expandedText.push_back(GetLeadingSurrogate(replacement));
            expandedText.push_back(GetTrailingSurrogate(replacement));
        }
        else
        {
            expandedText.push_back(char16_t(replacement));
        }
    }
}
// Escapes characters that are not allowed in file names as the escape character followed
// by two upper case hex digits (e.g. hello/world -> hello%2Fworld).
//
// text            - input UTF-8 text.
// escapedText     - cleared, then filled with the escaped text.
// escapeCharacter - the character that introduces an escape.
//
// Escaped characters: * ? / | \ : < > " and the escape character itself. Escaping the
// escape character keeps the encoding reversible: a literal "%2F" in the input becomes
// "%252F" rather than being indistinguishable from an escaped '/'.
void EscapeFilenameCharactersPercentEncoding(
    std::span<char8_t const> text,
    /*out*/ std::u8string& escapedText,
    char8_t escapeCharacter = '%' // $ might be another useful option, for Javascript variable names.
    )
{
    escapedText.clear();
    escapedText.reserve(text.size());

    constexpr size_t escapePrefixLength = 1; // '%'
    constexpr size_t shortEscapeDigitLength = 2;

    // Template whose two digits are overwritten in place for each escaped character.
    char8_t shortEscapedSequence[3] = {escapeCharacter,'0','0'};
    std::span<char8_t> shortDigitRange(shortEscapedSequence + escapePrefixLength, shortEscapedSequence + escapePrefixLength + shortEscapeDigitLength);

    for (char8_t ch : text)
    {
        bool mustEscape = (ch == escapeCharacter);
        switch (ch)
        {
        case '*':
        case '?':
        case '/':
        case '|':
        case '\\':
        case ':':
        case '<':
        case '>':
        case '"':
            mustEscape = true;
            break;

        default:
            break;
        }

        if (mustEscape)
        {
            WriteZeroPaddedHexNum(ch, /*out*/ shortDigitRange);
            escapedText.append(std::begin(shortEscapedSequence), std::end(shortEscapedSequence));
        }
        else
        {
            escapedText.push_back(ch);
        }
    }
}
// Reverses percent-style escaping: the escape character followed by exactly two hex digits
// is replaced by the code unit with that value (e.g. hello%2Fworld -> hello/world).
//
// escapedText     - input UTF-8 text that may contain escapes.
// expandedText    - cleared, then filled with the expanded text.
// escapeCharacter - the character that introduces an escape.
//
// An escape character that is not followed by two hex digits is copied through unchanged.
void UnescapeFilenameCharactersPercentEncoding(
    std::span<char8_t const> escapedText,
    /*out*/ std::u8string& expandedText,
    char8_t escapeCharacter = '%' // $ might be another useful option, for Javascript variable names.
    )
{
    expandedText.clear();
    expandedText.reserve(escapedText.size());

    constexpr size_t shortEscapeDigitLength = 2;

    size_t index = 0;
    size_t const count = escapedText.size();
    while (index < count)
    {
        char8_t ch = escapedText[index++];

        // Read the following two digit hex code (e.g. hello%2Fworld -> hello/world).
        if (ch == escapeCharacter)
        {
            // 'index' already points just past the escape character, i.e. at the first digit.
            std::span<char8_t const> digitSpan = ClampedSubspan(escapedText, index, shortEscapeDigitLength);
            if (digitSpan.size() == shortEscapeDigitLength)
            {
                uint32_t const value = ReadUnsignedNumericValue(/*inout*/ digitSpan, /*radix*/ 16);
                if (digitSpan.empty()) // Both characters were valid hex digits.
                {
                    ch = char8_t(value);
                    index += shortEscapeDigitLength;
                }
            }
        }
        expandedText.push_back(ch);
    }
}
// Converts UTF-16 to UTF-32 into a fixed size buffer, substituting the replacement character
// (U+FFFD) for unpaired surrogates. Stops when either the input is exhausted or the output
// buffer is full.
//
// utf16text   - input code units.
// utf32text   - output buffer.
// sourceCount - optional; receives the number of UTF-16 code units consumed.
// Returns the number of UTF-32 code units written.
_Out_range_(0, utf32text.end_ - utf32text.begin_)
size_t ConvertTextUtf16ToUtf32(
    std::span<char16_t const> utf16text,
    /*out*/ std::span<char32_t> utf32text,
    _Out_opt_ size_t* sourceCount
    ) noexcept
{
    Utf16CharacterReader reader(utf16text);

    size_t const utf32count = utf32text.size();
    size_t utf32index = 0;
    for (; !reader.IsAtEnd() && utf32index < utf32count; ++utf32index)
    {
        utf32text[utf32index] = reader.ReadNext();
    }

    // Report how many UTF-16 code units were read. reader.size() is what REMAINS unread,
    // so subtract it from the total. There may be more UTF-16 code units consumed than
    // UTF-32 units written, but never the other way around.
    if (sourceCount != nullptr)
        *sourceCount = utf16text.size() - reader.size();

    return utf32index;
}
// Converts UTF-16 to UTF-32 into a fixed size buffer, passing unpaired surrogates through
// unchanged (no U+FFFD substitution). Stops when either the input is exhausted or the
// output buffer is full.
//
// utf16text   - input code units.
// utf32text   - output buffer.
// sourceCount - optional; receives the number of UTF-16 code units consumed.
// Returns the number of UTF-32 code units written.
_Out_range_(0, utf32text.end_ - utf32text.begin_)
size_t ConvertTextUtf16ToUtf32NoReplacement(
    std::span<char16_t const> utf16text,
    /*out*/ std::span<char32_t> utf32text,
    _Out_opt_ size_t* sourceCount
    ) noexcept
{
    // Can have more UTF16 characters than UTF32,
    // but never the other way around.
    Utf16CharacterReader reader(utf16text);

    size_t const utf32count = utf32text.size();
    size_t utf32index = 0;
    for (; !reader.IsAtEnd() && utf32index < utf32count; ++utf32index)
    {
        utf32text[utf32index] = reader.ReadNextNoReplacement();
    }

    // reader.size() is what REMAINS unread, so subtract it from the total to get the
    // number of code units consumed.
    if (sourceCount != nullptr)
        *sourceCount = utf16text.size() - reader.size();

    return utf32index;
}
// Converts UTF-32 to UTF-16 into a fixed size buffer. Stops when either the input is
// exhausted or the output buffer is full.
//
// utf32text   - input code points.
// utf16text   - output buffer.
// sourceCount - optional; receives the number of UTF-32 code units passed to the writer.
// Returns the number of UTF-16 code units written.
_Out_range_(0, utf16text.end_ - utf16text.begin_)
size_t ConvertTextUtf32ToUtf16(
    std::span<char32_t const> utf32text,
    /*out*/ std::span<char16_t> utf16text,
    _Out_opt_ size_t* sourceCount
    ) noexcept
{
    Utf16CharacterWriter writer(utf16text);

    size_t consumed = 0;
    size_t const available = utf32text.size();
    while (consumed < available && !writer.IsAtEnd())
    {
        writer.WriteNext(utf32text[consumed]);
        ++consumed;
    }

    if (sourceCount)
    {
        *sourceCount = consumed;
    }
    return writer.size();
}
// Converts UTF-32 to UTF-8 into a fixed size buffer. Stops when either the input is
// exhausted or the output buffer is full.
//
// utf32text   - input code points.
// utf8text    - output buffer.
// sourceCount - optional; receives the number of UTF-32 code units passed to the writer.
// Returns the number of UTF-8 code units written.
// NOTE(review): the range annotation below names 'utf16text', which is not a parameter of
// this function - looks copied from the UTF-16 variant; confirm and update.
_Out_range_(0, utf16text.end_ - utf16text.begin_)
size_t ConvertTextUtf32ToUtf8(
    std::span<char32_t const> utf32text,
    /*out*/ std::span<char8_t> utf8text,
    _Out_opt_ size_t* sourceCount
    ) noexcept
{
    Utf8CharacterWriter writer(utf8text);

    size_t consumed = 0;
    size_t const available = utf32text.size();
    while (consumed < available && !writer.IsAtEnd())
    {
        writer.WriteNext(utf32text[consumed]);
        ++consumed;
    }

    if (sourceCount)
    {
        *sourceCount = consumed;
    }
    return writer.size();
}
// Converts UTF-8 text to a UTF-16 string, skipping a leading byte order mark if present.
//
// utf8text  - input code units.
// utf16text - receives the converted text; previous contents are replaced.
//
// The only operations here that may throw are the resizes of utf16text.
void ConvertTextUtf8ToUtf16(
    std::span<char8_t const> utf8text,
    /*out*/ std::u16string& utf16text
    )
{
    Utf8CharacterReader reader(utf8text);
    reader.SkipBom();

    // Size the output for the worst case of one UTF-16 code unit per UTF-8 code unit:
    // UTF-16 (1-2 code units) never needs more code units than UTF-8 (1-4 code units).
    utf16text.resize(reader.size());
    Utf16CharacterWriter writer(utf16text);

    for (; !reader.IsAtEnd(); )
    {
        assert(!writer.IsAtEnd());
        char32_t const codePoint = reader.ReadNext();
        writer.WriteNext(codePoint);
    }

    // Shrink back to what was actually written.
    utf16text.resize(writer.size());
}
// Converts UTF-8 text to a UTF-32 string, skipping a leading byte order mark if present.
//
// utf8text  - input text.
// utf32text - receives the converted text; previous contents are replaced.
//
// The only operations here that may throw are the resizes of utf32text.
void ConvertTextUtf8ToUtf32(
    std::u8string_view utf8text,
    /*out*/ std::u32string& utf32text
    )
{
    Utf8CharacterReader reader(utf8text);
    reader.SkipBom();

    // Size the output for the worst case of one UTF-32 code unit per UTF-8 code unit:
    // UTF-32 (1 code unit) never needs more code units than UTF-8 (1-4 code units).
    utf32text.resize(reader.size());
    Utf32CharacterWriter writer(utf32text);

    for (; !reader.IsAtEnd(); )
    {
        assert(!writer.IsAtEnd());
        char32_t const codePoint = reader.ReadNext();
        writer.WriteNext(codePoint);
    }

    // Shrink back to what was actually written.
    utf32text.resize(writer.size());
}
// Converts UTF-16 text to a UTF-8 string.
//
// utf16text - input code units.
// utf8text  - receives the converted text; previous contents are replaced.
void ConvertTextUtf16ToUtf8(
    std::span<char16_t const> utf16text,
    /*out*/ std::u8string& utf8text
    )
{
    // C++ deprecated codecvt_utf8_utf16 but offered no successor in its place.
    //
    // std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> g_converterToUtf8;
    // std::string temporary = g_converterToUtf8.to_bytes(source.data(), source.data() + source.size());
    //
    // So implement it directly.
    // Allow triple the space for the output, as one UTF-16 code unit can't yield
    // more than three UTF-8 code units:
    //
    // Code point                   UTF-16   UTF-8   Factor
    //                              code unit count
    // U+0000  through U+007F       1        1       1x
    // U+0080  through U+07FF       1        2       2x
    // U+0800  through U+FFFF       1        3       3x
    // U+10000 through U+10FFFF     2        4       2x
    //
    // (An unpaired surrogate is read as U+FFFD, which is also 1 -> 3.)
    utf8text.resize(utf16text.size() * 3); // Preallocate up to 3 UTF-8 code units per UTF-16 code unit.

    Utf16CharacterReader reader(utf16text);
    Utf8CharacterWriter writer(utf8text);
    while (!reader.IsAtEnd())
    {
        assert(!writer.IsAtEnd());
        writer.WriteNext(reader.ReadNext());
    }

    utf8text.resize(writer.size()); // Shrink back to actual size.
}
// Converts UTF-16 text to UTF-32, storing the result in utf32text.
void ConvertTextUtf16ToUtf32(
    std::span<char16_t const> utf16text,
    /*out*/ std::u32string& utf32text
    )
{
    // Size for the worst case: UTF-32 (1 code unit) never needs more code units
    // than UTF-16 (1-2 code units) for the same code point.
    utf32text.resize(utf16text.size());

    Utf16CharacterReader source(utf16text);
    Utf32CharacterWriter sink(utf32text);
    for (; !source.IsAtEnd(); )
    {
        assert(!sink.IsAtEnd());
        auto const codePoint = source.ReadNext();
        sink.WriteNext(codePoint);
    }

    // Trim the worst-case allocation down to what was actually written.
    utf32text.resize(sink.size());
}
// Converts UTF-32 text to UTF-8, storing the result in utf8text.
void ConvertTextUtf32ToUtf8(
    std::span<char32_t const> utf32text,
    /*out*/ std::u8string& utf8text
    )
{
    // Size for the worst case: each UTF-32 code unit could expand to 4 UTF-8 code units.
    utf8text.resize(utf32text.size() * 4);

    Utf32CharacterReader source(utf32text);
    Utf8CharacterWriter sink(utf8text);
    for (; !source.IsAtEnd(); )
    {
        assert(!sink.IsAtEnd());
        auto const codePoint = source.ReadNext();
        sink.WriteNext(codePoint);
    }

    // Trim the worst-case allocation down to what was actually written.
    utf8text.resize(sink.size());
}
// Converts UTF-32 text to UTF-16, storing the result in utf16text.
void ConvertTextUtf32ToUtf16(
    std::span<char32_t const> utf32text,
    /*out*/ std::u16string& utf16text
    )
{
    // Size for the worst case: each UTF-32 code unit could expand to 2 UTF-16 code units.
    utf16text.resize(utf32text.size() * 2);

    Utf32CharacterReader source(utf32text);
    Utf16CharacterWriter sink(utf16text);
    for (; !source.IsAtEnd(); )
    {
        assert(!sink.IsAtEnd());
        auto const codePoint = source.ReadNext();
        sink.WriteNext(codePoint);
    }

    // Trim the worst-case allocation down to what was actually written.
    utf16text.resize(sink.size());
}
// Convenience wrapper over ConvertTextUtf8ToUtf16 that returns the UTF-16 text by value.
inline std::u16string ToUtf16String(std::span<char8_t const> source)
{
    std::u16string converted;
    ConvertTextUtf8ToUtf16(source, /*out*/ converted);
    return converted;
}
// Convenience wrapper over ConvertTextUtf16ToUtf8 that returns the UTF-8 text by value.
inline std::u8string ToUtf8String(std::span<char16_t const> source)
{
    std::u8string converted;
    ConvertTextUtf16ToUtf8(source, /*out*/ converted);
    return converted;
}
#ifdef _WIN32
// Overload for wchar_t text (only compiled under _WIN32): the span is reinterpreted
// as char16_t code units and converted via ConvertTextUtf16ToUtf8.
inline std::u8string ToUtf8String(std::span<wchar_t const> source)
{
    std::u8string converted;
    auto const utf16source = reinterpret_span<char16_t const>(source);
    ConvertTextUtf16ToUtf8(utf16source, /*out*/ converted);
    return converted;
}
#endif // _WIN32
// One entry of a name <-> value lookup table, as consumed by the
// TryMapStringToIndex / TryMapIndexToString helpers in this namespace.
struct StringAndIndex
{
char8_t const* text; // Null terminated.
uint32_t index; // Numeric value paired with text.
};
// Looks up text in list using an exact (case sensitive) comparison against each entry's name.
// Returns the index of the first matching entry, or an empty optional when nothing matches.
std::optional<uint32_t> TryMapStringToIndex(std::u8string_view text, std::span<const StringAndIndex> list) noexcept
{
    for (auto entry = list.begin(), listEnd = list.end(); entry != listEnd; ++entry)
    {
        // entry->text is null terminated, so it converts to a view for comparison.
        if (std::u8string_view(entry->text) == text)
        {
            return entry->index;
        }
    }
    return std::nullopt;
}
// Typed variant of TryMapStringToIndex for 32-bit value types such as enums.
// Returns the matching entry's index converted to T, or an empty optional when
// no entry matches.
template<typename T>
std::optional<T> TryMapStringToIndex(std::u8string_view mode, std::span<const StringAndIndex> nameAndIndexList) noexcept
{
    static_assert(sizeof(T) == sizeof(uint32_t));

    // Resolves to the non-template overload (T cannot be deduced from these arguments).
    std::optional<uint32_t> const index = TryMapStringToIndex(mode, nameAndIndexList);
    if (!index.has_value())
    {
        return std::nullopt;
    }

    // Convert the value itself rather than reinterpreting optional<uint32_t> as
    // optional<T>: the two are unrelated types, so aliasing one through the other is
    // undefined behavior and their layouts are not guaranteed to match.
    return static_cast<T>(*index);
}
// Looks up mode in nameAndIndexList and returns the matching index converted to T,
// or defaultValue when no entry matches.
template<typename T>
T MapStringToIndex(std::u8string_view mode, std::span<const StringAndIndex> nameAndIndexList, T defaultValue) noexcept
{
    auto const index = TryMapStringToIndex(mode, nameAndIndexList);
    if (!index)
    {
        return defaultValue;
    }
    return T(*index);
}
// Converts the ASCII letters 'A'..'Z' in text to lowercase in place.
// All other code units are left untouched.
template <typename CharType>
void ToLowercase(/*inout*/ std::span<CharType> text) noexcept
{
    for (CharType& c : text)
    {
        // TODO: Extend this to other languages besides English?
        // It's currently only used for English keywords, and salient case conversions are 1:1 (the German double S is irrelevant now)
        //
        // Map the ASCII range explicitly instead of calling ::tolower, whose behavior is
        // undefined for values that are neither EOF nor representable as unsigned char
        // (negative char, or char16_t above 255) and whose result depends on the C locale,
        // which could alter individual UTF-8 code units.
        if (c >= CharType('A') && c <= CharType('Z'))
        {
            c = static_cast<CharType>(c + ('a' - 'A'));
        }
    }
}
// Non-template overloads, one per supported code unit type, each forwarding to ToLowercase<CharType>.
void ToLowercase(/*inout*/ std::span<char> text)
{
    ToLowercase<char>(text);
}
void ToLowercase(/*inout*/ std::span<char8_t> text)
{
    ToLowercase<char8_t>(text);
}
void ToLowercase(/*inout*/ std::span<char16_t> text)
{
    ToLowercase<char16_t>(text);
}
// Converts the ASCII letters 'a'..'z' in text to uppercase in place.
// All other code units are left untouched.
template <typename CharType>
void ToUpperCase(/*inout*/ std::span<CharType> text) noexcept
{
    for (CharType& c : text)
    {
        // Map the ASCII range explicitly instead of calling ::toupper, whose behavior is
        // undefined for values that are neither EOF nor representable as unsigned char
        // (negative char, or char16_t above 255) and whose result depends on the C locale,
        // which could alter individual UTF-8 code units.
        if (c >= CharType('a') && c <= CharType('z'))
        {
            c = static_cast<CharType>(c - ('a' - 'A'));
        }
    }
}
// Non-template overloads, one per supported code unit type, each forwarding to ToUpperCase<CharType>.
void ToUpperCase(/*inout*/ std::span<char> text)
{
    ToUpperCase<char>(text);
}
void ToUpperCase(/*inout*/ std::span<char8_t> text)
{
    ToUpperCase<char8_t>(text);
}
void ToUpperCase(/*inout*/ std::span<char16_t> text)
{
    ToUpperCase<char16_t>(text);
}
std::optional<std::u8string_view> TryMapIndexToString(uint32_t index, std::span<const StringAndIndex> nameAndIndexList) noexcept
{
for (auto& nameAndIndex : nameAndIndexList)
{
if (nameAndIndex.index == index)
{
return nameAndIndex.text;
}
}
return {};
}
template<typename T>
std::optional<T> TryMapIndexToString(T index, std::span<const StringAndIndex> nameAndIndexList) noexcept
{
static_assert(sizeof(T) == sizeof(uint32_t));
return TryMapIndexToString(static_cast<uint32_t>(index), nameAndIndexList);
}
// Reverse lookup with a fallback: returns the name paired with index in nameAndIndexList,
// or defaultValue when no entry has that index.
template<typename T>
std::u8string_view MapIndexToString(T index, std::span<const StringAndIndex> nameAndIndexList, std::u8string_view defaultValue) noexcept
{
    auto const name = TryMapIndexToString(uint32_t(index), nameAndIndexList);
    if (!name)
    {
        return defaultValue;
    }
    return *name;
}
// Returns the index of the first entry in nameAndIndexList whose name is a suffix of
// text, ignoring the case of text; returns defaultValue when no entry matches.
//
// Only text is lowercased (via ToLowercase); entry names are compared as written, so
// an entry containing uppercase ASCII letters can never match.
//
// Note: no 'export' keyword here. This function sits inside 'export namespace
// StringHelpers', which already exports it, and an export-declaration may not
// contain another 'export' keyword.
uint32_t MapStringSuffixIcaseToIndex(
    std::u8string_view text,
    std::span<const StringAndIndex> nameAndIndexList,
    uint32_t defaultValue
    )
{
    std::u8string lowercaseText(text.begin(), text.end());
    ToLowercase(/*inout*/ lowercaseText);

    for (auto const& entry : nameAndIndexList)
    {
        if (lowercaseText.ends_with(entry.text))
        {
            return entry.index;
        }
    }
    return defaultValue;
}
// Removes, in place, every leading and trailing character of text that appears in
// the null-terminated set 'spaces'. Text consisting only of such characters becomes empty.
// Callers in this file pass space (U+0020) and tab, so other whitespace like U+200X
// or the new line controls is not trimmed.
template <typename StringType, typename StringCharType>
void TrimSpaces(/*inout*/ StringType& text, /*nullterminated*/ StringCharType const* spaces)
{
    // Trailing side first: cut everything after the last character worth keeping.
    size_t const lastKept = text.find_last_not_of(spaces);
    if (lastKept != std::string::npos)
        text.erase(lastKept + 1);

    // Leading side: nothing to do when the first character is already one to keep.
    size_t const firstKept = text.find_first_not_of(spaces);
    if (firstKept == 0)
        return;

    // npos means no character is worth keeping, so the whole text goes.
    text.erase(0, (firstKept == std::string::npos) ? text.size() : firstKept);
}
void TrimSpaces(/*inout*/ std::string text) { return TrimSpaces<std::string>(text, " \t"); }
void TrimSpaces(/*inout*/ std::u8string text) { return TrimSpaces<std::u8string>(text, u8" \t"); }
void TrimSpaces(/*inout*/ std::u16string& text) { return TrimSpaces<std::u16string>(text, u" \t"); }
// Strips one trailing and one leading double quote from text in place.
// Each end is handled independently, so an unbalanced quote is removed too.
template <typename StringType>
void UnquoteString(/*inout*/ StringType& text)
    requires requires (StringType& text) { text.empty(); text.back(); text.pop_back(); text.front(); text.erase(); }
{
    // Trailing quote first; the emptiness check guards back().
    if (!text.empty() && text.back() == '\"')
    {
        text.pop_back();
    }

    // Re-check emptiness, since removing the trailing quote may have emptied the text.
    if (!text.empty() && text.front() == '\"')
    {
        text.erase(0, 1);
    }
}
// Non-template overloads, one per supported string type, each forwarding to UnquoteString<StringType>.
void UnquoteString(/*inout*/ std::string& text)
{
    UnquoteString<std::string>(text);
}
void UnquoteString(/*inout*/ std::u8string& text)
{
    UnquoteString<std::u8string>(text);
}
void UnquoteString(/*inout*/ std::u16string& text)
{
    UnquoteString<std::u16string>(text);
}
// Joins the null-terminated strings in stringList into one string, in order.
// The delimiter is inserted before a string only when the result built so far is
// non-empty, so leading empty strings contribute no delimiter.
// Useful for reconcatenating main's argc and argv[].
template <typename CharType, typename StringType, typename ViewType>
StringType ConcatenateStrings(std::span<CharType const* const> stringList, ViewType delimiter)
{
    StringType joined;
    for (size_t i = 0, count = stringList.size(); i < count; ++i)
    {
        if (!joined.empty())
        {
            joined += delimiter;
        }
        joined += stringList[i];
    }
    return joined;
}
// Non-template overloads, one per supported code unit type, each forwarding to the
// ConcatenateStrings template. The delimiter defaults to a single space.
std::string ConcatenateStrings(std::span<char const* const> stringList, std::string_view delimiter = " ")
{
    using Char = char;
    using String = std::string;
    using View = std::string_view;
    return ConcatenateStrings<Char, String, View>(stringList, delimiter);
}
std::u8string ConcatenateStrings(std::span<char8_t const* const> stringList, std::u8string_view delimiter = u8" ")
{
    using Char = char8_t;
    using String = std::u8string;
    using View = std::u8string_view;
    return ConcatenateStrings<Char, String, View>(stringList, delimiter);
}
std::u16string ConcatenateStrings(std::span<char16_t const* const> stringList, std::u16string_view delimiter = u" ")
{
    using Char = char16_t;
    using String = std::u16string;
    using View = std::u16string_view;
    return ConcatenateStrings<Char, String, View>(stringList, delimiter);
}
// Helper trait that maps a code unit type to its respective character reader type.
// Because we're only targeting Unicode (no Shift-JIS, Big 5, or other), we can simplify
// the specialization by keying on the code unit's byte size alone:
// 1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32.
// The primary template is declared but never defined, so any other size fails to compile.
template <size_t CharacterTypeByteSize> struct CharacterReaderResolver;
template <> struct CharacterReaderResolver<1> { using Type = Utf8CharacterReader; };
template <> struct CharacterReaderResolver<2> { using Type = Utf16CharacterReader; };
template <> struct CharacterReaderResolver<4> { using Type = Utf32CharacterReader; };
// Compare two Unicode strings of possibly different encodings.
// Returns true when both spans hold the same sequence; false otherwise, including
// when one string ends before the other.
// CharTypeA and CharTypeB are expected to be character (integral) code unit types.
template <typename CharTypeA, typename CharTypeB>
bool CompareStringSpans(std::span<CharTypeA const> a, std::span<CharTypeB const> b)
{
    if constexpr (sizeof(CharTypeA) == sizeof(CharTypeB))
    {
        // Can just compare code units directly, since they are the same size.
        // Compare them as unsigned values though: a plain char may be signed, so a
        // UTF-8 code unit >= 0x80 held in a char would otherwise promote to a negative
        // int and never equal the same code unit held in a char8_t.
        using UnsignedA = std::make_unsigned_t<CharTypeA>;
        using UnsignedB = std::make_unsigned_t<CharTypeB>;
        return std::equal(
            a.data(), a.data() + a.size(),
            b.data(), b.data() + b.size(),
            [](CharTypeA unitA, CharTypeB unitB) noexcept
            {
                return static_cast<UnsignedA>(unitA) == static_cast<UnsignedB>(unitB);
            });
    }
    else // Strings are heterogeneous Unicode encodings.
    {
        // Decode both sides one character at a time and compare what the readers return.
        typename CharacterReaderResolver<sizeof(CharTypeA)>::Type readerA(a);
        typename CharacterReaderResolver<sizeof(CharTypeB)>::Type readerB(b);
        while (true)
        {
            if (bool aIsDone = readerA.IsAtEnd(), bIsDone = readerB.IsAtEnd(); aIsDone || bIsDone)
            {
                return aIsDone == bIsDone; // Return false if reached the end of one string before the other.
            }
            if (readerA.ReadNext() != readerB.ReadNext())
            {
                return false;
            }
        }
    }
}
/*
Usage:
std::string a = "Hello"; // Encoding is actually UTF-8.
std::u8string b = u8"Hello";
bool result = StringHelpers::CompareStrings(a, b);
std::wstring a = L"Hello";
std::u8string b = u8"Hello";
bool result = StringHelpers::CompareStrings(a, b);
std::u32string a = U"Hello";
std::string b = "Hello";
bool result = StringHelpers::CompareStrings(a, b);
std::u16string a = u"Hello";
char32_t b[5] = {'H','e','l','l','o'};
bool result = StringHelpers::CompareStrings(a, std::span(b));
*/
template <typename StringTypeA, typename StringTypeB>
inline bool CompareStrings(StringTypeA const& a, StringTypeB const& b)
// TODO: Figure out why requires with std::data doesn't work as expected on raw C arrays.
//requires requires (StringTypeA a, StringTypeB b) { std::data(a); std::size(a); std::data(b); std::size(b); }
{
// Sadly we can't just rely on template type deduction here because mutable spans then
// no CompareStringSpans overload because CompareStringSpans expects span<T const>.
using CharTypeA = std::remove_reference_t<decltype(*std::data(a))>;
using CharTypeB = std::remove_reference_t<decltype(*std::data(b))>;
return CompareStringSpans(std::span<CharTypeA const>(a), std::span<CharTypeB const>(b));
}
} // namespace StringHelpers
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment