Last active
May 23, 2025 01:38
-
-
Save fdwr/2bc0658372178dd2c7fca2924ac23633 to your computer and use it in GitHub Desktop.
String conversion functions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Miscellaneous helpers for strings that core C++ is missing. | |
| module; | |
| #include "precomp.h" | |
| #include <stdint.h> | |
| #include <string> | |
| #include <string_view> | |
| #include <span> | |
| #include <optional> | |
| #include <type_traits> | |
| export module StringHelpers; | |
| namespace | |
| { | |
| constexpr static char8_t g_utf8bom[] = {char8_t(0xEF), char8_t(0xBB), char8_t(0xBF)}; | |
| constexpr auto g_utf8bomView = std::u8string_view(std::data(g_utf8bom), std::size(g_utf8bom)); | |
| } | |
| export namespace StringHelpers | |
| { | |
// Named Unicode code points and range limits used by the helpers in this file.
enum UnicodeCodePoint
{
    UnicodeSpace                = 0x20,
    UnicodeNbsp                 = 0xA0,
    UnicodeSoftHyphen           = 0xAD,
    UnicodeEnQuadSpace          = 0x2000,
    UnicodeZeroWidthSpace       = 0x200B,
    UnicodeDottedCircle         = 0x25CC,
    UnicodeIdeographicSpace     = 0x3000,
    UnicodeInlineObject         = 0xFFFC,   // Stands in for embedded objects.
    UnicodeReplacementCharacter = 0xFFFD,   // Substituted for invalid sequences.
    UnicodeMax                  = 0x10FFFF, // Largest valid code point.
    UnicodeTotal                = 0x110000, // Count of code points (UnicodeMax + 1).
};
| inline char AsChar(std::byte c) { return static_cast<char>(c); } | |
| inline char* AsChar(std::byte* p) { return reinterpret_cast<char*>(p); } | |
| inline char const* AsChar(std::byte const* p) { return reinterpret_cast<char const*>(p); } | |
| inline char AsChar(char8_t c) { return static_cast<char>(c); } | |
| inline char* AsChar(char8_t* p) { return reinterpret_cast<char*>(p); } | |
| inline char const* AsChar(char8_t const* p) { return reinterpret_cast<char const*>(p); } | |
| inline char* AsChar(char* p) { return p; } // Identity operation for general templates | |
| inline char const* AsChar(char const* p) { return p; } // Identity operation for general templates | |
| inline char** AsChar(char8_t** p) { return reinterpret_cast<char**>(p); } | |
| inline char* const* AsChar(char8_t* const* const p) { return reinterpret_cast<char* const* const>(p); } | |
| inline char const* const* AsChar(char8_t const* const* const p) { return reinterpret_cast<char const* const* const>(p); } | |
| inline unsigned char* AsUChar(char* p) { return reinterpret_cast<unsigned char*>(p); } | |
| inline unsigned char* AsUChar(char8_t* p) { return reinterpret_cast<unsigned char*>(p); } | |
| inline unsigned char* AsUChar(std::byte* p) { return reinterpret_cast<unsigned char*>(p); } | |
| inline char8_t* AsUtf8Char(char* p) { return reinterpret_cast<char8_t*>(p); } | |
| inline char8_t const* AsUtf8Char(char const* p) { return reinterpret_cast<char8_t const*>(p); } | |
| inline std::u8string_view AsUtf8Char(std::string_view s) { return std::u8string_view(reinterpret_cast<char8_t const*>(s.data()), s.size()); } | |
| inline std::string_view AsChar(std::u8string_view s) { return std::string_view(reinterpret_cast<char const*>(s.data()), s.size()); } | |
| inline std::u8string& AsUtf8Char(std::string& s) { return reinterpret_cast<std::u8string&>(s); } | |
| inline std::string& AsChar(std::u8string& s) { return reinterpret_cast<std::string&>(s); } | |
| inline std::u8string const& AsUtf8Char(std::string const& s) { return reinterpret_cast<std::u8string const&>(s); } | |
| inline std::string const& AsChar(std::u8string const& s) { return reinterpret_cast<std::string const&>(s); } | |
| // The std::span::subset method is dangerous (sadly), offering no clamped version to stay within the buffer. | |
| template <typename T> | |
| std::span<T> ClampedSubspan(std::span<T> a, size_t index, size_t count) | |
| { | |
| size_t const maxCount = a.size(); | |
| index = std::min(index, maxCount); | |
| count = std::min(maxCount - index, count); | |
| return a.subspan(index, count); | |
| } | |
// Returns true when ch is a UTF-16 surrogate code point (U+D800..U+DFFF).
inline bool IsSurrogate(char32_t ch) noexcept
{
    // Only the low 11 bits may vary. The mask keeps every higher bit so that
    // supplementary code points such as U+1D800 are not mistaken for surrogates.
    return (ch & 0xFFFFF800u) == 0xD800u;
}
// Returns true when ch is a leading (high) surrogate, U+D800..U+DBFF.
inline bool IsLeadingSurrogate(char32_t ch) noexcept
{
    // Only the low 10 bits may vary. Bits above 16 are kept in the comparison so
    // that supplementary code points (e.g. U+1D800) are rejected.
    return (ch & 0xFFFFFC00u) == 0xD800u;
}
// Returns true when ch is a trailing (low) surrogate, U+DC00..U+DFFF.
inline bool IsTrailingSurrogate(char32_t ch) noexcept
{
    // Only the low 10 bits may vary. Bits above 16 are kept in the comparison so
    // that supplementary code points (e.g. U+2DC00) are rejected.
    return (ch & 0xFFFFFC00u) == 0xDC00u;
}
// Returns true when ch lies above the basic multilingual plane (greater than
// U+FFFF), i.e. it needs a surrogate pair in UTF-16.
inline bool IsCharacterBeyondBmp(char32_t ch) noexcept
{
    constexpr char32_t lastBmpCodePoint = 0xFFFF;
    return ch > lastBmpCodePoint;
}
// Combines a leading (high) and trailing (low) surrogate into the supplementary
// code point they encode. Only the low 10 bits of each argument are used.
inline char32_t MakeUnicodeCodePoint(char32_t high, char32_t low) noexcept
{
    char32_t const upperTenBits = (high & 0x03FF) << 10;
    char32_t const lowerTenBits = low & 0x03FF;
    return 0x10000 + (upperTenBits | lowerTenBits);
}
// Splitting a supplementary code point into leading and trailing surrogates.
// From http://unicode.org/faq/utf_bom.html#35

// Returns the leading (high) surrogate for a code point above U+FFFF.
inline char16_t GetLeadingSurrogate(char32_t ch)
{
    // 0xD7C0 is 0xD800 - (0x10000 >> 10), folded into a single offset.
    constexpr char32_t leadOffset = 0xD800 - (0x10000 >> 10);
    return char16_t(leadOffset + (ch >> 10));
}
// Returns the trailing (low) surrogate for a code point above U+FFFF: the low
// 10 bits of the code point placed in the U+DC00..U+DFFF range.
inline char16_t GetTrailingSurrogate(char32_t ch)
{
    char32_t const lowTenBits = ch & 0x3FF;
    return char16_t(0xDC00 | lowTenBits);
}
// Returns true for the ASCII hexadecimal digits 0-9, A-F and a-f.
inline bool IsHexDigit(char32_t ch) noexcept
{
    if (ch >= U'0' && ch <= U'9')
        return true;

    // Clearing bit 5 folds ASCII lower case onto upper case.
    char32_t const folded = ch & ~char32_t(0x20);
    return folded >= U'A' && folded <= U'F';
}
| struct Utf16CharacterReader | |
| { | |
| char16_t const* current_ = nullptr; | |
| char16_t const* end_ = nullptr; | |
| Utf16CharacterReader() = default; | |
| Utf16CharacterReader(char16_t const* begin, char16_t const* end) : current_(begin), end_(end) | |
| {} | |
| template <typename CharacterType> | |
| Utf16CharacterReader(CharacterType const* begin, CharacterType const* end) | |
| requires (sizeof(CharacterType) == sizeof(char16_t)) | |
| : current_(reinterpret_cast<char16_t const*>(begin)), | |
| end_(reinterpret_cast<char16_t const*>(end)) | |
| {} | |
| template <typename Container> | |
| Utf16CharacterReader(Container const& c) | |
| requires (sizeof(*std::data(c)) == sizeof(char16_t)) | |
| : Utf16CharacterReader(std::data(c), std::data(c) + std::size(c)) | |
| {} | |
| size_t size() const noexcept | |
| { | |
| return end_ - current_; | |
| } | |
| bool IsAtEnd() const noexcept | |
| { | |
| return current_ >= end_; | |
| } | |
| char32_t ReadNext() noexcept | |
| { | |
| if (current_ >= end_) | |
| return 0; | |
| char32_t ch = *current_++; | |
| if (!IsSurrogate(ch)) | |
| return ch; // Character fits in the basic multilingual plane. | |
| if (!IsLeadingSurrogate(ch) || current_ >= end_) | |
| return UnicodeReplacementCharacter; // Illegal unpaired surrogate. Substitute with replacement character. | |
| char32_t leading = ch; | |
| char32_t trailing = *current_; | |
| if (!IsTrailingSurrogate(trailing)) | |
| return UnicodeReplacementCharacter; // Illegal unpaired surrogate. Substitute with replacement character. | |
| ++current_; | |
| return MakeUnicodeCodePoint(leading, trailing); | |
| } | |
| char32_t ReadNextNoReplacement() noexcept | |
| { | |
| if (current_ >= end_) | |
| return 0; | |
| char32_t codePoint = *current_++; | |
| // Just use the character if not a surrogate code point. | |
| // For unpaired surrogates, pass the isolated surrogate | |
| // through (rather than remap to U+FFFD replacement). | |
| if (IsLeadingSurrogate(codePoint) && current_ < end_) | |
| { | |
| char32_t leadingCodeUnit = codePoint; | |
| char32_t trailingCodeUnit = *current_; | |
| if (IsTrailingSurrogate(trailingCodeUnit)) | |
| { | |
| codePoint = MakeUnicodeCodePoint(leadingCodeUnit, trailingCodeUnit); | |
| ++current_; | |
| } | |
| } | |
| return codePoint; | |
| } | |
| }; | |
| struct Utf16CharacterWriter | |
| { | |
| char16_t* begin_ = nullptr; | |
| char16_t* current_ = nullptr; | |
| char16_t* end_ = nullptr; | |
| Utf16CharacterWriter() = default; | |
| Utf16CharacterWriter(char16_t* begin, char16_t* end) : begin_(begin), current_(begin), end_(end) | |
| {} | |
| template <typename CharacterType> | |
| inline Utf16CharacterWriter(CharacterType* begin, CharacterType* end) | |
| requires (sizeof(CharacterType) == sizeof(char16_t)) | |
| : Utf16CharacterWriter( | |
| reinterpret_cast<char16_t*>(begin), | |
| reinterpret_cast<char16_t*>(begin) | |
| ) | |
| {} | |
| template <typename Container> | |
| Utf16CharacterWriter(Container& c) | |
| requires (sizeof(*std::data(c)) == sizeof(char16_t)) | |
| : Utf16CharacterWriter( | |
| reinterpret_cast<char16_t*>(std::data(c)), | |
| reinterpret_cast<char16_t*>(std::data(c) + std::size(c)) | |
| ) | |
| {} | |
| size_t size() const noexcept | |
| { | |
| return current_ - begin_; | |
| } | |
| bool IsAtEnd() const noexcept | |
| { | |
| return current_ >= end_; | |
| } | |
| void WriteNext(char32_t ch) noexcept | |
| { | |
| if (current_ >= end_) | |
| return; | |
| if (IsCharacterBeyondBmp(ch) && end_ - current_ >= 2) | |
| { | |
| // Split into leading and trailing surrogatse. | |
| // From http://unicode.org/faq/utf_bom.html#35 | |
| current_[0] = char16_t(GetLeadingSurrogate(ch)); | |
| current_[1] = char16_t(GetTrailingSurrogate(ch)); | |
| current_ += 2; | |
| } | |
| else | |
| { | |
| // A BMP character (or isolated surrogate) | |
| current_[0] = char16_t(ch); | |
| ++current_; | |
| } | |
| } | |
| }; | |
| // For iterating directly over the characters. | |
| // - Avoid deprecated std::wstring_convert<std::codecvt_utf8_utf16 | |
| // - OS-specific MultibyteToWideChar and allocating intermediate buffers. | |
| // TODO: Compare with https://github.com/simdutf/simdutf/blob/master/src/scalar/utf8_to_utf32/valid_utf8_to_utf32.h. | |
| struct Utf8CharacterReader | |
| { | |
| char8_t const* current_ = nullptr; | |
| char8_t const* end_ = nullptr; | |
| Utf8CharacterReader() = default; | |
| Utf8CharacterReader(char8_t const* begin, char8_t const* end) : current_(begin), end_(end) | |
| {} | |
| template <typename CharacterType> | |
| Utf8CharacterReader(CharacterType const* begin, CharacterType const* end) | |
| requires (sizeof(CharacterType) == sizeof(char8_t)) | |
| : Utf8CharacterReader( | |
| reinterpret_cast<char8_t const*>(begin), | |
| reinterpret_cast<char8_t const*>(end) | |
| ) | |
| {} | |
| template <typename Container> | |
| Utf8CharacterReader(Container const& c) | |
| requires (sizeof(*std::data(c)) == sizeof(char8_t)) | |
| : Utf8CharacterReader( | |
| reinterpret_cast<char8_t const*>(std::data(c)), | |
| reinterpret_cast<char8_t const*>(std::data(c) + std::size(c)) | |
| ) | |
| {} | |
| size_t size() const noexcept | |
| { | |
| return end_ - current_; | |
| } | |
| bool IsAtEnd() const noexcept | |
| { | |
| return current_ >= end_; | |
| } | |
| char32_t ReadNext() noexcept | |
| { | |
| // TODO: Test overlong sequences. | |
| if (current_ >= end_) | |
| return 0; | |
| char32_t codeUnit = *current_++; | |
| if (codeUnit <= 0b0111'1111) // 0xxxxxxx 0-127 | |
| return codeUnit; | |
| // The following byte values should never occur: 0xC0, 0xC1, 0xF5-0xFF | |
| if (codeUnit < 0b1100'0000) // Any 10xxxxxx patterns are illegal. | |
| return UnicodeReplacementCharacter; | |
| uint32_t continuationMask = 0b0100'0000; | |
| uint32_t continuationByteCount = 0; | |
| while (codeUnit & continuationMask) | |
| { | |
| codeUnit ^= continuationMask; | |
| continuationMask >>= 1; | |
| ++continuationByteCount; | |
| if (continuationByteCount > 3) | |
| return UnicodeReplacementCharacter; | |
| } | |
| char32_t codePoint = codeUnit & 0b0011'1111; | |
| while (continuationByteCount--) | |
| { | |
| codeUnit = *current_; | |
| if ((codeUnit & 0b1100'0000) != 0b1000'0000) // Expect 10xxxxxx pattern. | |
| return UnicodeReplacementCharacter; // Expected continuation byte. | |
| ++current_; | |
| // Combine next code unit lowest 6 bits with existing bits. | |
| codePoint = (codePoint << 6) | codeUnit & 0b0011'1111; | |
| } | |
| return codePoint; | |
| } | |
| // Skip the byte order mark, if present. | |
| void SkipBom() noexcept | |
| { | |
| if (size() >= sizeof(g_utf8bom) && memcmp(current_, g_utf8bom, sizeof(g_utf8bom)) == 0) | |
| { | |
| current_ += sizeof(g_utf8bom); | |
| } | |
| } | |
| }; | |
| // Writes a single code point out to a memory region of char8's. | |
| // Does NOT throw if an invalid character is passed - just writes replacement. | |
| struct Utf8CharacterWriter | |
| { | |
| char8_t* begin_ = nullptr; | |
| char8_t* current_ = nullptr; | |
| char8_t* end_ = nullptr; | |
| Utf8CharacterWriter() = default; | |
| Utf8CharacterWriter(char8_t* begin, char8_t* end) : begin_(begin), current_(begin), end_(end) | |
| {} | |
| template <typename CharacterType> | |
| Utf8CharacterWriter(CharacterType* begin, CharacterType* end) | |
| requires (sizeof(CharacterType) == sizeof(char8_t)) | |
| : Utf8CharacterWriter(reinterpret_cast<char8_t*>(begin), reinterpret_cast<char8_t*>(end)) | |
| {} | |
| template <typename Container> | |
| Utf8CharacterWriter(Container& c) | |
| requires (sizeof(*std::data(c)) == sizeof(char8_t)) | |
| : Utf8CharacterWriter( | |
| reinterpret_cast<char8_t*>(std::data(c)), | |
| reinterpret_cast<char8_t*>(std::data(c) + std::size(c)) | |
| ) | |
| {} | |
| size_t size() const noexcept | |
| { | |
| return current_ - begin_; | |
| } | |
| bool IsAtEnd() const noexcept | |
| { | |
| return current_ >= end_; | |
| } | |
| void WriteNext(char32_t codePoint) noexcept | |
| { | |
| // Consider Duff's device like approach: https://github.com/Alexhuszagh/UTFPP/blob/bd99a5e4f3fbfb3bc86c1d7af5cf5edf2f00e1a7/utf.hpp#L115 | |
| if (current_ >= end_) | |
| return; | |
| if (codePoint < 0x80) | |
| { | |
| *current_++ = static_cast<char8_t>(codePoint); | |
| } | |
| else if (codePoint < 0x0800) | |
| { | |
| *current_++ = static_cast<char8_t>((codePoint >> 6) | 0xC0); | |
| if (current_ < end_) *current_++ = static_cast<char8_t>((codePoint & 0x3F) | 0x80); | |
| } | |
| else if (codePoint < 0x10000) | |
| { | |
| *current_++ = static_cast<char8_t>((codePoint >> 12) | 0xE0); | |
| if (current_ < end_) *current_++ = static_cast<char8_t>(((codePoint >> 6) & 0x3F) | 0x80); | |
| if (current_ < end_) *current_++ = static_cast<char8_t>((codePoint & 0x3F) | 0x80); | |
| } | |
| else | |
| { | |
| *current_++ = static_cast<char8_t>((codePoint >> 18) | 0xF0); | |
| if (current_ < end_) *current_++ = static_cast<char8_t>(((codePoint >> 12) & 0x3F) | 0x80); | |
| if (current_ < end_) *current_++ = static_cast<char8_t>(((codePoint >> 6) & 0x3F) | 0x80); | |
| if (current_ < end_) *current_++ = static_cast<char8_t>((codePoint & 0x3F) | 0x80); | |
| } | |
| } | |
| }; | |
| struct Utf32CharacterReader | |
| { | |
| char32_t const* current_ = nullptr; | |
| char32_t const* end_ = nullptr; | |
| Utf32CharacterReader() = default; | |
| Utf32CharacterReader(char32_t const* begin, char32_t const* end) | |
| : current_(begin), end_(end) | |
| {} | |
| template <typename CharacterType> | |
| Utf32CharacterReader(CharacterType const* begin, CharacterType const* end) | |
| requires (sizeof(CharacterType) == sizeof(char32_t)) | |
| : Utf32CharacterReader( | |
| reinterpret_cast<char32_t const*>(begin), | |
| reinterpret_cast<char32_t const*>(end) | |
| ) | |
| {} | |
| template <typename Container> | |
| Utf32CharacterReader(Container const& c) | |
| requires (sizeof(*std::data(c)) == sizeof(char32_t)) | |
| : Utf32CharacterReader( | |
| reinterpret_cast<char32_t const*>(std::data(c)), | |
| reinterpret_cast<char32_t const*>(std::data(c) + std::size(c)) | |
| ) | |
| {} | |
| size_t size() const noexcept | |
| { | |
| return end_ - current_; | |
| } | |
| bool IsAtEnd() const noexcept | |
| { | |
| return current_ >= end_; | |
| } | |
| char32_t ReadNext() noexcept | |
| { | |
| if (current_ >= end_) | |
| return 0; | |
| return *current_++; | |
| } | |
| }; | |
| struct Utf32CharacterWriter | |
| { | |
| char32_t* begin_ = nullptr; | |
| char32_t* current_ = nullptr; | |
| char32_t* end_ = nullptr; | |
| Utf32CharacterWriter() = default; | |
| Utf32CharacterWriter(char32_t* begin, char32_t* end) | |
| : begin_(begin), current_(begin), end_(end) | |
| {} | |
| template <typename CharacterType> | |
| Utf32CharacterWriter(CharacterType* begin, CharacterType* end) | |
| requires (sizeof(CharacterType) == sizeof(char32_t)) | |
| : Utf32CharacterWriter( | |
| reinterpret_cast<char32_t*>(begin), | |
| reinterpret_cast<char32_t*>(end) | |
| ) | |
| {} | |
| template <typename Container> | |
| Utf32CharacterWriter(Container& c) | |
| requires (sizeof(*std::data(c)) == sizeof(char32_t)) | |
| : Utf32CharacterWriter( | |
| reinterpret_cast<char32_t*>(std::data(c)), | |
| reinterpret_cast<char32_t*>(std::data(c) + std::size(c)) | |
| ) | |
| {} | |
| size_t size() const noexcept | |
| { | |
| return current_ - begin_; | |
| } | |
| bool IsAtEnd() const noexcept | |
| { | |
| return current_ >= end_; | |
| } | |
| void WriteNext(char32_t codePoint) noexcept | |
| { | |
| if (current_ >= end_) | |
| return; | |
| *current_++; | |
| } | |
| }; | |
| std::u8string_view StripUtf8Bom(std::u8string_view s) | |
| { | |
| if (s.starts_with(g_utf8bomView)) | |
| { | |
| s.remove_prefix(3); | |
| } | |
| return s; | |
| } | |
| template< | |
| typename InputContainer, | |
| typename OutputView = std::u8string_view, // Could be a std::string or std::span too or any type that accepts two iterators. | |
| typename OutputContainer = std::vector<OutputView> | |
| > | |
| requires requires(InputContainer i, OutputContainer o, OutputView v) { | |
| i.begin(); // Must have iterators. | |
| i.end(); | |
| OutputView(i.begin(), i.end()); // Must be constructible from iterator pair. | |
| o.push_back(OutputView{}); // Must be push_back'able. | |
| } | |
| auto SplitLines(InputContainer& text) -> OutputContainer | |
| { | |
| OutputContainer result; | |
| auto lineBegin = text.begin(); | |
| auto textEnd = text.end(); | |
| while (lineBegin != textEnd) | |
| { | |
| auto it = lineBegin; | |
| while (it != textEnd) | |
| { | |
| auto lineEnd = it; | |
| auto ch = *it++; | |
| if (ch == '\r' || ch == '\n') | |
| { | |
| result.push_back(OutputView(lineBegin, lineEnd)); | |
| // Skip the CR and LF pair. | |
| // Note parapgrah separate and line separator are ignored. | |
| if (ch == '\r' && it != textEnd && *it == '\n') | |
| { | |
| ++it; // Skip the line feed. | |
| } | |
| break; | |
| } | |
| } | |
| lineBegin = it; // Next line. | |
| } | |
| return result; | |
| } | |
| template <typename StringViewType = std::u8string_view> | |
| class SplitEnumerator | |
| { | |
| StringViewType view_ = 0; | |
| char32_t splitCodeUnit_ = 0; // A single code unit to split upon, like ",". | |
| bool hasMore_ = true; | |
| public: | |
| SplitEnumerator(StringViewType view, char32_t splitCodeUnit) noexcept | |
| : splitCodeUnit_(splitCodeUnit), | |
| view_(view) | |
| {} | |
| bool HasMore() const noexcept { return hasMore_; } | |
| StringViewType Read() noexcept | |
| { | |
| using C = decltype(*StringViewType().data()); | |
| auto nextSplit = std::find(view_.begin(), view_.end(), C(splitCodeUnit_)); | |
| auto token = StringViewType(view_.begin(), nextSplit); | |
| if (nextSplit == view_.end()) | |
| { | |
| hasMore_ = false; | |
| } | |
| else | |
| { | |
| ++nextSplit; // Skip the split code unit. | |
| } | |
| view_ = StringViewType(nextSplit, view_.end()); | |
| return token; | |
| } | |
| }; | |
| // Fills the entire buffer up to fixed size, including leading zeroes. | |
| template <typename CharacterType> | |
| void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<CharacterType> text) | |
| { | |
| minimal_span<CharacterType> currentText(text); | |
| // Convert character to digits. | |
| while (!currentText.empty()) | |
| { | |
| CharacterType digit = value & 0xF; | |
| digit += (digit >= 10) ? 'A' - 10 : '0'; | |
| currentText.back() = digit; | |
| currentText.pop_back(); | |
| value >>= 4; | |
| } | |
| } | |
| void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<char8_t> text) { return WriteZeroPaddedHexNum<char8_t>(value, /*out*/ text); } | |
| void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<char16_t> text) { return WriteZeroPaddedHexNum<char16_t>(value, /*out*/ text); } | |
| void WriteZeroPaddedHexNum(uint32_t value, /*out*/ std::span<char32_t> text) { return WriteZeroPaddedHexNum<char32_t>(value, /*out*/ text); } | |
| // 'text' is updated to the end of all characters read. | |
| // TODO: Consider using std::from_chars instead now that it exists. | |
| template <typename CharacterType> | |
| uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<CharacterType const>& text, _In_range_(2, 36) uint32_t base) | |
| { | |
| // Sadly, both wcstoul and std::stoul are useless functions because: | |
| // (1) wcstoul doesn't respect any boundaries and tries to parse beyond the code sequence | |
| // (e.g. \x12345 should be treated as {0x1234, '5'}, not as {0x12345}) | |
| // (2) std::stoul throws an exception on parse error, which is overkill for the user | |
| // interactively typing in a number. | |
| // (3) std::stoul requries a std::string as input, which gimps its utility. | |
| // Additionally, some uses such as escapement conversion don't want whitespace skipped. | |
| // - 'text' is updated upon returning to point after the consumed part. | |
| // - Any character outside the radix stops the read. So 123A4G would stop at 'A' for decimal, | |
| // but it would continue until 'G' for hexademical. | |
| // - An empty string returns 0. | |
| // - The caller doesn't receive a flag, but it can easily detect missing strings or whether | |
| // the entire number was read by checking the return std::span. | |
| uint32_t value = 0; | |
| minimal_span<CharacterType const> input = text; | |
| while (!input.empty()) | |
| { | |
| uint32_t digit = input.front(); | |
| if (digit < '0') | |
| break; | |
| digit -= '0'; // Handle 0..9. | |
| if (digit >= 10) // Handle A..Z. | |
| { | |
| digit &= ~32; // Make upper case. | |
| if (digit < 'A' - '0') | |
| break; | |
| digit -= 'A' - '0' - 10; | |
| } | |
| if (digit >= base) | |
| { | |
| break; | |
| } | |
| value = value * base + digit; | |
| input.pop_front(); | |
| } | |
| text = input; | |
| return value; | |
| } | |
| uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<char8_t const>& text, _In_range_(2, 36) uint32_t base) { return ReadUnsignedNumericValue<char8_t>(/*inout*/ text, base); } | |
| uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<char16_t const>& text, _In_range_(2, 36) uint32_t base) { return ReadUnsignedNumericValue<char16_t>(/*inout*/ text, base); } | |
| uint32_t ReadUnsignedNumericValue(/*inout*/ std::span<char32_t const>& text, _In_range_(2, 36) uint32_t base) { return ReadUnsignedNumericValue<char32_t>(/*inout*/ text, base); } | |
| void UnescapeCppUniversalCharacterNames( | |
| std::span<char16_t const> escapedText, | |
| /*out*/ std::u16string& expandedText | |
| ) | |
| { | |
| minimal_span<char16_t const> currentEscapedText(escapedText); | |
| expandedText.clear(); | |
| expandedText.reserve(currentEscapedText.size()); | |
| while (!currentEscapedText.empty()) | |
| { | |
| char16_t ch = currentEscapedText.consume_front(); | |
| // Check escape codes. | |
| if (ch == '\\' && !currentEscapedText.empty()) | |
| { | |
| char32_t replacement = L'\\'; | |
| char16_t code = currentEscapedText.front(); | |
| switch (code) | |
| { | |
| case 'a': replacement = 0x0007; currentEscapedText.pop_front(); break; // Alert (Beep, Bell) | |
| case 'b': replacement = 0x0008; currentEscapedText.pop_front(); break; // Backspace | |
| case 'f': replacement = 0x000C; currentEscapedText.pop_front(); break; // Formfeed | |
| case 'n': replacement = 0x000A; currentEscapedText.pop_front(); break; // Newline (Line Feed) | |
| case 'r': replacement = 0x000D; currentEscapedText.pop_front(); break; // Carriage Return | |
| case 't': replacement = 0x0009; currentEscapedText.pop_front(); break; // Horizontal Tab | |
| case 'v': replacement = 0x000B; currentEscapedText.pop_front(); break; // Vertical Tab | |
| case '\\': replacement = 0x005C; currentEscapedText.pop_front(); break; // Backslash | |
| case '\'': replacement = 0x0027; currentEscapedText.pop_front(); break; // Single quotation mark | |
| case '\"': replacement = 0x0022; currentEscapedText.pop_front(); break; // Double quotation mark | |
| case '?': replacement = 0x003F; currentEscapedText.pop_front(); break; // Question mark | |
| case L'x': | |
| case L'u': | |
| case L'U': | |
| { | |
| size_t expectedHexSequenceLength = (code == 'U') ? 8 : 4; | |
| char16_t const* escapeStart = currentEscapedText.data() + 1; // Skip the 'x' 'u' 'U' | |
| char16_t const* escapeEnd = std::min(escapeStart + expectedHexSequenceLength, currentEscapedText.data_end()); | |
| std::span<char16_t const> digitSpan = {escapeStart, escapeEnd}; | |
| // Parse the number. | |
| if (digitSpan.size() >= expectedHexSequenceLength) | |
| { | |
| char32_t hexValue = ReadUnsignedNumericValue(/*inout*/ digitSpan, 16); | |
| if (digitSpan.empty()) // Completely read the sequence. | |
| { | |
| replacement = hexValue; | |
| currentEscapedText.reset(digitSpan.data(), currentEscapedText.end()); | |
| } | |
| } | |
| // Else parse error. So keep '\' to preserve original text. | |
| } | |
| break; | |
| // Anything else yields a '\', preserving the original text. | |
| // Silly octal is not supported. | |
| } | |
| if (IsCharacterBeyondBmp(replacement)) | |
| { | |
| expandedText.push_back(GetLeadingSurrogate(replacement)); | |
| expandedText.push_back(GetTrailingSurrogate(replacement)); | |
| } | |
| else | |
| { | |
| expandedText.push_back(char16_t(replacement)); | |
| } | |
| } | |
| else // Just append ordinary code unit. | |
| { | |
| expandedText.push_back(ch); | |
| } | |
| } | |
| } | |
| void EscapeCppUniversalCharacterNames( | |
| std::span<char16_t const> text, | |
| /*out*/ std::u16string& escapedText | |
| ) | |
| { | |
| constexpr size_t escapePrefixLength = 2; // \u or \U | |
| constexpr size_t shortEscapeDigitLength = 4; | |
| constexpr size_t longEscapeDigitLength = 8; | |
| char16_t shortEscapedSequence[6] = {'\\','u','0','0','0','0'}; | |
| char16_t longEscapedSequence[10] = {'\\','U','0','0','0','0','0','0','0','0'}; | |
| escapedText.clear(); | |
| escapedText.reserve(text.size() * std::size(shortEscapedSequence)); | |
| std::span<char16_t> shortDigitRange(&shortEscapedSequence[escapePrefixLength], &shortEscapedSequence[escapePrefixLength + shortEscapeDigitLength]); | |
| std::span<char16_t> longDigitRange(&longEscapedSequence[escapePrefixLength], &longEscapedSequence[escapePrefixLength + longEscapeDigitLength]); | |
| for (Utf16CharacterReader reader(text); !reader.IsAtEnd(); ) | |
| { | |
| char32_t ch = reader.ReadNext(); | |
| if (IsCharacterBeyondBmp(ch)) | |
| { | |
| // Write surrogate pair. | |
| WriteZeroPaddedHexNum(ch, /*out*/ longDigitRange); | |
| escapedText.append(std::begin(longEscapedSequence), std::end(longEscapedSequence)); | |
| } | |
| else // Single UTF-16 code unit. | |
| { | |
| WriteZeroPaddedHexNum(ch, /*out*/ shortDigitRange); | |
| escapedText.append(std::begin(shortEscapedSequence), std::end(shortEscapedSequence)); | |
| } | |
| } | |
| } | |
| void EscapeHtmlNamedCharacterReferences( | |
| std::span<char16_t const> text, | |
| /*out*/ std::u16string& escapedText | |
| ) | |
| { | |
| constexpr size_t escapePrefixLength = 3; // '&#x' | |
| constexpr size_t shortEscapeDigitLength = 4; | |
| constexpr size_t longEscapeDigitLength = 8; | |
| constexpr size_t escapeSuffixLength = 1; // ; | |
| char16_t shortEscapedSequence[8] = {'&','#','x','0','0','0','0',';'}; | |
| char16_t longEscapedSequence[12] = {'&','#','x','0','0','0','0','0','0','0','0',';'}; | |
| escapedText.clear(); | |
| escapedText.reserve(text.size() * std::size(shortEscapedSequence)); | |
| std::span<char16_t> shortDigitRange(shortEscapedSequence + escapePrefixLength, shortEscapedSequence + escapePrefixLength + shortEscapeDigitLength); | |
| std::span<char16_t> longDigitRange(longEscapedSequence + escapePrefixLength, longEscapedSequence + escapePrefixLength + longEscapeDigitLength); | |
| for (Utf16CharacterReader reader(text); !reader.IsAtEnd(); ) | |
| { | |
| char32_t ch = reader.ReadNext(); | |
| if (IsCharacterBeyondBmp(ch)) | |
| { | |
| // Write surrogate pair. | |
| WriteZeroPaddedHexNum(ch, /*out*/ longDigitRange); | |
| escapedText.append(std::begin(longEscapedSequence), std::end(longEscapedSequence)); | |
| } | |
| else // Single UTF-16 code unit. | |
| { | |
| WriteZeroPaddedHexNum(ch, /*out*/ shortDigitRange); | |
| escapedText.append(std::begin(shortEscapedSequence), std::end(shortEscapedSequence)); | |
| } | |
| } | |
| } | |
| void UnescapeHtmlNamedCharacterReferences(std::span<char16_t const> escapedText, /*out*/ std::u16string& expandedText) | |
| { | |
| minimal_span<char16_t const> currentEscapedText(escapedText); | |
| expandedText.clear(); | |
| expandedText.reserve(currentEscapedText.size()); | |
| while (!currentEscapedText.empty()) | |
| { | |
| char16_t ch = currentEscapedText.consume_front(); | |
| // Check escape codes. | |
| if (ch == '&' && !currentEscapedText.empty()) | |
| { | |
| char32_t replacement = L'&'; | |
| char16_t const* escapeStart = currentEscapedText.data(); | |
| char16_t const* escapeEnd = escapeStart; | |
| // Only numeric escapes are supported: Ӓᨫ | |
| // Not named ones: & | |
| if (*escapeStart == '#') | |
| { | |
| uint32_t radix = 10; // Assume decimal, unless 'x' follows. | |
| ++escapeStart; | |
| if (escapeStart < currentEscapedText.data_end() && *escapeStart == 'x') | |
| { | |
| radix = 16; // Hexadecimal. | |
| ++escapeStart; | |
| } | |
| // Parse the number, and replacing on error with just a '\' to preserve original text. | |
| std::span<char16_t const> digitSpan = {escapeStart, currentEscapedText.end()}; | |
| replacement = ReadUnsignedNumericValue(/*inout*/ digitSpan, radix); | |
| // Successful if the digits were not empty and a semicolon was present. | |
| if (digitSpan.data() > currentEscapedText.data() && !digitSpan.empty() && digitSpan.front() == ';') | |
| { | |
| currentEscapedText = {digitSpan.data() + 1, currentEscapedText.data_end()}; // After the semicolon. | |
| } | |
| else // Parse error. So restore '\' to preserve original text. | |
| { | |
| replacement = L'\\'; | |
| } | |
| } | |
| if (IsCharacterBeyondBmp(replacement)) | |
| { | |
| expandedText.push_back(GetLeadingSurrogate(replacement)); | |
| expandedText.push_back(GetTrailingSurrogate(replacement)); | |
| } | |
| else | |
| { | |
| expandedText.push_back(char16_t(replacement)); | |
| } | |
| } | |
| else // Just append ordinary code unit. | |
| { | |
| expandedText.push_back(ch); | |
| } | |
| } | |
| } | |
| void EscapeFilenameCharactersPercentEncoding( | |
| std::span<char8_t const> text, | |
| /*out*/ std::u8string& escapedText, | |
| char8_t escapeCharacter = '%' // $ might be another useful option, for Javascript variable names. | |
| ) | |
| { | |
| escapedText.clear(); | |
| escapedText.reserve(text.size()); | |
| constexpr size_t escapePrefixLength = 1; // '%' | |
| constexpr size_t shortEscapeDigitLength = 2; | |
| char8_t shortEscapedSequence[3] = {escapeCharacter,'0','0'}; | |
| std::span<char8_t> shortDigitRange(shortEscapedSequence + escapePrefixLength, shortEscapedSequence + escapePrefixLength + shortEscapeDigitLength); | |
| for (char8_t ch : text) | |
| { | |
| switch (ch) | |
| { | |
| case '*': | |
| case '?': | |
| case '/': | |
| case '|': | |
| case '\\': | |
| case ':': | |
| case '<': | |
| case '>': | |
| case '"': | |
| WriteZeroPaddedHexNum(ch, /*out*/ shortDigitRange); | |
| escapedText.append(std::begin(shortEscapedSequence), std::end(shortEscapedSequence)); | |
| break; | |
| default: | |
| escapedText.push_back(ch); | |
| break; | |
| } | |
| } | |
| } | |
| void UnescapeFilenameCharactersPercentEncoding( | |
| std::span<char8_t const> escapedText, | |
| /*out*/ std::u8string& expandedText, | |
| char8_t escapeCharacter = '%' // $ might be another useful option, for Javascript variable names. | |
| ) | |
| { | |
| expandedText.clear(); | |
| expandedText.reserve(escapedText.size()); | |
| minimal_span<char8_t const> currentEscapedText(escapedText); | |
| constexpr size_t escapePrefixLength = 1; // '%' | |
| constexpr size_t shortEscapeDigitLength = 2; | |
| while (!currentEscapedText.empty()) | |
| { | |
| char8_t ch = currentEscapedText.consume_front(); | |
| // Read the following two digit hex code (e.g. hello%2Fworld -> hello/world). | |
| if (ch == escapeCharacter) | |
| { | |
| std::span<char8_t const> digitSpan = ClampedSubspan<char8_t const>(currentEscapedText, escapePrefixLength, shortEscapeDigitLength); | |
| //currentEscapedText.subspan_clamped(escapePrefixLength, shortEscapeDigitLength); | |
| char32_t replacement = ReadUnsignedNumericValue(/*inout*/ digitSpan, /*radix*/ 16); | |
| if (digitSpan.size() == shortEscapeDigitLength) | |
| { | |
| ch = char8_t(replacement); | |
| currentEscapedText.remove_prefix(shortEscapeDigitLength); | |
| } | |
| } | |
| expandedText.push_back(ch); | |
| } | |
| } | |
| _Out_range_(0, utf32text.end_ - utf32text.begin_) | |
| size_t ConvertTextUtf16ToUtf32( | |
| std::span<char16_t const> utf16text, | |
| /*out*/ std::span<char32_t> utf32text, | |
| _Out_opt_ size_t* sourceCount | |
| ) noexcept | |
| { | |
| // Convert all code points, substituting the replacement character for unpaired surrogates. | |
| Utf16CharacterReader reader(utf16text); | |
| size_t utf32count = utf32text.size(); | |
| size_t utf32index = 0; | |
| for (; !reader.IsAtEnd() && utf32index < utf32count; ++utf32index) | |
| { | |
| char32_t ch = reader.ReadNext(); | |
| utf32text[utf32index] = ch; | |
| } | |
| // Return how many UTF-16 code units and UTF-32 units were read/written. | |
| // Might have more UTF16 code units than UTF32, but never the other way around. | |
| if (sourceCount != nullptr) | |
| *sourceCount = reader.size(); | |
| return utf32index; | |
| } | |
| _Out_range_(0, utf32text.end_ - utf32text.begin_) | |
| size_t ConvertTextUtf16ToUtf32NoReplacement( | |
| std::span<char16_t const> utf16text, | |
| /*out*/ std::span<char32_t> utf32text, | |
| _Out_opt_ size_t* sourceCount | |
| ) noexcept | |
| { | |
| // Can have more UTF16 characters than UTF32, | |
| // but never the other way around. | |
| Utf16CharacterReader reader(utf16text); | |
| size_t const utf32count = utf32text.size(); | |
| size_t utf32index = 0; | |
| for (; !reader.IsAtEnd() && utf32index < utf32count; ++utf32index) | |
| { | |
| utf32text[utf32index] = reader.ReadNextNoReplacement(); | |
| } | |
| if (sourceCount != nullptr) | |
| *sourceCount = reader.size(); | |
| return utf32index; | |
| } | |
| _Out_range_(0, utf16text.end_ - utf16text.begin_) | |
| size_t ConvertTextUtf32ToUtf16( | |
| std::span<char32_t const> utf32text, | |
| /*out*/ std::span<char16_t> utf16text, | |
| _Out_opt_ size_t* sourceCount | |
| ) noexcept | |
| { | |
| size_t const utf32count = utf32text.size(); | |
| size_t utf32index = 0; | |
| Utf16CharacterWriter writer(utf16text); | |
| for (; !writer.IsAtEnd() && utf32index < utf32count; ++utf32index) | |
| { | |
| writer.WriteNext(utf32text[utf32index]); | |
| } | |
| if (sourceCount != nullptr) | |
| *sourceCount = utf32index; | |
| return writer.size(); | |
| } | |
| _Out_range_(0, utf16text.end_ - utf16text.begin_) | |
| size_t ConvertTextUtf32ToUtf8( | |
| std::span<char32_t const> utf32text, | |
| /*out*/ std::span<char8_t> utf8text, | |
| _Out_opt_ size_t* sourceCount | |
| ) noexcept | |
| { | |
| size_t const utf32count = utf32text.size(); | |
| size_t utf32index = 0; | |
| Utf8CharacterWriter writer(utf8text); | |
| for (; !writer.IsAtEnd() && utf32index < utf32count; ++utf32index) | |
| { | |
| writer.WriteNext(utf32text[utf32index]); | |
| } | |
| if (sourceCount != nullptr) | |
| *sourceCount = utf32index; | |
| return writer.size(); | |
| } | |
| void ConvertTextUtf8ToUtf16( | |
| std::span<char8_t const> utf8text, | |
| /*out*/ std::u16string& utf16text | |
| ) | |
| { | |
| // This function can only throw if out-of-memory when resizing utf16text. | |
| // If utf16text is already reserve()'d, no exception will happen. | |
| Utf8CharacterReader reader(utf8text); | |
| reader.SkipBom(); | |
| utf16text.resize(reader.size()); // UTF-16 (1-2 code units) will always have equal or fewer code units than UTF-8 (1-4 code units). | |
| Utf16CharacterWriter writer(utf16text); | |
| while (!reader.IsAtEnd()) | |
| { | |
| assert(!writer.IsAtEnd()); | |
| writer.WriteNext(reader.ReadNext()); | |
| } | |
| utf16text.resize(writer.size()); // Shrink back to actual size. | |
| } | |
| void ConvertTextUtf8ToUtf32( | |
| std::u8string_view utf8text, | |
| /*out*/ std::u32string& utf32text | |
| ) | |
| { | |
| // This function can only throw if out-of-memory when resizing u32string. | |
| // If u32string is already reserve()'d, no exception will happen. | |
| Utf8CharacterReader reader(utf8text); | |
| reader.SkipBom(); | |
| utf32text.resize(reader.size()); // UTF-8 (1-4 code units) will always have equal or fewer code units than UTF-32 (1 code unit). | |
| Utf32CharacterWriter writer(utf32text); | |
| while (!reader.IsAtEnd()) | |
| { | |
| assert(!writer.IsAtEnd()); | |
| writer.WriteNext(reader.ReadNext()); | |
| } | |
| utf32text.resize(writer.size()); // Shrink back to actual size. | |
| } | |
| void ConvertTextUtf16ToUtf8( | |
| std::span<char16_t const> utf16text, | |
| /*out*/ std::u8string& utf8text | |
| ) | |
| { | |
| // C++ deprecated codecvt_utf8_utf16 but offered no successor in its place. | |
| // | |
| // std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> g_converterToUtf8; | |
| // std::string temporary = g_converterToUtf8.to_bytes(source.data(), source.data() + source.size()); | |
| // | |
| // So implement it directly. | |
| // Allow double the space for the output, as UTF-16 can't yield | |
| // more than double the UTF-8 unit count: | |
| // | |
| // Code point UTF-16 UTF-8 Factor | |
| // code unit count | |
| // U+0 through U+7F 1 1 1x | |
| // U+80 through U+7FF 1 2 2x | |
| // U+800 through U+FFFF 2 3 1.5x | |
| // U+0800 through U+10FFFF 2 4 2x | |
| utf8text.resize(utf16text.size() * 2); // Preallocate up to 2 UTF-8 code units per UTF-16 code unit. | |
| Utf16CharacterReader reader(utf16text); | |
| Utf8CharacterWriter writer(utf8text); | |
| while (!reader.IsAtEnd()) | |
| { | |
| assert(!writer.IsAtEnd()); | |
| writer.WriteNext(reader.ReadNext()); | |
| } | |
| utf8text.resize(writer.size()); | |
| } | |
| void ConvertTextUtf16ToUtf32( | |
| std::span<char16_t const> utf16text, | |
| /*out*/ std::u32string& utf32text | |
| ) | |
| { | |
| utf32text.resize(utf16text.size()); // UTF-32 (1 code unit) will always have equal or fewer code units than UTF-16 (1-2 code units). | |
| Utf16CharacterReader reader(utf16text); | |
| Utf32CharacterWriter writer(utf32text); | |
| while (!reader.IsAtEnd()) | |
| { | |
| assert(!writer.IsAtEnd()); | |
| writer.WriteNext(reader.ReadNext()); | |
| } | |
| utf32text.resize(writer.size()); | |
| } | |
| void ConvertTextUtf32ToUtf8( | |
| std::span<char32_t const> utf32text, | |
| /*out*/ std::u8string& utf8text | |
| ) | |
| { | |
| utf8text.resize(utf32text.size() * 4); // UTF-8 could expand up to 4 code units. | |
| Utf32CharacterReader reader(utf32text); | |
| Utf8CharacterWriter writer(utf8text); | |
| while (!reader.IsAtEnd()) | |
| { | |
| assert(!writer.IsAtEnd()); | |
| writer.WriteNext(reader.ReadNext()); | |
| } | |
| utf8text.resize(writer.size()); | |
| } | |
| void ConvertTextUtf32ToUtf16( | |
| std::span<char32_t const> utf32text, | |
| /*out*/ std::u16string& utf16text | |
| ) | |
| { | |
| utf16text.resize(utf32text.size() * 2); | |
| Utf32CharacterReader reader(utf32text); | |
| Utf16CharacterWriter writer(utf16text); | |
| while (!reader.IsAtEnd()) | |
| { | |
| assert(!writer.IsAtEnd()); | |
| writer.WriteNext(reader.ReadNext()); | |
| } | |
| utf16text.resize(writer.size()); | |
| } | |
| inline std::u16string ToUtf16String(std::span<char8_t const> source) | |
| { | |
| std::u16string dest; | |
| ConvertTextUtf8ToUtf16(source, dest); | |
| return dest; | |
| } | |
| inline std::u8string ToUtf8String(std::span<char16_t const> source) | |
| { | |
| std::u8string dest; | |
| ConvertTextUtf16ToUtf8(source, dest); | |
| return dest; | |
| } | |
#ifdef _WIN32
// Windows-only convenience wrapper: treats the wchar_t text as UTF-16 code units
// (the span is reinterpreted as char16_t) and returns its UTF-8 conversion by value.
inline std::u8string ToUtf8String(std::span<wchar_t const> source)
{
    auto const utf16View = reinterpret_span<char16_t const>(source);

    std::u8string converted;
    ConvertTextUtf16ToUtf8(utf16View, /*out*/ converted);
    return converted;
}
#endif // _WIN32
| struct StringAndIndex | |
| { | |
| char8_t const* text; // Null terminated. | |
| uint32_t index; | |
| }; | |
| std::optional<uint32_t> TryMapStringToIndex(std::u8string_view text, std::span<const StringAndIndex> list) noexcept | |
| { | |
| for (StringAndIndex const& item : list) | |
| { | |
| if (item.text == text) | |
| { | |
| return item.index; | |
| } | |
| } | |
| return {}; | |
| } | |
| template<typename T> | |
| std::optional<T> TryMapStringToIndex(std::u8string_view mode, std::span<const StringAndIndex> nameAndIndexList) noexcept | |
| { | |
| static_assert(sizeof(T) == sizeof(uint32_t)); | |
| auto result = TryMapStringToIndex(mode, nameAndIndexList); | |
| return *reinterpret_cast<std::optional<T>*>(std::addressof(result)); | |
| } | |
| template<typename T> | |
| T MapStringToIndex(std::u8string_view mode, std::span<const StringAndIndex> nameAndIndexList, T defaultValue) noexcept | |
| { | |
| auto result = TryMapStringToIndex(mode, nameAndIndexList); | |
| return result ? T(*result) : defaultValue; | |
| } | |
| template <typename CharType> | |
| void ToLowercase(/*inout*/ std::span<CharType> text) noexcept | |
| { | |
| for (CharType& c : text) | |
| { | |
| // TODO: Extend this to other languages besides English? | |
| // It's currently only used for English keywords, and salient case conversions are 1:1 (the German double S is irrelevant now) | |
| c = static_cast<CharType>(::tolower(c)); | |
| } | |
| } | |
| void ToLowercase(/*inout*/ std::span<char> text) { return ToLowercase<char>(text); }; | |
| void ToLowercase(/*inout*/ std::span<char8_t> text) { return ToLowercase<char8_t>(text); }; | |
| void ToLowercase(/*inout*/ std::span<char16_t> text) { return ToLowercase<char16_t>(text); }; | |
| template <typename CharType> | |
| void ToUpperCase(/*inout*/ std::span<CharType> text) | |
| { | |
| for (CharType& c : text) | |
| { | |
| c = static_cast<CharType>(::toupper(c)); | |
| } | |
| } | |
| void ToUpperCase(/*inout*/ std::span<char> text) { return ToUpperCase<char>(text); }; | |
| void ToUpperCase(/*inout*/ std::span<char8_t> text) { return ToUpperCase<char8_t>(text); }; | |
| void ToUpperCase(/*inout*/ std::span<char16_t> text) { return ToUpperCase<char16_t>(text); }; | |
| std::optional<std::u8string_view> TryMapIndexToString(uint32_t index, std::span<const StringAndIndex> nameAndIndexList) noexcept | |
| { | |
| for (auto& nameAndIndex : nameAndIndexList) | |
| { | |
| if (nameAndIndex.index == index) | |
| { | |
| return nameAndIndex.text; | |
| } | |
| } | |
| return {}; | |
| } | |
| template<typename T> | |
| std::optional<T> TryMapIndexToString(T index, std::span<const StringAndIndex> nameAndIndexList) noexcept | |
| { | |
| static_assert(sizeof(T) == sizeof(uint32_t)); | |
| return TryMapIndexToString(static_cast<uint32_t>(index), nameAndIndexList); | |
| } | |
| template<typename T> | |
| std::u8string_view MapIndexToString(T index, std::span<const StringAndIndex> nameAndIndexList, std::u8string_view defaultValue) noexcept | |
| { | |
| auto result = TryMapIndexToString(uint32_t(index), nameAndIndexList); | |
| return result ? *result : defaultValue; | |
| } | |
| export uint32_t MapStringSuffixIcaseToIndex( | |
| std::u8string_view text, | |
| std::span<const StringAndIndex> nameAndIndexList, | |
| uint32_t defaultValue | |
| ) | |
| { | |
| std::u8string lowerCaseFilename(text.begin(), text.end()); | |
| ToLowercase(/*inout*/ lowerCaseFilename); | |
| for (auto const& entry : nameAndIndexList) | |
| { | |
| if (lowerCaseFilename.ends_with(entry.text)) | |
| { | |
| return entry.index; | |
| } | |
| } | |
| return defaultValue; | |
| } | |
// Removes leading and trailing characters that appear in spaces from text, in place.
//
// text   - string to trim.
// spaces - null terminated set of characters to strip. Callers in this file pass space
//          (U+0020) and tab; it does not cover all whitespace, like U+200X or the
//          new line controls.
//
// Text consisting only of characters from spaces becomes empty.
template <typename StringType, typename StringCharType>
void TrimSpaces(/*inout*/ StringType& text, /*nullterminated*/ StringCharType const* spaces)
{
    // Cut everything after the last character that should be kept.
    if (size_t const lastKeptIndex = text.find_last_not_of(spaces); lastKeptIndex != std::string::npos)
    {
        text.erase(lastKeptIndex + 1);
    }

    // Cut everything before the first character that should be kept
    // (the whole string when there is nothing to keep).
    size_t const firstKeptIndex = text.find_first_not_of(spaces);
    size_t const leadingCount = (firstKeptIndex == std::string::npos) ? text.size() : firstKeptIndex;
    if (leadingCount != 0)
    {
        text.erase(0, leadingCount);
    }
}
| void TrimSpaces(/*inout*/ std::string text) { return TrimSpaces<std::string>(text, " \t"); } | |
| void TrimSpaces(/*inout*/ std::u8string text) { return TrimSpaces<std::u8string>(text, u8" \t"); } | |
| void TrimSpaces(/*inout*/ std::u16string& text) { return TrimSpaces<std::u16string>(text, u" \t"); } | |
| template <typename StringType> | |
| void UnquoteString(/*inout*/ StringType& text) | |
| requires requires (StringType& text) { text.empty(); text.back(); text.pop_back(); text.front(); text.erase(); } | |
| { | |
| if (text.empty()) | |
| return; | |
| if (text.back() == '\"') | |
| { | |
| text.pop_back(); | |
| } | |
| if (text.empty()) | |
| return; | |
| if (text.front() == '\"') | |
| { | |
| text.erase(0, 1); | |
| } | |
| } | |
| void UnquoteString(/*inout*/ std::string& text) { return UnquoteString<std::string>(text); } | |
| void UnquoteString(/*inout*/ std::u8string& text) { return UnquoteString<std::u8string>(text); } | |
| void UnquoteString(/*inout*/ std::u16string& text) { return UnquoteString<std::u16string>(text); } | |
| // Useful for reconcatenating main's argc and argv[]. | |
| template <typename CharType, typename StringType, typename ViewType> | |
| StringType ConcatenateStrings(std::span<CharType const* const> stringList, ViewType delimiter) | |
| { | |
| StringType concatenatedString; | |
| for (CharType const* s : stringList) | |
| { | |
| if (!concatenatedString.empty()) | |
| { | |
| concatenatedString.append(delimiter); | |
| } | |
| concatenatedString.append(s); | |
| } | |
| return concatenatedString; | |
| } | |
| std::string ConcatenateStrings(std::span<char const* const> stringList, std::string_view delimiter = " ") | |
| { | |
| return ConcatenateStrings<char, std::string, std::string_view>(stringList, delimiter); | |
| } | |
| std::u8string ConcatenateStrings(std::span<char8_t const* const> stringList, std::u8string_view delimiter = u8" ") | |
| { | |
| return ConcatenateStrings<char8_t, std::u8string, std::u8string_view>(stringList, delimiter); | |
| } | |
| std::u16string ConcatenateStrings(std::span<char16_t const* const> stringList, std::u16string_view delimiter = u" ") | |
| { | |
| return ConcatenateStrings<char16_t, std::u16string, std::u16string_view>(stringList, delimiter); | |
| } | |
| // Helper typedef class to resolve a code unit to type the respective reader type. | |
| // Because we're only targeting Unicode (no Shift-JIS, Big 5, or other), then we can simplify | |
| // the specialization by code unit byte size. | |
| template <size_t CharacterTypeByteSize> struct CharacterReaderResolver; | |
| template <> struct CharacterReaderResolver<1> { using Type = Utf8CharacterReader; }; | |
| template <> struct CharacterReaderResolver<2> { using Type = Utf16CharacterReader; }; | |
| template <> struct CharacterReaderResolver<4> { using Type = Utf32CharacterReader; }; | |
| // Compare two Unicode strings of possibly different encodings. | |
| template <typename CharTypeA, typename CharTypeB> | |
| bool CompareStringSpans(std::span<CharTypeA const> a, std::span<CharTypeB const> b) | |
| { | |
| if constexpr (sizeof(CharTypeA) == sizeof(CharTypeB)) | |
| { | |
| // Can just compare code units directly, since they are the same size. | |
| return std::equal(a.data(), a.data() + a.size(), b.data(), b.data() + b.size()); | |
| } | |
| else // Strings are heterogeneous Unicode encodings. | |
| { | |
| typename CharacterReaderResolver<sizeof(CharTypeA)>::Type readerA(a); | |
| typename CharacterReaderResolver<sizeof(CharTypeB)>::Type readerB(b); | |
| while (true) | |
| { | |
| if (bool aIsDone = readerA.IsAtEnd(), bIsDone = readerB.IsAtEnd(); aIsDone || bIsDone) | |
| { | |
| return aIsDone == bIsDone; // Return false if reached the end of one string before the other. | |
| } | |
| if (readerA.ReadNext() != readerB.ReadNext()) | |
| { | |
| return false; | |
| } | |
| } | |
| return true; | |
| } | |
| } | |
| /* | |
| Usage: | |
| std::string a = "Hello"; // Encoding is actually UTF-8. | |
| std::u8string b = u8"Hello"; | |
| bool result = StringHelpers::CompareStrings(a, b); | |
| std::wstring a = L"Hello"; | |
| std::u8string b = u8"Hello"; | |
| bool result = StringHelpers::CompareStrings(a, b); | |
| std::u32string a = U"Hello"; | |
| std::string b = "Hello"; | |
| bool result = StringHelpers::CompareStrings(a, b); | |
| std::u16string a = u"Hello"; | |
| char32_t b[5] = {'H','e','l','l','o'}; | |
| bool result = StringHelpers::CompareStrings(a, std::spanb); | |
| */ | |
| template <typename StringTypeA, typename StringTypeB> | |
| inline bool CompareStrings(StringTypeA const& a, StringTypeB const& b) | |
| // TODO: Figure out why requires with std::data doesn't work as expected on raw C arrays. | |
| //requires requires (StringTypeA a, StringTypeB b) { std::data(a); std::size(a); std::data(b); std::size(b); } | |
| { | |
| // Sadly we can't just rely on template type deduction here because mutable spans then | |
| // no CompareStringSpans overload because CompareStringSpans expects span<T const>. | |
| using CharTypeA = std::remove_reference_t<decltype(*std::data(a))>; | |
| using CharTypeB = std::remove_reference_t<decltype(*std::data(b))>; | |
| return CompareStringSpans(std::span<CharTypeA const>(a), std::span<CharTypeB const>(b)); | |
| } | |
| } // namespace StringHelpers |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment