Last active
July 18, 2025 21:20
-
-
Save siberex/7684eadde46d5e119b3bc4c6d5ab6212 to your computer and use it in GitHub Desktop.
Simple toUpper() implementation for UTF-8 strings with stdlib only (without Boost or ICU)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * strings
 *
 * Author: Stephen Jingle <sib.li>
 * Created: 13 Jul 2025
 */
#include <cctype>
#include <cwctype>
#include <format>
#include <iostream>
#include <locale>
#include <ranges>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
/**
 * Split a UTF-8 string into individual code points (each represented as a string view, not a char).
 * See also: https://en.wikipedia.org/wiki/UTF-8#Description
 *
 * The lead byte announces how long the sequence should be; the sequence is then extended only
 * across bytes that really are continuation bytes (10xxxxxx). A truncated or otherwise malformed
 * sequence therefore ends at the first byte that does not belong to it, so the well-formed text
 * that follows is never swallowed. Stray continuation bytes (0x80-0xBF) and bytes that can never
 * start a sequence (0xF8-0xFF) are returned as 1-byte elements.
 *
 * For well-formed UTF-8 every returned element is exactly one encoded code point.
 *
 * @param str UTF-8 string / string_view
 * @return vector of string_views, in input order, that together cover every byte of `str`.
 *         The views point into the memory of `str`, which must outlive the returned vector.
 */
[[nodiscard]] auto splitIntoCodePoints(const std::string_view &str) -> std::vector<std::string_view> {
    std::vector<std::string_view> codePoints;
    size_t i = 0;
    while (i < str.size()) {
        // Read the leading byte as unsigned to avoid sign issues with plain char
        const auto leadByte = static_cast<unsigned char>(str[i]);

        // Sequence length announced by the leading byte.
        // Default covers plain ASCII (U+0000 to U+007F) as well as bytes that cannot lead a sequence.
        size_t expectedLen = 1;
        if (leadByte >= 0xF0 && leadByte <= 0xF7) { // 4-byte sequence (U+10000 to U+10FFFF), 11110***
            expectedLen = 4;
        } else if (leadByte >= 0xE0 && leadByte <= 0xEF) { // 3-byte sequence (U+0800 to U+FFFF), 1110****
            expectedLen = 3;
        } else if (leadByte >= 0xC0 && leadByte <= 0xDF) { // 2-byte sequence (U+0080 to U+07FF), 110*****
            expectedLen = 2;
        }

        // Extend over continuation bytes only, never past the end of the string
        size_t charLen = 1;
        while (charLen < expectedLen
               && i + charLen < str.size()
               && (static_cast<unsigned char>(str[i + charLen]) & 0xC0) == 0x80) {
            ++charLen;
        }

        // Add the substring view and advance the index
        codePoints.push_back(str.substr(i, charLen));
        i += charLen;
    }
    return codePoints;
}
// Decode one string-represented UTF-8 code point into a UTF-32 character (4-byte fixed width).
//
// codePoint: the bytes of exactly one encoded code point (1 to 4 bytes).
// Returns the decoded value, or U+FFFD (replacement character) when the input is not a
// well-formed encoding: empty or longer than 4 bytes, a lead byte that does not match the
// length, a non-continuation byte in a trailing position, an overlong form, a UTF-16
// surrogate (U+D800 to U+DFFF), or a value above U+10FFFF.
[[nodiscard]] auto decodeCodePoint(const std::string_view &codePoint) -> char32_t {
    constexpr char32_t replacement = 0xFFFD;

    const size_t len = codePoint.length();
    if (len == 0 || len > 4) {
        return replacement;
    }

    // Bytes are read through unsigned char so values >= 0x80 are never sign-extended
    const auto byteAt = [&codePoint](const size_t idx) -> char32_t {
        return static_cast<unsigned char>(codePoint[idx]);
    };

    const char32_t lead = byteAt(0);
    char32_t value = 0;    // payload bits accumulated so far
    char32_t minValue = 0; // smallest value that legitimately needs `len` bytes
    switch (len) {
        case 1:
            // Only ASCII is valid as a single byte
            return lead <= 0x7F ? lead : replacement;
        case 2:
            if ((lead & 0xE0) != 0xC0) {
                return replacement;
            }
            value = lead & 0x1F;
            minValue = 0x80;
            break;
        case 3:
            if ((lead & 0xF0) != 0xE0) {
                return replacement;
            }
            value = lead & 0x0F;
            minValue = 0x800;
            break;
        default: // 4
            if ((lead & 0xF8) != 0xF0) {
                return replacement;
            }
            value = lead & 0x07;
            minValue = 0x10000;
            break;
    }

    // Every trailing byte must look like 10xxxxxx and contributes 6 payload bits
    for (size_t idx = 1; idx < len; ++idx) {
        const char32_t trail = byteAt(idx);
        if ((trail & 0xC0) != 0x80) {
            return replacement;
        }
        value = (value << 6) | (trail & 0x3F);
    }

    const bool overlong = value < minValue;
    const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
    const bool outOfRange = value > 0x10FFFF;
    if (overlong || surrogate || outOfRange) {
        return replacement;
    }
    return value;
}
// Encode a UTF-32 character into a string-represented UTF-8 code point (1 to 4 bytes).
// See also: https://en.wikipedia.org/wiki/UTF-8#Description
//
// Values that are not Unicode scalar values - UTF-16 surrogates (U+D800 to U+DFFF) and anything
// above U+10FFFF - cannot be represented in well-formed UTF-8; for those the encoding of the
// replacement character U+FFFD is returned instead.
[[nodiscard]] auto encodeUTF8(const char32_t wChr) -> std::string {
    const bool isSurrogate = wChr >= 0xD800 && wChr <= 0xDFFF;
    if (isSurrogate || wChr > 0x10FFFF) {
        // U+FFFD spelled as explicit bytes so the result does not depend on the source file encoding
        return "\xEF\xBF\xBD";
    }

    std::string result;
    if (wChr <= 0x7F) {
        // U+0000 to U+007F, 1-byte
        result += static_cast<char>(wChr);
    } else if (wChr <= 0x7FF) {
        // U+0080 to U+07FF, 2-byte
        result += static_cast<char>(0xC0 | ((wChr >> 6) & 0x1F));
        result += static_cast<char>(0x80 | (wChr & 0x3F));
    } else if (wChr <= 0xFFFF) {
        // U+0800 to U+FFFF, 3-byte
        result += static_cast<char>(0xE0 | ((wChr >> 12) & 0x0F));
        result += static_cast<char>(0x80 | ((wChr >> 6) & 0x3F));
        result += static_cast<char>(0x80 | (wChr & 0x3F));
    } else {
        // U+10000 to U+10FFFF, 4-byte
        result += static_cast<char>(0xF0 | ((wChr >> 18) & 0x07));
        result += static_cast<char>(0x80 | ((wChr >> 12) & 0x3F));
        result += static_cast<char>(0x80 | ((wChr >> 6) & 0x3F));
        result += static_cast<char>(0x80 | (wChr & 0x3F));
    }
    return result;
}
// Function to convert UTF-8 encoded string to uppercase | |
// UTF-8 sequence → code points[] → wchar_t[] → toupper → join | |
std::string toUpperCase(const std::string_view &str, const std::locale &loc = std::locale()) { | |
std::string result; | |
const auto codePoints = splitIntoCodePoints(str); | |
for (const std::string_view &codePoint : codePoints) { | |
// Decode to 4-byte char | |
const char32_t cp = decodeCodePoint(codePoint); | |
// Cast to wide char | |
const wchar_t chr = static_cast<wchar_t>(cp); | |
// Convert to uppercase using std::locale::toupper | |
const wchar_t upperChr = std::toupper<wchar_t>(chr, loc); | |
// Cast back to UTF-32 char | |
const char32_t upperCp = static_cast<char32_t>(upperChr); | |
// Encode to UTF-8 code point | |
result += encodeUTF8(upperCp); | |
} | |
return result; | |
} | |
// Demo driver: prints a set of sample strings next to their uppercase conversion.
//
// GCC:
// g++ -std=c++20 str_to_upper.cpp -o /tmp/str_to_upper && /tmp/str_to_upper
// Clang:
// clang++ -std=c++20 -stdlib=libc++ str_to_upper.cpp -o /tmp/str_to_upper && /tmp/str_to_upper
int main() {
    // toUpperCase() defaults to the global locale, so install a UTF-8 one first.
    // Constructing a named locale throws std::runtime_error when it is not available;
    // in that case keep the current global locale and carry on instead of aborting.
    try {
        std::locale::global(std::locale("en_US.UTF-8"));
    } catch (const std::runtime_error &err) {
        std::cerr << "Warning: locale en_US.UTF-8 is unavailable (" << err.what()
                  << "); non-ASCII letters may not be converted\n";
    }

    // Test cases including multi-codepoint mappings
    std::string strTestUpper1 = "hello🌍world";
    std::string strTestUpper2 = "naïve café";
    std::string strTestUpper3 = "αβγδε"; // Greek
    std::string strTestUpper4 = "привет мир"; // Cyrillic
    std::string strTestUpper5 = "MixeD_CaSe1_ÄÖÜ #üñö!"; // Mixed latin
    // German with ß: the one-to-one mapping in toUpperCase() cannot expand it to "SS";
    // whether it becomes ẞ depends on the locale's case tables
    std::string strTestUpper6 = "Straße";
    std::string strTestUpper7 = "æon+早安øӕ"; // Mixed with ligatures U+00E6, U+04D5

    /*
     * Meanwhile with Boost:
     * #include <boost/locale.hpp>
     * boost::locale::to_upper(str, "en_US.UTF-8");
     */
    std::cout << std::format("Generic string: \"{0}\" → \"{1}\"\n", strTestUpper1, toUpperCase(strTestUpper1));
    std::cout << std::format("Latin diacritics: \"{0}\" → \"{1}\"\n", strTestUpper2, toUpperCase(strTestUpper2));
    std::cout << std::format("Greek: \"{0}\" → \"{1}\"\n", strTestUpper3, toUpperCase(strTestUpper3));
    std::cout << std::format("Cyrillic: \"{0}\" → \"{1}\"\n", strTestUpper4, toUpperCase(strTestUpper4));
    std::cout << std::format("Mixed: \"{0}\" → \"{1}\"\n", strTestUpper5, toUpperCase(strTestUpper5));
    std::cout << std::format("German: \"{0}\" → \"{1}\"\n", strTestUpper6, toUpperCase(strTestUpper6));
    std::cout << std::format("Ligatures mixed: \"{0}\" → \"{1}\"\n", strTestUpper7, toUpperCase(strTestUpper7));
    std::cout.flush();
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment