Skip to content

Instantly share code, notes, and snippets.

@siberex
Last active July 18, 2025 21:20
Show Gist options
  • Save siberex/7684eadde46d5e119b3bc4c6d5ab6212 to your computer and use it in GitHub Desktop.
Save siberex/7684eadde46d5e119b3bc4c6d5ab6212 to your computer and use it in GitHub Desktop.
Simple toUpper() implementation for UTF-8 strings with stdlib only (without Boost or ICU)
/**
* strings
*
* Author: Stephen Jingle <sib.li>
* Created: 13 Jul 2025
*/
#include <cctype>
#include <cstddef>
#include <cwctype>
#include <format>
#include <iostream>
#include <locale>
#include <ranges>
#include <sstream>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
/**
 * Split a UTF-8 string into individual code points (each represented as a string view, not a char).
 * See also: https://en.wikipedia.org/wiki/UTF-8#Description
 *
 * The input is not required to be well-formed. A sequence ends as soon as the
 * expected number of continuation bytes (10xxxxxx) has been consumed, a
 * non-continuation byte is met, or the string ends. Stray continuation bytes
 * and invalid lead bytes (0xF8..0xFF) are each returned as a 1-byte chunk, so a
 * broken sequence never swallows the valid characters that follow it.
 * Concatenating the returned views always reproduces the input exactly.
 *
 * @param str UTF-8 string / string_view
 * @return vector of string_views referencing each code point; the views borrow
 *         from the buffer behind `str` and do not own any data
 */
[[nodiscard]] auto splitIntoCodePoints(const std::string_view &str) -> std::vector<std::string_view> {
    std::vector<std::string_view> codePoints;

    std::size_t i = 0;
    while (i < str.size()) {
        // Read the leading byte as unsigned to avoid sign issues with plain char
        const auto leadByte = static_cast<unsigned char>(str[i]);

        // Determine the expected sequence length from the leading byte.
        // Default covers ASCII (U+0000 to U+007F) and every invalid lead byte.
        std::size_t expectedLen = 1;
        if (leadByte >= 0xF0 && leadByte <= 0xF7) {
            expectedLen = 4; // 4-byte sequence (U+10000 to U+10FFFF), 11110***
        } else if (leadByte >= 0xE0 && leadByte <= 0xEF) {
            expectedLen = 3; // 3-byte sequence (U+0800 to U+FFFF), 1110****
        } else if (leadByte >= 0xC0 && leadByte <= 0xDF) {
            expectedLen = 2; // 2-byte sequence (U+0080 to U+07FF), 110*****
        }

        // Consume only real continuation bytes, and never run past the end of the string
        std::size_t charLen = 1;
        while (charLen < expectedLen
               && i + charLen < str.size()
               && (static_cast<unsigned char>(str[i + charLen]) & 0xC0) == 0x80) {
            ++charLen;
        }

        // Add the substring view and advance the index
        codePoints.push_back(str.substr(i, charLen));
        i += charLen;
    }
    return codePoints;
}
/**
 * Decode a single string-represented UTF-8 code point into a UTF-32 character (4-byte fixed width).
 *
 * Every byte is read as unsigned, so the result does not depend on whether
 * plain `char` is signed on the target platform.
 *
 * @param codePoint bytes of exactly one UTF-8 encoded code point (1 to 4 bytes)
 * @return the decoded code point, or U+FFFD (replacement character) when the
 *         input is empty, longer than 4 bytes, has a lead byte that does not
 *         match its length, contains a byte that is not a continuation byte
 *         where one is required, is an overlong encoding, or decodes to a
 *         surrogate (U+D800..U+DFFF) or a value above U+10FFFF
 */
[[nodiscard]] auto decodeCodePoint(const std::string_view &codePoint) -> char32_t {
    constexpr char32_t kReplacement = 0xFFFD; // invalid input marker: U+FFFD

    // Fetch a byte as an unsigned value widened to char32_t
    const auto byteAt = [&codePoint](const std::size_t idx) -> char32_t {
        return static_cast<unsigned char>(codePoint[idx]);
    };

    const std::size_t length = codePoint.length();
    if (length == 0 || length > 4) {
        return kReplacement;
    }

    // Every byte after the first must look like 10xxxxxx
    for (std::size_t idx = 1; idx < length; ++idx) {
        if ((byteAt(idx) & 0xC0) != 0x80) {
            return kReplacement;
        }
    }

    const char32_t lead = byteAt(0);
    char32_t value = 0;
    char32_t minValue = 0; // smallest code point that legitimately needs this many bytes

    switch (length) {
        case 1:
            // Only ASCII is a valid single byte; 0x80..0xFF are stray bytes
            return lead <= 0x7F ? lead : kReplacement;
        case 2:
            if ((lead & 0xE0) != 0xC0) {
                return kReplacement;
            }
            value = ((lead & 0x1F) << 6)
                    | (byteAt(1) & 0x3F);
            minValue = 0x80;
            break;
        case 3:
            if ((lead & 0xF0) != 0xE0) {
                return kReplacement;
            }
            value = ((lead & 0x0F) << 12)
                    | ((byteAt(1) & 0x3F) << 6)
                    | (byteAt(2) & 0x3F);
            minValue = 0x800;
            break;
        default: // length == 4
            if ((lead & 0xF8) != 0xF0) {
                return kReplacement;
            }
            value = ((lead & 0x07) << 18)
                    | ((byteAt(1) & 0x3F) << 12)
                    | ((byteAt(2) & 0x3F) << 6)
                    | (byteAt(3) & 0x3F);
            minValue = 0x10000;
            break;
    }

    const bool isOverlong = value < minValue;
    const bool isSurrogate = value >= 0xD800 && value <= 0xDFFF;
    const bool isOutOfRange = value > 0x10FFFF;
    if (isOverlong || isSurrogate || isOutOfRange) {
        return kReplacement;
    }
    return value;
}
/**
 * Encode a UTF-32 character into a string-represented UTF-8 code point.
 * See also: https://en.wikipedia.org/wiki/UTF-8#Description
 *
 * @param wChr Unicode code point to encode
 * @return 1 to 4 bytes of UTF-8. Values that are not Unicode scalar values
 *         (surrogates U+D800..U+DFFF and anything above U+10FFFF) are encoded
 *         as the replacement character U+FFFD (bytes EF BF BD).
 */
[[nodiscard]] auto encodeUTF8(const char32_t wChr) -> std::string {
    std::string result;

    // Append the low 8 bits of `bits` as one byte
    const auto appendByte = [&result](const char32_t bits) {
        result.push_back(static_cast<char>(static_cast<unsigned char>(bits & 0xFF)));
    };

    const bool isSurrogate = wChr >= 0xD800 && wChr <= 0xDFFF;
    if (isSurrogate || wChr > 0x10FFFF) {
        // Not encodable as well-formed UTF-8: emit U+FFFD instead.
        // Spelled as explicit bytes so it does not depend on the source/execution charset.
        result = "\xEF\xBF\xBD";
    } else if (wChr <= 0x7F) {
        // U+0000 to U+007F, 1-byte
        appendByte(wChr);
    } else if (wChr <= 0x7FF) {
        // U+0080 to U+07FF, 2-byte
        appendByte(0xC0 | ((wChr >> 6) & 0x1F));
        appendByte(0x80 | (wChr & 0x3F));
    } else if (wChr <= 0xFFFF) {
        // U+0800 to U+FFFF, 3-byte
        appendByte(0xE0 | ((wChr >> 12) & 0x0F));
        appendByte(0x80 | ((wChr >> 6) & 0x3F));
        appendByte(0x80 | (wChr & 0x3F));
    } else {
        // U+10000 to U+10FFFF, 4-byte
        appendByte(0xF0 | ((wChr >> 18) & 0x07));
        appendByte(0x80 | ((wChr >> 12) & 0x3F));
        appendByte(0x80 | ((wChr >> 6) & 0x3F));
        appendByte(0x80 | (wChr & 0x3F));
    }
    return result;
}
/**
 * Convert a UTF-8 encoded string to uppercase.
 * UTF-8 sequence → code points[] → wchar_t → toupper → UTF-8 → join
 *
 * Mapping is strictly one code point to one code point, using the
 * std::ctype<wchar_t> facet of `loc`; mappings that expand to several
 * characters are not performed.
 *
 * @param str UTF-8 string / string_view
 * @param loc locale whose ctype<wchar_t> facet supplies the case mapping;
 *            defaults to a copy of the current global locale
 * @return new UTF-8 string with each code point replaced by its uppercase form
 */
std::string toUpperCase(const std::string_view &str, const std::locale &loc = std::locale()) {
    std::string result;
    result.reserve(str.size());

    // Look the facet up once instead of once per character
    const auto &caseFacet = std::use_facet<std::ctype<wchar_t>>(loc);

    for (const std::string_view &codePoint : splitIntoCodePoints(str)) {
        // Decode to 4-byte char
        const char32_t cp = decodeCodePoint(codePoint);

        // Cast to wide char
        const auto wide = static_cast<wchar_t>(cp);

        // Where wchar_t is narrower than 32 bits the cast above truncates code points
        // beyond its range; pass those (and anything past U+10FFFF) through unmapped
        // instead of upper-casing an unrelated character.
        if (cp > 0x10FFFF || static_cast<char32_t>(wide) != cp) {
            result += encodeUTF8(cp);
            continue;
        }

        // Convert to uppercase via the locale's facet, cast back to UTF-32, re-encode as UTF-8
        const auto upperCp = static_cast<char32_t>(caseFacet.toupper(wide));
        result += encodeUTF8(upperCp);
    }
    return result;
}
// GCC:
// g++ -std=c++20 str_to_upper.cpp -o /tmp/str_to_upper && /tmp/str_to_upper
// Clang:
// clang++ -std=c++20 -stdlib=libc++ str_to_upper.cpp -o /tmp/str_to_upper && /tmp/str_to_upper

// Demo driver: prints a set of sample strings next to their upper-cased form.
int main() {
    // toUpperCase() defaults to the global locale, so install a UTF-8 one first.
    // std::locale's constructor throws std::runtime_error when the named locale is
    // not installed; keep running with the current global locale in that case.
    try {
        std::locale::global(std::locale("en_US.UTF-8"));
    } catch (const std::runtime_error &err) {
        std::cerr << "Warning: locale en_US.UTF-8 is unavailable (" << err.what()
                  << "); non-ASCII letters may be left unchanged\n";
    }

    struct Sample {
        std::string_view label;
        std::string_view text;
    };

    // Test cases. Only one-to-one code point mappings are exercised by toUpperCase().
    static constexpr Sample kSamples[] = {
        {"Generic string", "hello🌍world"},
        {"Latin diacritics", "naïve café"},
        {"Greek", "αβγδε"},
        {"Cyrillic", "привет мир"},
        {"Mixed", "MixeD_CaSe1_ÄÖÜ #üñö!"}, // Mixed latin
        // German with ß: ideally capitalized as ẞ, but the result depends on the
        // locale's case tables and ß may be left unchanged
        {"German", "Straße"},
        {"Ligatures mixed", "æon+早安øӕ"}, // Mixed with ligatures U+00E6, U+04D5
    };

    /*
     * Meanwhile with Boost:
     * #include <boost/locale.hpp>
     * boost::locale::to_upper(str, "en_US.UTF-8");
     */
    for (const auto &[label, text] : kSamples) {
        std::cout << std::format("{0}: \"{1}\" → \"{2}\"\n", label, text, toUpperCase(text));
    }

    std::cout.flush();
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment