Skip to content

Instantly share code, notes, and snippets.

@siberex
Last active July 15, 2025 20:55
Show Gist options
  • Save siberex/9cb30540198a73e0ef324cc6be104724 to your computer and use it in GitHub Desktop.
Save siberex/9cb30540198a73e0ef324cc6be104724 to your computer and use it in GitHub Desktop.
[FAILED] Attempt to implement string toUpperCase with LLM
#include <string>
#include <string_view>
#include <locale>
#include <codecvt>
#include <algorithm>
// UTF-8 aware upper-casing driven by a small hand-written case table.
//
// Only ASCII, the Latin-1 Supplement, basic Greek and basic Cyrillic letters
// (plus a few individual code points) are mapped; every other code point is
// copied unchanged. Bytes that do not form well-formed UTF-8 are copied to the
// output verbatim, so malformed input is never re-encoded into something else.
class UTF8ToUpperCase {
private:
    // Per-thread storage backing the view returned by toUpperCase().
    static thread_local std::string result_buffer;

    // Marker returned by utf8_to_codepoint() for a byte that does not start a
    // well-formed sequence. It is above U+10FFFF, so it can never collide with
    // a successfully decoded code point.
    static constexpr char32_t kInvalidSequence = 0xFFFFFFFF;

    // True when `byte` has the 10xxxxxx layout of a continuation byte.
    static bool is_continuation(unsigned char byte) {
        return (byte & 0xC0) == 0x80;
    }

    // Decode the UTF-8 sequence that starts at `utf8_str`, of which `remaining`
    // bytes are readable.
    //
    // Returns {code point, bytes consumed}. Returns {0, 0} when `remaining` is 0.
    // Returns {kInvalidSequence, 1} when the first byte does not begin a
    // well-formed sequence (stray continuation byte, truncated sequence,
    // overlong form, surrogate, or value above U+10FFFF); the caller is expected
    // to copy that single byte through untouched.
    static std::pair<char32_t, size_t> utf8_to_codepoint(const char* utf8_str, size_t remaining) {
        if (remaining == 0) return {0, 0};

        // Work on unsigned bytes so the bit tests do not depend on char signedness.
        const auto* bytes = reinterpret_cast<const unsigned char*>(utf8_str);
        const unsigned char first = bytes[0];

        // ASCII (0xxxxxxx)
        if (first < 0x80) {
            return {static_cast<char32_t>(first), 1};
        }

        size_t length = 0;
        char32_t codepoint = 0;
        char32_t min_value = 0;  // smallest code point that legitimately needs `length` bytes
        if ((first & 0xE0) == 0xC0) {         // 110xxxxx
            length = 2;
            codepoint = first & 0x1F;
            min_value = 0x80;
        } else if ((first & 0xF0) == 0xE0) {  // 1110xxxx
            length = 3;
            codepoint = first & 0x0F;
            min_value = 0x800;
        } else if ((first & 0xF8) == 0xF0) {  // 11110xxx
            length = 4;
            codepoint = first & 0x07;
            min_value = 0x10000;
        } else {
            // Stray continuation byte or 0xF8..0xFF, which never appear in UTF-8.
            return {kInvalidSequence, 1};
        }

        // Sequence is cut off by the end of the input.
        if (remaining < length) return {kInvalidSequence, 1};

        for (size_t i = 1; i < length; ++i) {
            if (!is_continuation(bytes[i])) return {kInvalidSequence, 1};
            codepoint = static_cast<char32_t>((codepoint << 6) | (bytes[i] & 0x3F));
        }

        const bool overlong = codepoint < min_value;
        const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        if (overlong || surrogate || codepoint > 0x10FFFF) {
            return {kInvalidSequence, 1};
        }
        return {codepoint, length};
    }

    // Append the UTF-8 encoding of `codepoint` to `out`. Values that are not
    // Unicode scalar values (surrogates, or above U+10FFFF) are written as the
    // replacement character U+FFFD.
    static void append_utf8(std::string& out, char32_t codepoint) {
        const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
        if (surrogate || codepoint > 0x10FFFF) {
            codepoint = 0xFFFD;
        }

        if (codepoint < 0x80) {
            out.push_back(static_cast<char>(codepoint));
        } else if (codepoint < 0x800) {
            out.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else if (codepoint < 0x10000) {
            out.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
            out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else {
            out.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
            out.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
            out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        }
    }

    // Map one code point to its single-code-point uppercase form, or return it
    // unchanged when this table has no mapping for it.
    static char32_t simple_toupper(char32_t codepoint) {
        // ASCII a..z
        if (codepoint >= 'a' && codepoint <= 'z') {
            return codepoint - 'a' + 'A';
        }
        // Latin-1 Supplement U+00E0..U+00FE, skipping the division sign U+00F7.
        if (codepoint >= 0x00E0 && codepoint <= 0x00FE && codepoint != 0x00F7) {
            return codepoint - 0x20;
        }
        // Greek U+03B1..U+03C9. Final sigma (U+03C2) is excluded here because
        // subtracting 0x20 would land on the unassigned U+03A2; it is handled below.
        if (codepoint >= 0x03B1 && codepoint <= 0x03C9 && codepoint != 0x03C2) {
            return codepoint - 0x20;
        }
        // Cyrillic U+0430..U+044F -> U+0410..U+042F
        if (codepoint >= 0x0430 && codepoint <= 0x044F) {
            return codepoint - 0x20;
        }
        // Cyrillic U+0450..U+045F -> U+0400..U+040F
        if (codepoint >= 0x0450 && codepoint <= 0x045F) {
            return codepoint - 0x50;
        }
        // Individual mappings. U+0149 is deliberately absent: its uppercase form
        // is two code points (U+02BC followed by 'N'), which a one-code-point
        // mapping cannot express, so it is left unchanged rather than truncated.
        switch (codepoint) {
            case 0x00DF: return 0x1E9E;  // sharp s -> capital sharp s ("SS" would need two code points)
            case 0x00FF: return 0x0178;  // y with diaeresis -> capital Y with diaeresis
            case 0x017F: return 0x0053;  // long s -> S
            case 0x03C2: return 0x03A3;  // final sigma -> capital sigma
            case 0x1FBE: return 0x0399;  // Greek prosgegrammeni -> capital iota
            default:     return codepoint;
        }
    }

public:
    // Return an upper-cased copy of the UTF-8 text in `str`.
    //
    // The returned view points into a thread-local buffer owned by this class.
    // It stays valid only until the next call to toUpperCase() on the same
    // thread; copy it into a std::string to keep the result longer.
    // `str` may itself be a view returned by a previous call.
    static std::string_view toUpperCase(const std::string_view& str) {
        // Build into a local string first: `str` may alias result_buffer, so the
        // buffer must not be modified until the input has been fully read.
        std::string upper;
        upper.reserve(str.size());  // output is usually the same length as the input

        size_t pos = 0;
        while (pos < str.size()) {
            const auto [codepoint, length] = utf8_to_codepoint(str.data() + pos, str.size() - pos);
            if (length == 0) break;

            if (codepoint == kInvalidSequence) {
                // Not well-formed UTF-8: keep the offending byte exactly as it was.
                upper.append(str.data() + pos, length);
            } else {
                append_utf8(upper, simple_toupper(codepoint));
            }
            pos += length;
        }

        // Publish the result; the previous buffer contents are released when
        // `upper` goes out of scope.
        result_buffer.swap(upper);
        return std::string_view(result_buffer);
    }
};
// Thread-local storage for the result buffer
thread_local std::string UTF8ToUpperCase::result_buffer;
// Free-function entry point: upper-cases UTF-8 text by delegating to
// UTF8ToUpperCase::toUpperCase. The returned view refers to that class's
// thread-local result buffer, so it is only valid until the next call on the
// same thread.
std::string_view toUpperCase(const std::string_view& str) {
    const std::string_view uppercased = UTF8ToUpperCase::toUpperCase(str);
    return uppercased;
}
// Example usage and test
#include <iostream>
/*
PROMPT:
Write C++23 implementation for string toUpperCase() function for UTF-8 strings.
Do not use Boost or ICU.
Function signature: `std::string_view toUpperCase(const std::string_view &str); `
Use standard library methods like `locale::toupper`, function should be portable (should compile with either GCC, LLVM Clang or MSVC) and should work in the upcoming C++26 standard (do not use anything that is going to be removed in C++26).
Full Unicode compatibility is not required, but it should correctly convert strings like "naïve" (expected output is "NAÏVE" and not "NAïVE"), Greek and Cyrillic "мы ебали медведя" (expected output is "МЫ ЕБАЛИ МЕДВЕДЯ").
It should accept std::string_view (not std::wstring_view) and output std::string_view.
Consider using `std::toupper(chr, std::locale("en_US.UTF-8"));` from `#include <locale>`
*/
// g++ -std=c++23 toupper_claude.cpp -o /tmp/toupper_claude && /tmp/toupper_claude
int main() {
    // Sample inputs: plain ASCII, accented Latin, Cyrillic, Greek and a mixed string.
    const std::string samples[] = {
        "hello world",
        "naïve",
        "мы ебали медведя",
        "αβγδε",  // Greek lowercase
        "Mixed 123 ñoño",
    };

    // Print every sample next to its upper-cased form, one pair per line.
    for (const std::string& sample : samples) {
        std::cout << "Original: " << sample << " -> Upper: " << toUpperCase(sample) << std::endl;
    }
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment