Last active
July 15, 2025 20:55
-
-
Save siberex/9cb30540198a73e0ef324cc6be104724 to your computer and use it in GitHub Desktop.
[FAILED] Attempt to implement string toUpperCase with LLM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <string> | |
#include <string_view> | |
#include <locale> | |
#include <codecvt> | |
#include <algorithm> | |
// Upper-cases UTF-8 text using a small built-in table of simple (one code
// point to one code point) case mappings: ASCII, Latin-1, basic Greek and
// basic Cyrillic, plus a few individual characters. Anything outside the
// table is copied through unchanged.
//
// Bytes that do not form a well-formed UTF-8 sequence are copied to the output
// verbatim; they are never reinterpreted as Latin-1 code points.
class UTF8ToUpperCase {
private:
    // Backing storage for the view returned by toUpperCase(); one per thread.
    static thread_local std::string result_buffer;

    // Outcome of decoding one sequence from the input.
    struct Decoded {
        char32_t codepoint;  // Decoded scalar value; meaningful only when `valid`.
        size_t length;       // Bytes to advance (0 only when no input was left).
        bool valid;          // False for malformed, overlong or out-of-range input.
    };

    // Decodes the UTF-8 sequence starting at `utf8_str`, reading at most
    // `remaining` bytes. Truncated sequences, stray continuation bytes,
    // overlong encodings, surrogates and values above U+10FFFF are reported as
    // invalid with a length of 1, so the caller can pass that single byte
    // through and resynchronise on the next one.
    static Decoded utf8_to_codepoint(const char* utf8_str, size_t remaining) {
        if (remaining == 0) return Decoded{0, 0, false};

        const unsigned char lead = static_cast<unsigned char>(utf8_str[0]);
        if (lead < 0x80) {
            return Decoded{static_cast<char32_t>(lead), 1, true};
        }

        size_t length = 0;
        char32_t value = 0;
        char32_t smallest = 0;  // Smallest value that legitimately needs `length` bytes.
        if ((lead & 0xE0) == 0xC0) {
            length = 2; value = lead & 0x1F; smallest = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {
            length = 3; value = lead & 0x0F; smallest = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {
            length = 4; value = lead & 0x07; smallest = 0x10000;
        } else {
            // Continuation byte in lead position, or 0xF8..0xFF.
            return Decoded{0, 1, false};
        }

        if (remaining < length) return Decoded{0, 1, false};

        for (size_t i = 1; i < length; ++i) {
            const unsigned char next = static_cast<unsigned char>(utf8_str[i]);
            if ((next & 0xC0) != 0x80) return Decoded{0, 1, false};
            value = (value << 6) | static_cast<char32_t>(next & 0x3F);
        }

        const bool overlong = value < smallest;
        const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
        if (overlong || surrogate || value > 0x10FFFF) return Decoded{0, 1, false};

        return Decoded{value, length, true};
    }

    // Appends the UTF-8 encoding of `codepoint` to `out`. Values that are not
    // Unicode scalar values (surrogates, or above U+10FFFF) are written as
    // U+FFFD REPLACEMENT CHARACTER.
    static void append_utf8(std::string& out, char32_t codepoint) {
        if (codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
            codepoint = 0xFFFD;
        }
        if (codepoint < 0x80) {
            out.push_back(static_cast<char>(codepoint));
        } else if (codepoint < 0x800) {
            out.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else if (codepoint < 0x10000) {
            out.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
            out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        } else {
            out.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
            out.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
            out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
        }
    }

    // Returns the upper-case counterpart of `codepoint` for the ranges this
    // class knows about, or `codepoint` itself when there is no mapping.
    static char32_t simple_toupper(char32_t codepoint) {
        // ASCII a-z
        if (codepoint >= U'a' && codepoint <= U'z') {
            return codepoint - 0x20;
        }
        // Latin-1 Supplement U+00E0-U+00FE, skipping U+00F7 (division sign)
        if (codepoint >= 0x00E0 && codepoint <= 0x00FE && codepoint != 0x00F7) {
            return codepoint - 0x20;
        }
        // Cyrillic U+0430-U+044F
        if (codepoint >= 0x0430 && codepoint <= 0x044F) {
            return codepoint - 0x20;
        }
        // Cyrillic U+0450-U+045F map onto U+0400-U+040F
        if (codepoint >= 0x0450 && codepoint <= 0x045F) {
            return codepoint - 0x50;
        }
        // Greek final sigma: the range rule below would yield U+03A2, which is
        // unassigned; its capital is the ordinary capital sigma.
        if (codepoint == 0x03C2) {
            return 0x03A3;
        }
        // Greek U+03B1-U+03C9
        if (codepoint >= 0x03B1 && codepoint <= 0x03C9) {
            return codepoint - 0x20;
        }
        // Individual characters. U+0149 is deliberately absent: its upper-case
        // form is two code points (U+02BC followed by 'N'), which a one-to-one
        // mapping cannot express, so it is left unchanged.
        switch (codepoint) {
            case 0x00DF: return 0x1E9E;  // sharp s -> capital sharp s
            case 0x00FF: return 0x0178;  // y with diaeresis
            case 0x017F: return 0x0053;  // long s -> S
            case 0x1FBE: return 0x0399;  // Greek prosgegrammeni -> capital iota
            default: return codepoint;
        }
    }

public:
    // Returns an upper-cased copy of `str`.
    //
    // The returned view refers to a thread-local buffer owned by this class:
    // it stays valid only until the next call to toUpperCase() on the same
    // thread. Copy it into a std::string to keep it longer.
    //
    // `str` may itself be a view returned by a previous call; the result is
    // built in a separate string and only swapped into the buffer once the
    // input has been fully read.
    static std::string_view toUpperCase(const std::string_view& str) {
        const char* const data = str.data();
        const size_t size = str.size();

        std::string upper;
        // Worst-case growth in the mapping table is 2 bytes -> 3 bytes.
        upper.reserve(size + size / 2);

        size_t pos = 0;
        while (pos < size) {
            const Decoded decoded = utf8_to_codepoint(data + pos, size - pos);
            if (decoded.valid) {
                append_utf8(upper, simple_toupper(decoded.codepoint));
            } else {
                // Malformed input: keep the original byte untouched.
                upper.push_back(data[pos]);
            }
            pos += decoded.length;  // Always >= 1 here because size - pos > 0.
        }

        result_buffer.swap(upper);
        return std::string_view(result_buffer);
    }
};
// Thread-local storage for the result buffer
thread_local std::string UTF8ToUpperCase::result_buffer;
// Main function interface | |
std::string_view toUpperCase(const std::string_view& str) { | |
return UTF8ToUpperCase::toUpperCase(str); | |
} | |
// Example usage and test | |
#include <iostream> | |
/* | |
PROMPT: | |
Write C++23 implementation for string toUpperCase() function for UTF-8 strings. | |
Do not use Boost or ICU. | |
Function signature: `std::string_view toUpperCase(const std::string_view &str); ` | |
Use standard library methods like `locale::toupper`, function should be portable (should compile with either GCC, LLVM Clang or MSVC) and should work in the upcoming C++26 standard (do not use anything that is going to be removed in C++26). | |
Full Unicode compatibility is not required, but it should correctly convert strings like "naïve" (expected output is "NAÏVE" and not "NAïVE"), Greek and Cyrillic "мы ебали медведя" (expected output is "МЫ ЕБАЛИ МЕДВЕДЯ"). | |
It should accept std::string_view (not std::wstring_view) and output std::string_view. | |
Consider using `std::toupper(chr, std::locale("en_US.UTF-8"));` from `#include <locale>` | |
*/ | |
// g++ -std=c++23 toupper_claude.cpp -o /tmp/toupper_claude && /tmp/toupper_claude | |
int main() {
    // Demo inputs: plain ASCII, Latin-1 accents, Cyrillic, Greek, and mixed text.
    const std::string samples[] = {
        "hello world",
        "naïve",
        "мы ебали медведя",
        "αβγδε",  // Greek lowercase
        "Mixed 123 ñoño",
    };

    // Print each input next to its upper-cased form, one pair per line.
    for (const std::string& sample : samples) {
        std::cout << "Original: " << sample << " -> Upper: " << toUpperCase(sample) << std::endl;
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment