Last active
July 15, 2025 20:55
-
-
Save siberex/94886ca5eec99e964cd5a45c703eaf53 to your computer and use it in GitHub Desktop.
[FAILED] Attempt to implement string toUpperCase with LLM
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cctype>
#include <cstddef>
#include <cstdint>
#include <format>
#include <iostream>
#include <string>
#include <string_view>
/**
 * Maps a Unicode code point to its simple (one code point) uppercase form.
 *
 * Handles ASCII, the Latin-1 Supplement letters (U+00E0..U+00FF) and
 * Latin Extended-A (U+0100..U+017F). Every other code point is returned
 * unchanged, including letters whose uppercase form is not a single code
 * point in these blocks (e.g. U+00DF sharp s, U+0149 n preceded by apostrophe).
 *
 * @param cp code point to convert
 * @return the uppercase counterpart, or cp itself when no mapping applies
 */
static char32_t to_upper(char32_t cp) {
    // ASCII lowercase letters.
    if (cp >= U'a' && cp <= U'z') {
        return cp - 0x20;
    }
    // Everything else below U+00E0 is left untouched.
    if (cp < 0x00E0) {
        return cp;
    }
    // Latin-1 lowercase letters sit 0x20 above their uppercase forms;
    // U+00F7 is the division sign, not a letter.
    if (cp <= 0x00FE) {
        return cp == 0x00F7 ? cp : cp - 0x20;
    }
    // y with diaeresis: its uppercase form lives in Latin Extended-A.
    if (cp == 0x00FF) {
        return 0x0178;
    }
    if (cp > 0x017F) {
        return cp;
    }

    // Latin Extended-A irregulars must be resolved before the parity rule,
    // otherwise they would be "uppercased" into an unrelated neighbour.
    switch (cp) {
        case 0x0131:  // dotless i
            return U'I';
        case 0x017F:  // long s
            return U'S';
        case 0x0138:  // kra: no uppercase form
        case 0x0149:  // n preceded by apostrophe: no single code point uppercase
            return cp;
        default:
            break;
    }

    // The block is laid out as adjacent upper/lower pairs. In most of it the
    // lowercase letter is the odd code point, but in U+0139..U+0148 and
    // U+0179..U+017E the pairs are shifted so the lowercase letter is even.
    const bool lower_is_even =
        (cp >= 0x0139 && cp <= 0x0148) || (cp >= 0x0179 && cp <= 0x017E);
    const bool is_odd = (cp & 1) != 0;
    const bool is_lower = is_odd != lower_is_even;
    return is_lower ? cp - 1 : cp;
}
std::string_view toUpperCase(const std::string_view &str) { | |
thread_local std::string buffer; | |
buffer.clear(); | |
buffer.reserve(str.size() * 4); | |
auto it = str.begin(); | |
while (it != str.end()) { | |
unsigned char c = static_cast<unsigned char>(*it); | |
if (c < 0x80) { | |
buffer.push_back(static_cast<char>(std::toupper(c))); | |
++it; | |
} else { | |
int n = 0; | |
if ((c & 0xE0) == 0xC0) { | |
n = 2; | |
} else if ((c & 0xF0) == 0xE0) { | |
n = 3; | |
} else if ((c & 0xF8) == 0xF0) { | |
n = 4; | |
} else { | |
buffer.push_back(*it); | |
++it; | |
continue; | |
} | |
if (static_cast<std::size_t>(std::distance(it, str.end())) < static_cast<std::size_t>(n)) { | |
while (it != str.end()) { | |
buffer.push_back(*it); | |
++it; | |
} | |
break; | |
} | |
char32_t code_point = 0; | |
bool valid = true; | |
auto next_it = it; | |
if (n >= 2) { | |
++next_it; | |
if (next_it == str.end() || (static_cast<unsigned char>(*next_it) & 0xC0) != 0x80) { | |
valid = false; | |
} | |
} | |
if (n >= 3 && valid) { | |
++next_it; | |
if (next_it == str.end() || (static_cast<unsigned char>(*next_it) & 0xC0) != 0x80) { | |
valid = false; | |
} | |
} | |
if (n >= 4 && valid) { | |
++next_it; | |
if (next_it == str.end() || (static_cast<unsigned char>(*next_it) & 0xC0) != 0x80) { | |
valid = false; | |
} | |
} | |
if (!valid) { | |
buffer.push_back(*it); | |
++it; | |
continue; | |
} | |
next_it = it; | |
if (n == 2) { | |
code_point = (static_cast<unsigned char>(*next_it) & 0x1F) << 6; | |
++next_it; | |
code_point |= (static_cast<unsigned char>(*next_it) & 0x3F); | |
} else if (n == 3) { | |
code_point = (static_cast<unsigned char>(*next_it) & 0x0F) << 12; | |
++next_it; | |
code_point |= (static_cast<unsigned char>(*next_it) & 0x3F) << 6; | |
++next_it; | |
code_point |= (static_cast<unsigned char>(*next_it) & 0x3F); | |
} else if (n == 4) { | |
code_point = (static_cast<unsigned char>(*next_it) & 0x07) << 18; | |
++next_it; | |
code_point |= (static_cast<unsigned char>(*next_it) & 0x3F) << 12; | |
++next_it; | |
code_point |= (static_cast<unsigned char>(*next_it) & 0x3F) << 6; | |
++next_it; | |
code_point |= (static_cast<unsigned char>(*next_it) & 0x3F); | |
} | |
char32_t upper_cp = to_upper(code_point); | |
if (upper_cp <= 0x7F) { | |
buffer.push_back(static_cast<char>(upper_cp)); | |
} else if (upper_cp <= 0x7FF) { | |
buffer.push_back(static_cast<char>(0xC0 | (upper_cp >> 6))); | |
buffer.push_back(static_cast<char>(0x80 | (upper_cp & 0x3F))); | |
} else if (upper_cp <= 0xFFFF) { | |
buffer.push_back(static_cast<char>(0xE0 | (upper_cp >> 12))); | |
buffer.push_back(static_cast<char>(0x80 | ((upper_cp >> 6) & 0x3F))); | |
buffer.push_back(static_cast<char>(0x80 | (upper_cp & 0x3F))); | |
} else if (upper_cp <= 0x10FFFF) { | |
buffer.push_back(static_cast<char>(0xF0 | (upper_cp >> 18))); | |
buffer.push_back(static_cast<char>(0x80 | ((upper_cp >> 12) & 0x3F))); | |
buffer.push_back(static_cast<char>(0x80 | ((upper_cp >> 6) & 0x3F))); | |
buffer.push_back(static_cast<char>(0x80 | (upper_cp & 0x3F))); | |
} else { | |
for (int i = 0; i < n; ++i) { | |
buffer.push_back(*(it + i)); | |
} | |
} | |
it += n; | |
} | |
} | |
return buffer; | |
} | |
/*
PROMPT:
Write C++23 implementation for string toUpperCase() function for UTF-8 strings. Do not use Boost or ICU.
Function signature: std::string_view toUpperCase(const std::string_view &str);
Use standard library methods like locale::toupper, function should be portable (should compile with either GCC, LLVM Clang or MSVC) and should work in the upcoming C++26 standard (do not use anything that is going to be removed in C++26).
Full Unicode compatibility is not required, but it should correctly convert strings like "naïve" (expected output is "NAÏVE" and not "NAïVE").
It should accept std::string_view (not std::wstring_view) and output std::string_view.
*/
// g++ -std=c++23 toupper_deepseek.cpp -o /tmp/toupper_deepseek && /tmp/toupper_deepseek
// Demo entry point: uppercases the sample word "naïve" and prints the
// original next to the converted text. std::format is declared in <format>,
// which must be included for this to compile.
int main() {
    const std::string strTestUpper = "naïve";
    std::cout << std::format(
        "Narrow string: {0} → {1}\n",
        strTestUpper,
        toUpperCase(strTestUpper)
    );
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment