siberex · July 15, 2025 20:55
diff --git a/toupper_deepseek.cpp b/toupper_deepseek.cpp
 #include <cctype>
 #include <cstddef>
 #include <cstdint>
 #include <format>
 #include <iostream>
 #include <iterator>
 #include <string>
 #include <string_view>

 /// Returns the simple (one code point to one code point) uppercase mapping
 /// of `cp`.
 ///
 /// Handles ASCII, the Latin-1 Supplement letters (U+00E0..U+00FF) and the
 /// Latin Extended-A block (U+0100..U+017F). Any other code point, and any
 /// code point in those ranges that is not a lowercase letter with a single
 /// code point capital, is returned unchanged.
 ///
 /// @param cp  Unicode code point to convert.
 /// @return    The uppercase code point, or `cp` itself when no mapping applies.
 static char32_t to_upper(char32_t cp) {
    if (cp >= 'a' && cp <= 'z') {
        return cp - ('a' - 'A');
    }
    // Latin-1: lowercase letters sit 0x20 above their capitals.
    // U+00F7 is the division sign, not a letter.
    if (cp >= 0x00E0 && cp <= 0x00FE && cp != 0x00F7) {
        return cp - 0x20;
    }
    if (cp == 0x00FF) {
        return 0x0178;  // capital of y-diaeresis lives in Latin Extended-A
    }

    // Dotless i and long s map into ASCII. They are odd code points, so they
    // must be handled before the parity rules below can claim them.
    if (cp == 0x0131) {
        return 'I';
    }
    if (cp == 0x017F) {
        return 'S';
    }

    const bool is_odd = (cp & 1u) != 0;

    // Sub-ranges of Latin Extended-A where capitals are even and the matching
    // lowercase letter is the following odd code point.
    const bool lower_is_odd = (cp >= 0x0100 && cp <= 0x012F) ||
                              (cp >= 0x0132 && cp <= 0x0137) ||
                              (cp >= 0x014A && cp <= 0x0177);
    if (lower_is_odd) {
        return is_odd ? cp - 1 : cp;
    }

    // Sub-ranges where the pairing is shifted by one: capitals are odd and
    // the matching lowercase letter is the following even code point.
    const bool lower_is_even = (cp >= 0x0139 && cp <= 0x0148) ||
                               (cp >= 0x0179 && cp <= 0x017E);
    if (lower_is_even) {
        return is_odd ? cp : cp - 1;
    }

    // Everything else (including U+0130, U+0138, U+0149, U+0178) is left as is.
    return cp;
 }

 /// Decodes one well-formed multi-byte UTF-8 sequence.
 ///
 /// @param first      Pointer to the candidate lead byte (must be readable).
 /// @param available  Number of readable bytes starting at `first` (>= 1).
 /// @param cp         Receives the decoded code point when the call succeeds.
 /// @return Sequence length in bytes (2..4), or 0 when the bytes at `first`
 ///         are not a well-formed sequence: bad lead byte, truncated input,
 ///         bad continuation byte, overlong form, surrogate, or > U+10FFFF.
 static std::size_t decode_utf8_multibyte(const char *first, std::size_t available, char32_t &cp) {
    const unsigned char lead = static_cast<unsigned char>(first[0]);

    std::size_t length = 0;
    char32_t smallest = 0;  // smallest code point that legitimately needs `length` bytes
    if ((lead & 0xE0) == 0xC0) {
        length = 2;
        smallest = 0x80;
        cp = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        length = 3;
        smallest = 0x800;
        cp = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        length = 4;
        smallest = 0x10000;
        cp = lead & 0x07;
    } else {
        return 0;
    }

    if (available < length) {
        return 0;
    }
    for (std::size_t i = 1; i < length; ++i) {
        const unsigned char cont = static_cast<unsigned char>(first[i]);
        if ((cont & 0xC0) != 0x80) {
            return 0;
        }
        cp = (cp << 6) | (cont & 0x3F);
    }

    const bool overlong = cp < smallest;
    const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    if (overlong || surrogate || cp > 0x10FFFF) {
        return 0;
    }
    return length;
 }

 /// Appends the UTF-8 encoding of `cp` (expected to be <= U+10FFFF) to `out`.
 static void append_utf8(std::string &out, char32_t cp) {
    if (cp <= 0x7F) {
        out.push_back(static_cast<char>(cp));
    } else if (cp <= 0x7FF) {
        out.push_back(static_cast<char>(0xC0 | (cp >> 6)));
        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    } else if (cp <= 0xFFFF) {
        out.push_back(static_cast<char>(0xE0 | (cp >> 12)));
        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    } else {
        out.push_back(static_cast<char>(0xF0 | (cp >> 18)));
        out.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
    }
 }

 /// Uppercases a UTF-8 encoded string.
 ///
 /// ASCII letters are converted directly (independent of the C locale);
 /// well-formed multi-byte sequences are decoded, passed through to_upper(),
 /// and re-encoded only when the code point actually changed. Bytes that do
 /// not form a well-formed UTF-8 sequence are copied to the output unchanged,
 /// one byte at a time, and scanning resumes at the following byte.
 ///
 /// @param str  UTF-8 input. It may be a view previously returned by this
 ///             function on the same thread.
 /// @return A view into thread-local storage. It stays valid only until the
 ///         next call to toUpperCase() on the same thread; copy it into a
 ///         std::string if it has to live longer.
 std::string_view toUpperCase(const std::string_view &str) {
    thread_local std::string buffer;   // storage behind the returned view
    thread_local std::string scratch;  // output is assembled here, then swapped in

    // Writing into `scratch` rather than `buffer` keeps the input readable
    // even when `str` is the view returned by the previous call.
    scratch.clear();
    // Every mapping produced by to_upper() encodes to at most as many bytes
    // as its source, so the input size is enough room for the output.
    scratch.reserve(str.size());

    const char *cursor = str.data();
    const char *const end = cursor + str.size();
    while (cursor != end) {
        const unsigned char lead = static_cast<unsigned char>(*cursor);

        if (lead < 0x80) {
            const bool is_lower = lead >= 'a' && lead <= 'z';
            scratch.push_back(static_cast<char>(is_lower ? lead - ('a' - 'A') : lead));
            ++cursor;
            continue;
        }

        char32_t code_point = 0;
        const std::size_t length =
            decode_utf8_multibyte(cursor, static_cast<std::size_t>(end - cursor), code_point);
        if (length == 0) {
            // Malformed input: keep the byte and resynchronise on the next one.
            scratch.push_back(*cursor);
            ++cursor;
            continue;
        }

        const char32_t upper_cp = to_upper(code_point);
        if (upper_cp == code_point) {
            scratch.append(cursor, length);
        } else {
            append_utf8(scratch, upper_cp);
        }
        cursor += length;
    }

    buffer.swap(scratch);
    return buffer;
 }




 /*
 PROMPT:

 Write C++23 implementation for string toUpperCase() function for UTF-8 strings. Do not use Boost or ICU.
 Function signature: std::string_view toUpperCase(const std::string_view &str);
 Use standard library methods like locale::toupper, function should be portable (should compile with either GCC, LLVM Clang or MSVC) and should work in the upcoming C++26 standard (do not use anything that is going to be removed in C++26).
 Full Unicode compatibility is not required, but it should correctly convert strings like "naïve" (expected output is "NAÏVE" and not "NAïVE").
 It should accept std::string_view (not std::wstring_view) and output std::string_view.
 */

 // g++ -std=c++23 toupper_deepseek.cpp -o /tmp/toupper_deepseek && /tmp/toupper_deepseek
 /// Demo driver: prints a sample string next to its uppercased form.
 int main() {
    const std::string sample = "naïve";
    const std::string_view uppercased = toUpperCase(sample);

    const std::string line = std::format("Narrow string: {0} → {1}\n", sample, uppercased);
    std::cout << line;

    return 0;
 }
	#include <cctype>
	#include <cstdint>
	#include <iostream>
	#include <string>
	#include <string_view>

	/// Returns the simple (one code point to one code point) uppercase mapping
	/// of `cp`.
	///
	/// Handles ASCII, the Latin-1 Supplement letters (U+00E0..U+00FF) and the
	/// Latin Extended-A block (U+0100..U+017F). Any other code point, and any
	/// code point in those ranges that is not a lowercase letter with a single
	/// code point capital, is returned unchanged.
	///
	/// @param cp  Unicode code point to convert.
	/// @return    The uppercase code point, or `cp` itself when no mapping applies.
	static char32_t to_upper(char32_t cp) {
	    if (cp >= 'a' && cp <= 'z') {
	        return cp - ('a' - 'A');
	    }
	    // Latin-1: lowercase letters sit 0x20 above their capitals.
	    // U+00F7 is the division sign, not a letter.
	    if (cp >= 0x00E0 && cp <= 0x00FE && cp != 0x00F7) {
	        return cp - 0x20;
	    }
	    if (cp == 0x00FF) {
	        return 0x0178;  // capital of y-diaeresis lives in Latin Extended-A
	    }

	    // Dotless i and long s map into ASCII. They are odd code points, so they
	    // must be handled before the parity rules below can claim them.
	    if (cp == 0x0131) {
	        return 'I';
	    }
	    if (cp == 0x017F) {
	        return 'S';
	    }

	    const bool is_odd = (cp & 1u) != 0;

	    // Sub-ranges of Latin Extended-A where capitals are even and the matching
	    // lowercase letter is the following odd code point.
	    const bool lower_is_odd = (cp >= 0x0100 && cp <= 0x012F) ||
	                              (cp >= 0x0132 && cp <= 0x0137) ||
	                              (cp >= 0x014A && cp <= 0x0177);
	    if (lower_is_odd) {
	        return is_odd ? cp - 1 : cp;
	    }

	    // Sub-ranges where the pairing is shifted by one: capitals are odd and
	    // the matching lowercase letter is the following even code point.
	    const bool lower_is_even = (cp >= 0x0139 && cp <= 0x0148) ||
	                               (cp >= 0x0179 && cp <= 0x017E);
	    if (lower_is_even) {
	        return is_odd ? cp : cp - 1;
	    }

	    // Everything else (including U+0130, U+0138, U+0149, U+0178) is left as is.
	    return cp;
	}

	/// Decodes one well-formed multi-byte UTF-8 sequence.
	///
	/// @param first      Pointer to the candidate lead byte (must be readable).
	/// @param available  Number of readable bytes starting at `first` (>= 1).
	/// @param cp         Receives the decoded code point when the call succeeds.
	/// @return Sequence length in bytes (2..4), or 0 when the bytes at `first`
	///         are not a well-formed sequence: bad lead byte, truncated input,
	///         bad continuation byte, overlong form, surrogate, or > U+10FFFF.
	static std::size_t decode_utf8_multibyte(const char *first, std::size_t available, char32_t &cp) {
	    const unsigned char lead = static_cast<unsigned char>(first[0]);

	    std::size_t length = 0;
	    char32_t smallest = 0;  // smallest code point that legitimately needs `length` bytes
	    if ((lead & 0xE0) == 0xC0) {
	        length = 2;
	        smallest = 0x80;
	        cp = lead & 0x1F;
	    } else if ((lead & 0xF0) == 0xE0) {
	        length = 3;
	        smallest = 0x800;
	        cp = lead & 0x0F;
	    } else if ((lead & 0xF8) == 0xF0) {
	        length = 4;
	        smallest = 0x10000;
	        cp = lead & 0x07;
	    } else {
	        return 0;
	    }

	    if (available < length) {
	        return 0;
	    }
	    for (std::size_t i = 1; i < length; ++i) {
	        const unsigned char cont = static_cast<unsigned char>(first[i]);
	        if ((cont & 0xC0) != 0x80) {
	            return 0;
	        }
	        cp = (cp << 6) | (cont & 0x3F);
	    }

	    const bool overlong = cp < smallest;
	    const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
	    if (overlong || surrogate || cp > 0x10FFFF) {
	        return 0;
	    }
	    return length;
	}

	/// Appends the UTF-8 encoding of `cp` (expected to be <= U+10FFFF) to `out`.
	static void append_utf8(std::string &out, char32_t cp) {
	    if (cp <= 0x7F) {
	        out.push_back(static_cast<char>(cp));
	    } else if (cp <= 0x7FF) {
	        out.push_back(static_cast<char>(0xC0 | (cp >> 6)));
	        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
	    } else if (cp <= 0xFFFF) {
	        out.push_back(static_cast<char>(0xE0 | (cp >> 12)));
	        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
	        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
	    } else {
	        out.push_back(static_cast<char>(0xF0 | (cp >> 18)));
	        out.push_back(static_cast<char>(0x80 | ((cp >> 12) & 0x3F)));
	        out.push_back(static_cast<char>(0x80 | ((cp >> 6) & 0x3F)));
	        out.push_back(static_cast<char>(0x80 | (cp & 0x3F)));
	    }
	}

	/// Uppercases a UTF-8 encoded string.
	///
	/// ASCII letters are converted directly (independent of the C locale);
	/// well-formed multi-byte sequences are decoded, passed through to_upper(),
	/// and re-encoded only when the code point actually changed. Bytes that do
	/// not form a well-formed UTF-8 sequence are copied to the output unchanged,
	/// one byte at a time, and scanning resumes at the following byte.
	///
	/// @param str  UTF-8 input. It may be a view previously returned by this
	///             function on the same thread.
	/// @return A view into thread-local storage. It stays valid only until the
	///         next call to toUpperCase() on the same thread; copy it into a
	///         std::string if it has to live longer.
	std::string_view toUpperCase(const std::string_view &str) {
	    thread_local std::string buffer;   // storage behind the returned view
	    thread_local std::string scratch;  // output is assembled here, then swapped in

	    // Writing into `scratch` rather than `buffer` keeps the input readable
	    // even when `str` is the view returned by the previous call.
	    scratch.clear();
	    // Every mapping produced by to_upper() encodes to at most as many bytes
	    // as its source, so the input size is enough room for the output.
	    scratch.reserve(str.size());

	    const char *cursor = str.data();
	    const char *const end = cursor + str.size();
	    while (cursor != end) {
	        const unsigned char lead = static_cast<unsigned char>(*cursor);

	        if (lead < 0x80) {
	            const bool is_lower = lead >= 'a' && lead <= 'z';
	            scratch.push_back(static_cast<char>(is_lower ? lead - ('a' - 'A') : lead));
	            ++cursor;
	            continue;
	        }

	        char32_t code_point = 0;
	        const std::size_t length =
	            decode_utf8_multibyte(cursor, static_cast<std::size_t>(end - cursor), code_point);
	        if (length == 0) {
	            // Malformed input: keep the byte and resynchronise on the next one.
	            scratch.push_back(*cursor);
	            ++cursor;
	            continue;
	        }

	        const char32_t upper_cp = to_upper(code_point);
	        if (upper_cp == code_point) {
	            scratch.append(cursor, length);
	        } else {
	            append_utf8(scratch, upper_cp);
	        }
	        cursor += length;
	    }

	    buffer.swap(scratch);
	    return buffer;
	}




	/*
	PROMPT:

	Write C++23 implementation for string toUpperCase() function for UTF-8 strings. Do not use Boost or ICU.
	Function signature: std::string_view toUpperCase(const std::string_view &str);
	Use standard library methods like locale::toupper, function should be portable (should compile with either GCC, LLVM Clang or MSVC) and should work in the upcoming C++26 standard (do not use anything that is going to be removed in C++26).
	Full Unicode compatibility is not required, but it should correctly convert strings like "naïve" (expected output is "NAÏVE" and not "NAïVE").
	It should accept std::string_view (not std::wstring_view) and output std::string_view.
	*/

	// g++ -std=c++23 toupper_deepseek.cpp -o /tmp/toupper_deepseek && /tmp/toupper_deepseek
	/// Demo driver: prints a sample string next to its uppercased form.
	int main() {
	    const std::string sample = "naïve";
	    const std::string_view uppercased = toUpperCase(sample);

	    const std::string line = std::format("Narrow string: {0} → {1}\n", sample, uppercased);
	    std::cout << line;

	    return 0;
	}