siberex · July 15, 2025 20:55
diff --git a/toupper_claude.cpp b/toupper_claude.cpp
 #include <string>
 #include <string_view>
 #include <locale>
 #include <codecvt>
 #include <algorithm>

 // Upper-cases UTF-8 text without ICU/Boost by decoding each code point,
 // applying a small table of simple (one-to-one) case mappings, and
 // re-encoding the result. Only ASCII, Latin-1, basic Greek and basic Cyrillic
 // letters (plus a few individual code points) are mapped; everything else is
 // copied through unchanged.
 class UTF8ToUpperCase {
 private:
     // Per-thread storage that backs the string_view returned by toUpperCase().
     static thread_local std::string result_buffer;

     // Sentinel returned by utf8_to_codepoint() for bytes that do not form a
     // well-formed UTF-8 sequence. It is outside the Unicode range, so it can
     // never collide with a decoded code point.
     static constexpr char32_t kInvalidSequence = 0xFFFFFFFF;

     // Decodes the UTF-8 sequence that starts at utf8_str.
     //
     // utf8_str  - pointer to the first byte to decode.
     // remaining - number of readable bytes starting at utf8_str.
     //
     // Returns {code point, bytes consumed}. For a malformed sequence (stray
     // continuation byte, truncated sequence, overlong encoding, surrogate, or
     // value above U+10FFFF) it returns {kInvalidSequence, 1} so the caller can
     // copy that single byte verbatim and resynchronise on the next one.
     // Returns {kInvalidSequence, 0} when remaining is 0.
     static std::pair<char32_t, size_t> utf8_to_codepoint(const char* utf8_str, size_t remaining) {
         if (remaining == 0) return {kInvalidSequence, 0};

         // Read bytes as unsigned so the bit tests do not depend on whether
         // plain char is signed on this platform.
         const auto byte_at = [utf8_str](size_t index) {
             return static_cast<unsigned char>(utf8_str[index]);
         };

         const unsigned char lead = byte_at(0);

         // ASCII (0xxxxxxx)
         if (lead < 0x80) {
             return {static_cast<char32_t>(lead), 1};
         }

         size_t length = 0;
         char32_t codepoint = 0;
         char32_t minimum = 0; // smallest value that legitimately needs `length` bytes

         if ((lead & 0xE0) == 0xC0) {        // 110xxxxx 10xxxxxx
             length = 2;
             codepoint = lead & 0x1F;
             minimum = 0x80;
         } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx
             length = 3;
             codepoint = lead & 0x0F;
             minimum = 0x800;
         } else if ((lead & 0xF8) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
             length = 4;
             codepoint = lead & 0x07;
             minimum = 0x10000;
         } else {
             // Stray continuation byte or a lead byte UTF-8 never uses.
             return {kInvalidSequence, 1};
         }

         // Truncated sequence at the end of the input.
         if (remaining < length) {
             return {kInvalidSequence, 1};
         }

         for (size_t index = 1; index < length; ++index) {
             const unsigned char next = byte_at(index);
             if ((next & 0xC0) != 0x80) {
                 return {kInvalidSequence, 1};
             }
             codepoint = (codepoint << 6) | (next & 0x3F);
         }

         const bool overlong = codepoint < minimum;
         const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
         if (overlong || surrogate || codepoint > 0x10FFFF) {
             return {kInvalidSequence, 1};
         }

         return {codepoint, length};
     }

     // Appends the UTF-8 encoding of codepoint to out. Values that are not
     // Unicode scalar values (surrogates, or above U+10FFFF) are written as
     // U+FFFD REPLACEMENT CHARACTER.
     static void append_utf8(std::string& out, char32_t codepoint) {
         const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
         if (surrogate || codepoint > 0x10FFFF) {
             codepoint = 0xFFFD;
         }

         if (codepoint < 0x80) {
             out.push_back(static_cast<char>(codepoint));
         } else if (codepoint < 0x800) {
             out.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
             out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
         } else if (codepoint < 0x10000) {
             out.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
             out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
             out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
         } else {
             out.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
             out.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
             out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
             out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
         }
     }

     // Returns the upper-case counterpart of codepoint for the ranges this
     // class knows about, or codepoint itself when no mapping is listed.
     // Only one-to-one mappings are possible here, so characters whose
     // upper-case form is several code points (e.g. U+0149) are left unchanged.
     static char32_t simple_toupper(char32_t codepoint) {
         // ASCII a-z
         if (codepoint >= U'a' && codepoint <= U'z') {
             return codepoint - 0x20;
         }

         // Latin-1 Supplement U+00E0-U+00FE, skipping U+00F7 (division sign)
         if (codepoint >= 0x00E0 && codepoint <= 0x00FE && codepoint != 0x00F7) {
             return codepoint - 0x20;
         }

         // Greek U+03B1-U+03CB (alpha..omega plus iota/upsilon with dialytika)
         if (codepoint >= 0x03B1 && codepoint <= 0x03CB) {
             // Final sigma shares its capital with regular sigma; a plain
             // -0x20 offset would land on the unassigned U+03A2.
             if (codepoint == 0x03C2) {
                 return 0x03A3;
             }
             return codepoint - 0x20;
         }

         // Cyrillic U+0430-U+044F
         if (codepoint >= 0x0430 && codepoint <= 0x044F) {
             return codepoint - 0x20;
         }

         // Cyrillic extensions U+0450-U+045F
         if (codepoint >= 0x0450 && codepoint <= 0x045F) {
             return codepoint - 0x50;
         }

         // Individual mappings that do not follow a fixed offset
         switch (codepoint) {
             case 0x00B5: return 0x039C; // micro sign -> Greek capital mu
             case 0x00DF: return 0x1E9E; // sharp s -> capital sharp s (single code point, not "SS")
             case 0x00FF: return 0x0178; // y with diaeresis
             case 0x017F: return 0x0053; // long s -> S
             case 0x03AC: return 0x0386; // alpha with tonos
             case 0x03AD: return 0x0388; // epsilon with tonos
             case 0x03AE: return 0x0389; // eta with tonos
             case 0x03AF: return 0x038A; // iota with tonos
             case 0x03CC: return 0x038C; // omicron with tonos
             case 0x03CD: return 0x038E; // upsilon with tonos
             case 0x03CE: return 0x038F; // omega with tonos
             case 0x1FBE: return 0x0399; // Greek prosgegrammeni -> capital iota
             default:     return codepoint;
         }
     }

 public:
     // Returns an upper-cased copy of the UTF-8 text in str.
     //
     // Bytes that are not part of a well-formed UTF-8 sequence are copied to
     // the output unchanged. The returned view points into a thread_local
     // buffer that is replaced by the next call made on the same thread, so
     // copy the result if it has to outlive that call.
     static std::string_view toUpperCase(const std::string_view& str) {
         // Build into a local string: str may itself be a view into
         // result_buffer (a previous result passed back in), so the buffer
         // must stay untouched until the whole input has been read.
         std::string upper;
         upper.reserve(str.size() + str.size() / 2); // largest growth here is 2 -> 3 bytes

         const char* data = str.data();
         size_t pos = 0;

         while (pos < str.size()) {
             const auto [codepoint, length] = utf8_to_codepoint(data + pos, str.size() - pos);
             if (length == 0) break;

             if (codepoint == kInvalidSequence) {
                 upper.push_back(data[pos]); // keep the malformed byte as-is
             } else {
                 append_utf8(upper, simple_toupper(codepoint));
             }

             pos += length;
         }

         // The previous buffer contents now live in `upper` and are released
         // when it goes out of scope, after all reads from str are done.
         result_buffer.swap(upper);
         return std::string_view(result_buffer);
     }
 };

 // Thread-local storage for the result buffer
 thread_local std::string UTF8ToUpperCase::result_buffer;

 // Free-function entry point with the requested signature; it forwards to
 // UTF8ToUpperCase::toUpperCase and hands back that function's view unchanged.
 std::string_view toUpperCase(const std::string_view& str) {
     const std::string_view uppercased = UTF8ToUpperCase::toUpperCase(str);
     return uppercased;
 }


 // Example usage and test
 #include <iostream>

 /*
 PROMPT:

 Write C++23 implementation for string toUpperCase() function for UTF-8 strings.
 Do not use Boost or ICU.
 Function signature: `std::string_view toUpperCase(const std::string_view &str); `

 Use standard library methods like `locale::toupper`, function should be portable (should compile with either GCC, LLVM Clang or MSVC) and should work in the upcoming C++26 standard (do not use anything that is going to be removed in C++26).

 Full Unicode compatibility is not required, but it should correctly convert strings like "naïve" (expected output is "NAÏVE" and not "NAïVE"), Greek and Cyrillic "мы ебали медведя" (expected output is "МЫ ЕБАЛИ МЕДВЕДЯ").

 It should accept std::string_view (not std::wstring_view) and output std::string_view.

 Consider using `std::toupper(chr, std::locale("en_US.UTF-8"));` from `#include <locale>`
 */

 // g++ -std=c++23 toupper_claude.cpp -o /tmp/toupper_claude && /tmp/toupper_claude
 int main() {
     // Sample inputs: ASCII, accented Latin, Cyrillic, Greek and mixed text.
     const std::string samples[] = {
         "hello world",
         "naïve",
         "мы ебали медведя",
         "αβγδε", // Greek lowercase
         "Mixed 123 ñoño",
     };

     // Print every sample next to its upper-cased form, one pair per line.
     for (const std::string& sample : samples) {
         std::cout << "Original: " << sample << " -> Upper: " << toUpperCase(sample) << std::endl;
     }

     return 0;
 }
	#include <string>
	#include <string_view>
	#include <locale>
	#include <codecvt>
	#include <algorithm>

	// Upper-cases UTF-8 text without ICU/Boost by decoding each code point,
	// applying a small table of simple (one-to-one) case mappings, and
	// re-encoding the result. Only ASCII, Latin-1, basic Greek and basic Cyrillic
	// letters (plus a few individual code points) are mapped; everything else is
	// copied through unchanged.
	class UTF8ToUpperCase {
	private:
	    // Per-thread storage that backs the string_view returned by toUpperCase().
	    static thread_local std::string result_buffer;

	    // Sentinel returned by utf8_to_codepoint() for bytes that do not form a
	    // well-formed UTF-8 sequence. It is outside the Unicode range, so it can
	    // never collide with a decoded code point.
	    static constexpr char32_t kInvalidSequence = 0xFFFFFFFF;

	    // Decodes the UTF-8 sequence that starts at utf8_str.
	    //
	    // utf8_str  - pointer to the first byte to decode.
	    // remaining - number of readable bytes starting at utf8_str.
	    //
	    // Returns {code point, bytes consumed}. For a malformed sequence (stray
	    // continuation byte, truncated sequence, overlong encoding, surrogate, or
	    // value above U+10FFFF) it returns {kInvalidSequence, 1} so the caller can
	    // copy that single byte verbatim and resynchronise on the next one.
	    // Returns {kInvalidSequence, 0} when remaining is 0.
	    static std::pair<char32_t, size_t> utf8_to_codepoint(const char* utf8_str, size_t remaining) {
	        if (remaining == 0) return {kInvalidSequence, 0};

	        // Read bytes as unsigned so the bit tests do not depend on whether
	        // plain char is signed on this platform.
	        const auto byte_at = [utf8_str](size_t index) {
	            return static_cast<unsigned char>(utf8_str[index]);
	        };

	        const unsigned char lead = byte_at(0);

	        // ASCII (0xxxxxxx)
	        if (lead < 0x80) {
	            return {static_cast<char32_t>(lead), 1};
	        }

	        size_t length = 0;
	        char32_t codepoint = 0;
	        char32_t minimum = 0; // smallest value that legitimately needs `length` bytes

	        if ((lead & 0xE0) == 0xC0) {        // 110xxxxx 10xxxxxx
	            length = 2;
	            codepoint = lead & 0x1F;
	            minimum = 0x80;
	        } else if ((lead & 0xF0) == 0xE0) { // 1110xxxx 10xxxxxx 10xxxxxx
	            length = 3;
	            codepoint = lead & 0x0F;
	            minimum = 0x800;
	        } else if ((lead & 0xF8) == 0xF0) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
	            length = 4;
	            codepoint = lead & 0x07;
	            minimum = 0x10000;
	        } else {
	            // Stray continuation byte or a lead byte UTF-8 never uses.
	            return {kInvalidSequence, 1};
	        }

	        // Truncated sequence at the end of the input.
	        if (remaining < length) {
	            return {kInvalidSequence, 1};
	        }

	        for (size_t index = 1; index < length; ++index) {
	            const unsigned char next = byte_at(index);
	            if ((next & 0xC0) != 0x80) {
	                return {kInvalidSequence, 1};
	            }
	            codepoint = (codepoint << 6) | (next & 0x3F);
	        }

	        const bool overlong = codepoint < minimum;
	        const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
	        if (overlong || surrogate || codepoint > 0x10FFFF) {
	            return {kInvalidSequence, 1};
	        }

	        return {codepoint, length};
	    }

	    // Appends the UTF-8 encoding of codepoint to out. Values that are not
	    // Unicode scalar values (surrogates, or above U+10FFFF) are written as
	    // U+FFFD REPLACEMENT CHARACTER.
	    static void append_utf8(std::string& out, char32_t codepoint) {
	        const bool surrogate = codepoint >= 0xD800 && codepoint <= 0xDFFF;
	        if (surrogate || codepoint > 0x10FFFF) {
	            codepoint = 0xFFFD;
	        }

	        if (codepoint < 0x80) {
	            out.push_back(static_cast<char>(codepoint));
	        } else if (codepoint < 0x800) {
	            out.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
	            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
	        } else if (codepoint < 0x10000) {
	            out.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
	            out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
	            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
	        } else {
	            out.push_back(static_cast<char>(0xF0 | (codepoint >> 18)));
	            out.push_back(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
	            out.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
	            out.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
	        }
	    }

	    // Returns the upper-case counterpart of codepoint for the ranges this
	    // class knows about, or codepoint itself when no mapping is listed.
	    // Only one-to-one mappings are possible here, so characters whose
	    // upper-case form is several code points (e.g. U+0149) are left unchanged.
	    static char32_t simple_toupper(char32_t codepoint) {
	        // ASCII a-z
	        if (codepoint >= U'a' && codepoint <= U'z') {
	            return codepoint - 0x20;
	        }

	        // Latin-1 Supplement U+00E0-U+00FE, skipping U+00F7 (division sign)
	        if (codepoint >= 0x00E0 && codepoint <= 0x00FE && codepoint != 0x00F7) {
	            return codepoint - 0x20;
	        }

	        // Greek U+03B1-U+03CB (alpha..omega plus iota/upsilon with dialytika)
	        if (codepoint >= 0x03B1 && codepoint <= 0x03CB) {
	            // Final sigma shares its capital with regular sigma; a plain
	            // -0x20 offset would land on the unassigned U+03A2.
	            if (codepoint == 0x03C2) {
	                return 0x03A3;
	            }
	            return codepoint - 0x20;
	        }

	        // Cyrillic U+0430-U+044F
	        if (codepoint >= 0x0430 && codepoint <= 0x044F) {
	            return codepoint - 0x20;
	        }

	        // Cyrillic extensions U+0450-U+045F
	        if (codepoint >= 0x0450 && codepoint <= 0x045F) {
	            return codepoint - 0x50;
	        }

	        // Individual mappings that do not follow a fixed offset
	        switch (codepoint) {
	            case 0x00B5: return 0x039C; // micro sign -> Greek capital mu
	            case 0x00DF: return 0x1E9E; // sharp s -> capital sharp s (single code point, not "SS")
	            case 0x00FF: return 0x0178; // y with diaeresis
	            case 0x017F: return 0x0053; // long s -> S
	            case 0x03AC: return 0x0386; // alpha with tonos
	            case 0x03AD: return 0x0388; // epsilon with tonos
	            case 0x03AE: return 0x0389; // eta with tonos
	            case 0x03AF: return 0x038A; // iota with tonos
	            case 0x03CC: return 0x038C; // omicron with tonos
	            case 0x03CD: return 0x038E; // upsilon with tonos
	            case 0x03CE: return 0x038F; // omega with tonos
	            case 0x1FBE: return 0x0399; // Greek prosgegrammeni -> capital iota
	            default:     return codepoint;
	        }
	    }

	public:
	    // Returns an upper-cased copy of the UTF-8 text in str.
	    //
	    // Bytes that are not part of a well-formed UTF-8 sequence are copied to
	    // the output unchanged. The returned view points into a thread_local
	    // buffer that is replaced by the next call made on the same thread, so
	    // copy the result if it has to outlive that call.
	    static std::string_view toUpperCase(const std::string_view& str) {
	        // Build into a local string: str may itself be a view into
	        // result_buffer (a previous result passed back in), so the buffer
	        // must stay untouched until the whole input has been read.
	        std::string upper;
	        upper.reserve(str.size() + str.size() / 2); // largest growth here is 2 -> 3 bytes

	        const char* data = str.data();
	        size_t pos = 0;

	        while (pos < str.size()) {
	            const auto [codepoint, length] = utf8_to_codepoint(data + pos, str.size() - pos);
	            if (length == 0) break;

	            if (codepoint == kInvalidSequence) {
	                upper.push_back(data[pos]); // keep the malformed byte as-is
	            } else {
	                append_utf8(upper, simple_toupper(codepoint));
	            }

	            pos += length;
	        }

	        // The previous buffer contents now live in `upper` and are released
	        // when it goes out of scope, after all reads from str are done.
	        result_buffer.swap(upper);
	        return std::string_view(result_buffer);
	    }
	};

	// Thread-local storage for the result buffer
	thread_local std::string UTF8ToUpperCase::result_buffer;

	// Free-function entry point with the requested signature; it forwards to
	// UTF8ToUpperCase::toUpperCase and hands back that function's view unchanged.
	std::string_view toUpperCase(const std::string_view& str) {
	    const std::string_view uppercased = UTF8ToUpperCase::toUpperCase(str);
	    return uppercased;
	}


	// Example usage and test
	#include <iostream>

	/*
	PROMPT:

	Write C++23 implementation for string toUpperCase() function for UTF-8 strings.
	Do not use Boost or ICU.
	Function signature: `std::string_view toUpperCase(const std::string_view &str); `

	Use standard library methods like `locale::toupper`, function should be portable (should compile with either GCC, LLVM Clang or MSVC) and should work in the upcoming C++26 standard (do not use anything that is going to be removed in C++26).

	Full Unicode compatibility is not required, but it should correctly convert strings like "naïve" (expected output is "NAÏVE" and not "NAïVE"), Greek and Cyrillic "мы ебали медведя" (expected output is "МЫ ЕБАЛИ МЕДВЕДЯ").

	It should accept std::string_view (not std::wstring_view) and output std::string_view.

	Consider using `std::toupper(chr, std::locale("en_US.UTF-8"));` from `#include <locale>`
	*/

	// g++ -std=c++23 toupper_claude.cpp -o /tmp/toupper_claude && /tmp/toupper_claude
	int main() {
	    // Sample inputs: ASCII, accented Latin, Cyrillic, Greek and mixed text.
	    const std::string samples[] = {
	        "hello world",
	        "naïve",
	        "мы ебали медведя",
	        "αβγδε", // Greek lowercase
	        "Mixed 123 ñoño",
	    };

	    // Print every sample next to its upper-cased form, one pair per line.
	    for (const std::string& sample : samples) {
	        std::cout << "Original: " << sample << " -> Upper: " << toUpperCase(sample) << std::endl;
	    }

	    return 0;
	}